datapreprocessing.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:House-Pricing 作者: playing-kaggle 项目源码 文件源码
def data_cleaning(file_path, reference_year=2016):
    """Load a house-pricing CSV and return a cleaned DataFrame.

    Steps, in order:

    1. Read the CSV (``index_col=False`` so no column is used as the index).
    2. Drop a fixed set of columns that are not used downstream.
    3. Set ``BsmtExposure`` to ``'No'`` on rows where it is missing but
       ``BsmtCond`` is present.
    4. Fill the remaining missing values with per-column defaults.
    5. Replace ``YrSold``, ``YearBuilt`` and ``YearRemodAdd`` by
       ``reference_year - value``.
    6. Turn ``PoolArea`` into a ``'Y'``/``'N'`` flag.
    7. Sum the three porch-area columns into a single ``Porch`` column and
       drop the originals.
    8. Merge rare categories (``BsmtFullBath`` 3 -> 2, ``LotShape``
       ``'IR3'`` -> ``'IR2'``).

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.
        reference_year: Year the three year columns are subtracted from.
            Defaults to 2016, the value previously hard-coded.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If a column that is dropped or transformed is absent from
            the CSV.
    """
    data = pd.read_csv(file_path, index_col=False)
    data = data.drop(
        columns=['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Alley',
                 'GarageYrBlt', 'GarageCond', 'PoolQC', 'MiscFeature'])

    # BsmtExposure is NA although the other basement attributes are not:
    # record that as 'No'.  Only this direction is patched; a plain XOR of
    # the two null masks would also overwrite a valid BsmtExposure on rows
    # where BsmtCond happens to be the missing one.
    exposure_only_missing = (data['BsmtExposure'].isnull()
                             & data['BsmtCond'].notnull())
    data['BsmtExposure'] = data['BsmtExposure'].mask(exposure_only_missing, 'No')

    # NOTE: LotFrontage's N/A is assigned zero -- will it cause a problem?
    data = data.fillna(value={
        'MasVnrType': 'None', 'MasVnrArea': 0,
        'BsmtQual': 'NoBsmt', 'BsmtCond': 'NoBsmt',
        'BsmtExposure': 'NoBsmt', 'BsmtFinType1': 'NoBsmt',
        'BsmtFinType2': 'NoBsmt',
        'Electrical': 'SBrkr', 'FireplaceQu': 'NoFP',
        'GarageType': 'Noga', 'GarageFinish': 'Noga', 'GarageQual': 'Noga',
        'Fence': 'NoFence', 'LotFrontage': 0,
    })

    # Express the year columns as a distance from the reference year.
    for year_col in ('YrSold', 'YearBuilt', 'YearRemodAdd'):
        data[year_col] = reference_year - data[year_col]

    # Collapse the pool area into a flag in one step, rebinding the column so
    # strings are never written into the numeric column in place.  Anything
    # that is not equal to 0 (including a missing value) maps to 'Y'.
    data['PoolArea'] = np.where(data['PoolArea'] != 0, 'Y', 'N')

    # Combine the individual porch areas into one feature.
    porch_cols = ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    data['Porch'] = data[porch_cols].sum(axis=1)
    data = data.drop(columns=porch_cols)

    # Fold rare categories into their nearest neighbour.
    data = data.replace({'BsmtFullBath': {3: 2},
                         'LotShape': {'IR3': 'IR2'}})

    # Debugging aid: data.columns[data.isnull().any()] lists the columns that
    # still contain NA values at this point.
    return data
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号