def pre_process(df):
    """Clean and transform a house-feature DataFrame, then return it.

    Steps, in order:
      1. Fill selected columns' missing values with fixed placeholders.
      2. Replace ``YrSold`` with ``2016 - YrSold``, set every non-zero
         ``PoolArea`` to 1, and merge the three porch columns into ``Porch``.
      3. Collapse rare categories (``BsmtFullBath`` 3 -> 2,
         ``LotShape`` 'IR3' -> 'IR2').
      4. Hand the frame to ``fill_bsmt_missing`` for the basement columns.
      5. Fill remaining gaps with a constant, the column mean, or the
         column mode.
      6. Print how many rows still contain a missing value and how many
         are entirely missing.
      7. Apply ``log1p`` to every non-object column whose skewness is
         greater than 0.75.

    Args:
        df: pandas DataFrame holding the raw features. It is modified in
            place; pass a copy if the original must be kept.

    Returns:
        The processed DataFrame (the object returned by
        ``fill_bsmt_missing``, further modified by steps 5-7).
    """
    # NOTE: LotFrontage's N/A is assigned zero -- open question from the
    # original author whether this causes problems downstream.
    df.fillna(
        value={
            'MasVnrType': 'None',
            'MasVnrArea': 0,
            'Electrical': 'SBrkr',
            'FireplaceQu': 'NoFP',
            'GarageType': 'Noga',
            'GarageFinish': 'Noga',
            'GarageQual': 'Noga',
            'Fence': 'NoFence',
            'BsmtFinSF1': 0,
            'BsmtFinSF2': 0,
            'BsmtUnfSF': 0,
            'TotalBsmtSF': 0,
            'BsmtFullBath': 0,
            'BsmtHalfBath': 0,
            'LotFrontage': 0,
        },
        inplace=True,
    )

    df['YrSold'] = 2016 - df['YrSold']
    # Any non-zero pool area becomes the flag value 1 (NaN != 0 is True, so
    # a missing PoolArea is flagged as 1 as well).
    df.loc[df['PoolArea'] != 0, 'PoolArea'] = 1

    porch_cols = ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    df['Porch'] = df[porch_cols].sum(axis=1)
    df.drop(porch_cols, axis=1, inplace=True)

    df.replace({'BsmtFullBath': {3: 2}, 'LotShape': {'IR3': 'IR2'}}, inplace=True)

    # fill missing values in bsmt
    df = fill_bsmt_missing(df)

    def fill_na(df, col_name, value=None):
        """Set the missing entries of ``col_name`` to ``value`` (column mean if None)."""
        # Identity check: ``== None`` would broadcast for array-like values.
        if value is None:
            value = df[col_name].mean()
        df.loc[df[col_name].isnull(), col_name] = value

    # Fence NaNs were already replaced by 'NoFence' above, so this only has
    # an effect if fill_bsmt_missing reintroduced missing values.
    fill_na(df, 'Fence', 'WD')
    fill_na(df, 'GarageArea')
    fill_na(df, 'GarageCars')
    # These columns take their most frequent value.
    for col_name in ('SaleType', 'KitchenQual', 'Functional',
                     'Exterior1st', 'Exterior2nd'):
        fill_na(df, col_name, df[col_name].mode().values[0])
    fill_na(df, 'MSZoning', 'RL')

    # Report what is still missing after all fills.
    missing = df.isnull()
    print('rows containing na:', missing.any(axis=1).sum())
    print('rows all na:', missing.all(axis=1).sum())

    # log1p skewed_feats
    numeric_feats = df.dtypes[df.dtypes != "object"].index
    skewness = df[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewness[skewness > 0.75].index
    df[skewed_feats] = np.log1p(df[skewed_feats])
    return df
#%%
#log transform the target: ignore for test data
#
#train_data = pre_process(train_df.copy())
#test_data = pre_process(test_df.copy())
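# [web-page residue, not part of the script: "评论列表" (comment list)]
# [web-page residue, not part of the script: "文章目录" (article contents)]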