def data_preprocess(train, test):
outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
train.drop(train.index[outlier_idx], inplace=True)
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
test.loc[:, 'MSSubClass':'SaleCondition']))
to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
all_data = all_data.drop(to_delete, axis=1)
train["SalePrice"] = np.log1p(train["SalePrice"])
# log transform skewed numeric features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(method='ffill')
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
return X_train, X_test, y
ensemble_stacking.py 文件源码
python
阅读 38
收藏 0
点赞 0
评论 0
评论列表
文章目录