# lasso_regression.py -- source file (Python code snippet)

def pre_process(df, reference_year=2016):
    """Fill missing values and engineer features on a housing data frame.

    Steps, in order:
      1. Fill NaNs in a fixed set of columns with constant placeholders.
      2. Replace ``YrSold`` with ``reference_year - YrSold``.
      3. Collapse ``PoolArea`` to a 0/1 flag.
      4. Sum the three porch columns into a new ``Porch`` column and drop
         the originals.
      5. Merge rare categories (``BsmtFullBath`` 3 -> 2, ``LotShape``
         'IR3' -> 'IR2').
      6. Delegate basement NaN handling to ``fill_bsmt_missing``.
      7. Fill remaining NaNs with the column mean, column mode, or a constant.
      8. Print how many rows still contain NaNs.
      9. Apply ``log1p`` to every non-object column whose skewness is
         greater than 0.75.

    Args:
        df: pandas DataFrame with the columns referenced below. It is
            modified in place (``fillna``/``drop``/``replace`` use
            ``inplace=True``), so pass a copy to keep the original intact.
        reference_year: Year that ``YrSold`` is subtracted from.
            Defaults to 2016, the previously hard-coded value.

    Returns:
        The processed DataFrame (whatever ``fill_bsmt_missing`` returned,
        after the later steps were applied to it).
    """
    # NOTE: LotFrontage's N/A is assigned zero; whether that distorts the
    # model is an open question left by the original author.
    df.fillna(
        value={
            'MasVnrType': 'None', 'MasVnrArea': 0, 'Electrical': 'SBrkr',
            'FireplaceQu': 'NoFP', 'GarageType': 'Noga',
            'GarageFinish': 'Noga', 'GarageQual': 'Noga', 'Fence': 'NoFence',
            'BsmtFinSF1': 0, 'BsmtFinSF2': 0, 'BsmtUnfSF': 0,
            'TotalBsmtSF': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0,
            'LotFrontage': 0,
        },
        inplace=True,
    )

    df.loc[:, 'YrSold'] = reference_year - df.loc[:, 'YrSold']

    # Any non-zero pool area becomes 1. NaN != 0 is True, so a missing
    # PoolArea is also set to 1.
    df.loc[df.loc[:, 'PoolArea'] != 0, 'PoolArea'] = 1

    porch_cols = ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    df['Porch'] = df[porch_cols].sum(axis=1)
    df.drop(porch_cols, axis=1, inplace=True)

    df.replace({'BsmtFullBath': {3: 2}, 'LotShape': {'IR3': 'IR2'}}, inplace=True)

    # fill missing values in bsmt
    df = fill_bsmt_missing(df)

    def fill_na(df, col_name, value=None):
        """Fill NaNs of ``col_name`` in place; default to the column mean."""
        if value is None:
            value = df[col_name].mean()
        df.loc[df[col_name].isnull(), col_name] = value

    # NOTE(review): 'Fence' NaNs were already replaced with 'NoFence' above,
    # so this call only matters if fill_bsmt_missing reintroduces NaNs --
    # confirm whether 'WD' is really intended here.
    fill_na(df, 'Fence', 'WD')
    fill_na(df, 'GarageArea')
    fill_na(df, 'GarageCars')
    # Categorical columns: fall back to the most frequent value.
    for col_name in ('SaleType', 'KitchenQual', 'Functional',
                     'Exterior1st', 'Exterior2nd'):
        fill_na(df, col_name, df[col_name].mode().values[0])
    fill_na(df, 'MSZoning', 'RL')

    # Report the rows that still have missing values after all the fills.
    null_mask = df.isnull()
    print('rows containing na:', null_mask.any(axis=1).sum())
    print('rows all na:', null_mask.all(axis=1).sum())

    # log1p-transform skewed numeric features
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    df[skewed_feats] = np.log1p(df[skewed_feats])

    return df

#%%
#log transform the target: ignore for test data
#

#train_data = pre_process(train_df.copy())
#test_data = pre_process(test_df.copy())