def pre_processData(train_data: "pd.DataFrame", file_path: str) -> "pd.DataFrame":
    """Impute, one-hot encode and standardise a passenger table.

    Steps, in order:

    1. Missing ``Age`` values are replaced by the mean of the column.
    2. ``Cabin`` is collapsed to ``'yes'`` (value present) / ``'no'`` (missing).
    3. ``Cabin``, ``Embarked``, ``Pclass`` and ``Sex`` are one-hot encoded and
       those source columns, plus ``Name`` and ``Ticket``, are dropped.
    4. The table is written to ``<file_path>/pre_processData1.csv``.
    5. Missing ``Fare`` values are replaced by the column mean, then ``Age``
       and ``Fare`` are standardised with ``StandardScaler``.
    6. The table is written to ``<file_path>/pre_processData_scaled.csv``.

    Args:
        train_data: Frame containing at least the columns ``Age``, ``Cabin``,
            ``Embarked``, ``Sex``, ``Pclass``, ``Name``, ``Ticket`` and
            ``Fare``. It is not modified; the function works on a copy.
        file_path: Directory the two CSV files are written into.

    Returns:
        The processed copy of ``train_data``.
    """
    # Work on a copy so the caller's frame is left untouched.
    train_data = train_data.copy()

    # Fill missing ages with the mean age (Series.mean skips NaN).
    train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

    # Only the presence of a cabin entry is kept, not its value.
    train_data['Cabin'] = np.where(train_data['Cabin'].notnull(), 'yes', 'no')

    # One indicator column per category, named "<column>_<value>".
    encoded_columns = ['Cabin', 'Embarked', 'Pclass', 'Sex']
    dummies = [
        pd.get_dummies(train_data[column], prefix=column)
        for column in encoded_columns
    ]
    train_data = pd.concat([train_data, *dummies], axis=1)

    # Drop the encoded source columns together with Name and Ticket.
    train_data = train_data.drop(
        columns=['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket']
    )

    # First snapshot: values are still unscaled and Fare is not yet imputed.
    _save_frame(train_data, file_path + r'/pre_processData1.csv')

    # Fill missing fares with the mean fare before scaling.
    train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())

    # Standardise the two continuous columns (Age and Fare).
    for column in ('Age', 'Fare'):
        _standardise_column(train_data, column)

    # Second snapshot: same table with Age and Fare standardised.
    _save_frame(train_data, file_path + r'/pre_processData_scaled.csv')
    return train_data


def _save_frame(frame, path):
    """Write *frame* to *path* via ``np.savetxt`` with the column names as header."""
    # np.savetxt's defaults are kept on purpose: values use its default float
    # format and the header line gets the default comment prefix.
    header_string = ','.join(frame.columns.tolist())
    np.savetxt(path, frame, delimiter=',', header=header_string)


def _standardise_column(frame, column):
    """Replace *column* of *frame* in place with its StandardScaler output."""
    # StandardScaler only accepts 2-D input, hence the one-column frame;
    # ravel() flattens the (n, 1) result back into a single column.
    # A fresh scaler per column keeps the fitted statistics independent.
    frame[column] = StandardScaler().fit_transform(frame[[column]]).ravel()
## Feature engineering
# Comment list
# Table of contents