def __do_one_hot_encodings(self):
    """One-hot encode the label-encoded columns of the train and test sets.

    Reads the train entry (a ``(frame, cv)`` pair) and the two test frames
    from ``self.res_data_dict``, fits one ``OneHotEncoder`` on the stacked
    rows of all three frames, and writes the transformed results back under
    the same keys. The train entry is stored again as a ``(frame, cv)``
    tuple with ``cv`` passed through unchanged.

    The columns to encode are the values of the mapping returned by
    ``self.__get_label_encode_dict()``.
    """
    train_key = g_singletonDataFilePath.getTrainDir()
    test1_key = g_singletonDataFilePath.getTest1Dir()
    test2_key = g_singletonDataFilePath.getTest2Dir()

    df_train, cv = self.res_data_dict[train_key]
    df_testset1 = self.res_data_dict[test1_key]
    df_testset2 = self.res_data_dict[test2_key]

    # NOTE(review): scikit-learn 1.2 renamed ``sparse`` to ``sparse_output``;
    # left as-is for the library version this file targets.
    enc = OneHotEncoder(sparse=False)

    # Only the values (the new feature names) are needed. ``.values()`` works
    # on both Python 2 and Python 3, unlike ``iteritems()``.
    cross_feature_dict = self.__get_label_encode_dict()
    to_be_encoded = list(cross_feature_dict.values())

    # Fit on all data sources together, so the encoder is fitted on the
    # values present in any of the three frames.
    to_be_stacked_df = pd.concat(
        [
            df_train[to_be_encoded],
            df_testset1[to_be_encoded],
            df_testset2[to_be_encoded],
        ],
        axis=0,
    )
    enc.fit(to_be_stacked_df)

    # The helper may hand back a different encoder and column list;
    # presumably it drops columns whose encoding is too wide -- confirm
    # against the helper itself.
    enc, to_be_encoded = self.__filter_too_big_onehot_encoding(
        enc, to_be_encoded, df_train, df_testset1, df_testset2
    )

    # Transform each data source separately with the shared encoder.
    self.res_data_dict[train_key] = (
        self.__do_one_hot_encoding(df_train, enc, to_be_encoded),
        cv,
    )
    self.res_data_dict[test1_key] = self.__do_one_hot_encoding(
        df_testset1, enc, to_be_encoded
    )
    self.res_data_dict[test2_key] = self.__do_one_hot_encoding(
        df_testset2, enc, to_be_encoded
    )
评论列表
文章目录