def feature_engineering_ensemble(df, category, sample=60000, purpose='word',
                                 countries=('US', 'BR', 'RU', 'KR')):
    '''
    Build and pickle the feature dataframe used for ensemble method modeling.

    function:
    - runs feature_eng_pt1 through feature_eng_pt5 on a google quickdraw
      raw data dataframe and concatenates their outputs column-wise
      (the original notes state the result contains 404 features)
    - selects the rows to keep according to `purpose`
    - indexes the result by its 'key_id' column and pickles it
    - prints out how long the whole run took
    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe.
               (Used only when purpose = 'word'.) If the dataframe has
               fewer rows than `sample`, all rows are kept.
    - purpose = 'word' or 'country'. prepares data for different purposes.
               'word' for image recognition: keeps a random subset of rows.
               'country' for country prediction: keeps every row whose
               'countrycode' is listed in `countries`.
               Any other value keeps every row.
    - countries = iterable of country codes used in country prediction
                  (any length)
    output:
    - nothing is returned; the dataframe is pickled to
      "./data/MY_feature_{}.pkl".format(category)
    '''
    start_time = time.time()

    # Run feature_eng_pt1 through pt5; pt4 and pt5 both consume pt3's output.
    df_test1 = feature_eng_pt1(df)
    df_test2 = feature_eng_pt2(df_test1)
    df_test3 = feature_eng_pt3(df_test2)
    df_subset = feature_eng_pt4(df_test3)
    df_subset2 = feature_eng_pt5(df_test3)
    df_final = pd.concat([df_test3, df_subset, df_subset2], axis=1)

    if purpose == 'word':
        # Draw row positions without replacement. The draw size is capped so
        # a dataframe smaller than `sample` no longer raises ValueError.
        n_rows = len(df_final)
        n_keep = min(sample, n_rows)
        random_pos = np.random.choice(n_rows, n_keep, replace=False)
        df_final = df_final.iloc[random_pos]
    elif purpose == 'country':
        # Keep every datapoint from the selected countries; isin handles a
        # list of any length instead of exactly four entries.
        df_final = df_final[df_final['countrycode'].isin(list(countries))]

    # NOTE(review): the scraped source lost its indentation; this re-index is
    # assumed to apply to every purpose, not only 'country' -- confirm.
    df_final.index = df_final['key_id']
    df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
feature_engineering_func.py 文件源码
python
阅读 29
收藏 0
点赞 0
评论 0
评论列表
文章目录