def feature_engineering_CNN(df, category, sample=60000, purpose='word', countries=['US', 'BR', 'RU', 'KR']):
    '''
    Prepare a dataframe for CNN modeling, pickle it, and return it.

    function:
        - chains the two user defined steps CNN_feat_eng_pt1 and
          CNN_feat_eng_pt2, then subsets the result according to `purpose`.
        - re-indexes the result by its 'key_id' column.
        - prints out how long the whole run took.
    input:
        - df = dataframe that was converted from raw_data json file
        - category = used to name output pickle file
        - sample = number of datapoints included in the final dataframe.
            (Used only when purpose = 'word')
        - purpose = 'word' or 'country'. prepares data for different purposes.
            'word' for image recognition: draws `sample` rows at random,
            without replacement.
            'country' for country prediction: keeps every row whose
            'countrycode' is listed in `countries`.
            Any other value leaves the engineered dataframe unfiltered.
        - countries = list of country codes used in country prediction.
            Any length is accepted. The default list is only read,
            never mutated.
    output:
        - the dataframe that will be used for CNN modeling (1176 features),
          also pickled to disk
        - each row represents 42 by 28 pixel image
        file name: "./data/{}.pkl".format(category)
    raises:
        - ValueError (from np.random.choice) when purpose = 'word' and
          `sample` exceeds the number of available rows.
    '''
    start_time = time.time()

    # Run both CNN feature engineering stages back to back.
    df_1 = CNN_feat_eng_pt1(df)
    df_2 = CNN_feat_eng_pt2(df_1)

    if purpose == 'word':
        # Renumber rows 0..n-1 so the sampled integers address rows directly.
        # (Replaces the Python-2-only `xrange` index assignment.)
        df_2 = df_2.reset_index(drop=True)
        random_ind = np.random.choice(len(df_2), sample, replace=False)
        df_2 = df_2.iloc[random_ind]
    elif purpose == 'country':
        # Collect all datapoints from the selected countries. `isin` works
        # for any number of country codes instead of exactly four.
        df_2 = df_2[df_2['countrycode'].isin(countries)]

    # Index by key_id while keeping key_id available as a regular column.
    df_2.index = df_2['key_id']
    df_2.to_pickle("./data/{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2
##############################################################################
# functions for feature engineering for ensemble methods #
##############################################################################
feature_engineering_func.py 文件源码
python
阅读 36
收藏 0
点赞 0
评论 0
评论列表
文章目录