def image_identification_datasetup(df1, df2, sample=30000):
    '''
    Function:
    - takes two dataframes (each is expected to be an output dataframe
      from "feature_engineering_CNN" of feature_engineering_func.py) and
      combines them into one.
    - it also creates a label pd.Series for CNN image recognition
      (1 for rows taken from df1, 0 for rows taken from df2).
    filter applied:
    - "sample" determines the number of rows extracted from each
      dataframe. For instance if sample = 30000, 30000 rows are randomly
      chosen (without replacement) from df1 and 30000 from df2.
    - the "countrycode" and "word" columns are removed from the result.
    inputs:
        df1, df2 = the two dataframes; both must contain the
                   "countrycode" and "word" columns
        sample = number of rows you want to extract from each dataframe
    outputs:
        (dataframe, label): the sampled df1 rows followed by the sampled
        df2 rows, and a pd.Series of 1/0 labels sharing the dataframe's
        index.
    raises:
        ValueError if sample is larger than the number of rows in either
        dataframe (raised by np.random.choice with replace=False).
        KeyError if "countrycode" or "word" is missing (raised by drop).
    '''
    # Sample by row position rather than by index label: label-based
    # selection with .loc returns every row sharing a label, so a dataframe
    # with repeated index values would yield more than `sample` rows and the
    # label vector would no longer line up with the data.
    positions1 = np.random.choice(len(df1), sample, replace=False)
    positions2 = np.random.choice(len(df2), sample, replace=False)
    df1 = df1.iloc[positions1]
    df2 = df2.iloc[positions2]

    df_test = pd.concat([df1, df2], axis=0)
    df_test = df_test.drop(['countrycode', 'word'], axis=1)

    # 1 = df1, 0 = df2; the order matches the concat order above.
    label = np.array([1] * sample + [0] * sample)
    label = pd.Series(label)
    label.index = df_test.index
    return df_test, label
cnn_func.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录