def get_features(df_features):
print('use w2v to document presentation')
now = datetime.datetime.now()
print now.strftime('%Y-%m-%d %H:%M:%S')
#df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
print('get_w2v')
now = datetime.datetime.now()
print now.strftime('%Y-%m-%d %H:%M:%S')
df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
print('z_dist')
now = datetime.datetime.now()
print now.strftime('%Y-%m-%d %H:%M:%S')
#df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_tfidf_cos_sim')
print now.strftime('%Y-%m-%d %H:%M:%S')
#df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_w2v_calc')
print now.strftime('%Y-%m-%d %H:%M:%S')
#df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'],3), axis=1)
df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'],3), axis=1)
df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x:skew(x))
df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x:skew(x))
df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x:kurtosis(x))
df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x:kurtosis(x))
df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x:skew(x))
df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x:skew(x))
df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x:kurtosis(x))
df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x:kurtosis(x))
del df_features['q1_unique_w2v_weight']
del df_features['q2_unique_w2v_weight']
del df_features['q1_unique_w2v']
del df_features['q2_unique_w2v']
print('all done')
print now.strftime('%Y-%m-%d %H:%M:%S')
df_features.fillna(0.0)
return df_features
评论列表
文章目录