def get_features(df_features):
    """Add question-pair similarity features to ``df_features`` in place.

    The frame must provide ``question1`` and ``question2`` columns. The
    following columns are written: ``z_document_dis``, ``q1_unique``,
    ``q2_unique``, ``z_dist``, ``z_tfidf_cos_sim``, ``z_w2v_unique``,
    ``z_w2v_dis_e``, ``z_w2v_dis_mink``, ``z_w2v_dis_cityblock``,
    ``z_w2v_dis_canberra``, ``z_q1_skew``, ``z_q2_skew``, ``z_q1_kur`` and
    ``z_q2_kur``. Two scratch columns (``question1_w2v``/``question2_w2v``)
    are created and dropped again before returning.

    A label and a wall-clock timestamp are printed before each stage.

    Args:
        df_features: pandas DataFrame holding the question pairs.

    Returns:
        The same ``df_features`` object, with the feature columns added and
        every missing value in the frame replaced by ``0.0``.
    """
    _log_stage('use w2v to document presentation')
    df_features['z_document_dis'] = df_features.apply(
        lambda row: getDiff_averge_tfidf(row['question1'], row['question2']),
        axis=1)

    _log_stage('nones')
    df_features['q1_unique'] = df_features.apply(
        lambda row: getdiffwords(row['question1'], row['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(
        lambda row: getdiffwords(row['question2'], row['question1']), axis=1)
    # A POS-tag based noun extraction (nltk) used to sit here; it was already
    # disabled in the original code.
    # NOTE(review): " ".join(x) is kept exactly as it was. If the questions
    # are plain strings (Levenshtein.ratio below suggests so), this inserts a
    # space between every character -- confirm what get_vector_tfidf expects.
    df_features['question1_w2v'] = df_features.question1.map(
        lambda text: get_vector_tfidf(" ".join(text)))
    df_features['question2_w2v'] = df_features.question2.map(
        lambda text: get_vector_tfidf(" ".join(text)))

    _log_stage('z_dist')
    df_features['z_dist'] = df_features.apply(
        lambda row: Levenshtein.ratio(row['question1'], row['question2']),
        axis=1)

    _log_stage('z_tfidf_cos_sim')
    df_features['z_tfidf_cos_sim'] = df_features.apply(
        lambda row: cos_sim(row['question1'], row['question2']), axis=1)

    _log_stage('z_w2v_nones')
    df_features['z_w2v_unique'] = df_features.apply(
        lambda row: w2v_cos_sim_tfidf(row['q1_unique'], row['q2_unique']),
        axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(
        lambda row: spatial.distance.euclidean(
            row['question1_w2v'], row['question2_w2v']),
        axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(
        lambda row: spatial.distance.minkowski(
            row['question1_w2v'], row['question2_w2v'], 3),
        axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(
        lambda row: spatial.distance.cityblock(
            row['question1_w2v'], row['question2_w2v']),
        axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(
        lambda row: spatial.distance.canberra(
            row['question1_w2v'], row['question2_w2v']),
        axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(skew)
    df_features['z_q2_skew'] = df_features.question2_w2v.map(skew)
    df_features['z_q1_kur'] = df_features.question1_w2v.map(kurtosis)
    df_features['z_q2_kur'] = df_features.question2_w2v.map(kurtosis)

    # The per-question vectors were only needed to derive the columns above.
    del df_features['question1_w2v']
    del df_features['question2_w2v']

    # Previously this reused the timestamp taken before the w2v stage; the
    # helper now reports the actual completion time.
    _log_stage('all done')

    # fillna returns a new frame unless inplace=True; the original call
    # discarded that result, so NaNs were never replaced. Filling in place
    # keeps the caller's object identical to the returned one.
    df_features.fillna(0.0, inplace=True)
    return df_features


def _log_stage(label):
    """Print ``label`` followed by the current local time on its own line."""
    print(label)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
评论列表
文章目录