def _print_timestamp():
    """Print the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def _stem_tokens(text, stemmer):
    """Return the stem of every space-separated token of the lower-cased text."""
    # Stem token by token: handing the whole sentence to the stemmer treats it
    # as one long "word", so only its tail would ever be stemmed.
    return [stemmer.stem(token) for token in str(text).lower().split(' ')]


def _extract_nouns(text):
    """Return the lower-cased tokens whose POS tag starts with ``N``."""
    tagged = nltk.pos_tag(nltk.word_tokenize(str(text).lower()))
    return [word for word, tag in tagged if tag.startswith('N')]


def _count_shared(words, other_words):
    """Count the entries of ``words`` (duplicates included) present in ``other_words``."""
    lookup = set(other_words)  # constant-time membership instead of a list scan
    return sum(1 for word in words if word in lookup)


def _tfidf_weight_stats(text):
    """Return ``(sum, mean, count)`` of the values ``tfidf.transform`` stores for ``text``.

    NOTE(review): ``tfidf`` is a module-level object defined outside this
    block; presumably a fitted TF-IDF vectorizer whose ``transform`` returns a
    sparse matrix -- confirm.
    """
    weights = tfidf.transform([str(text)]).data
    count = len(weights)
    # With no stored weights the mean is undefined; return NaN explicitly
    # (get_features replaces it with 0.0) rather than letting numpy warn
    # about the mean of an empty array.
    mean = np.mean(weights) if count else np.nan
    return np.sum(weights), mean, count


def get_features(df_features):
    """Add hand-crafted question-pair feature columns to ``df_features``.

    The frame must provide ``question1`` and ``question2`` columns. It is
    modified in place; the following columns are added, in this order:

    * ``q1_stem`` / ``q2_stem``: per-token Porter stems of each question.
    * ``z_stem_tfidf``: ``tfidf_word_match_share`` of the two stem lists.
    * ``question1_nouns`` / ``question2_nouns``: tokens tagged as nouns.
    * ``z_noun_match``: number of question1 nouns also found in question2.
    * ``z_len1`` / ``z_len2``: character length of each question.
    * ``z_word_len1`` / ``z_word_len2``: whitespace-separated token counts.
    * ``z_match_ratio``: ``diff_ratios(question1, question2)``.
    * ``z_word_match``: ``word_match_share`` applied to each raw row.
    * ``z_tfidf_sum*`` / ``z_tfidf_mean*`` / ``z_tfidf_len*``: sum, mean and
      count of the stored TF-IDF weights of each question.

    Progress messages (some preceded by a timestamp) are printed to stdout.

    Relies on names defined elsewhere in the module: ``tfidf``,
    ``tfidf_word_match_share``, ``word_match_share`` and ``diff_ratios``.

    Returns:
        A copy of the augmented frame with every missing value replaced by 0.0.
    """
    _print_timestamp()
    print("stem_tfidf")
    # One stemmer instance is enough; building one per row is wasted work.
    stemmer = nltk.PorterStemmer()
    df_features['q1_stem'] = df_features.question1.map(
        lambda x: _stem_tokens(x, stemmer))
    df_features['q2_stem'] = df_features.question2.map(
        lambda x: _stem_tokens(x, stemmer))
    df_features['z_stem_tfidf'] = df_features.apply(
        lambda r: tfidf_word_match_share(r.q1_stem, r.q2_stem), axis=1)

    _print_timestamp()
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(_extract_nouns)
    df_features['question2_nouns'] = df_features.question2.map(_extract_nouns)
    df_features['z_noun_match'] = df_features.apply(
        lambda r: _count_shared(r.question1_nouns, r.question2_nouns),
        axis=1)  # takes long

    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(
        lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(
        lambda x: len(str(x).split()))

    _print_timestamp()
    print('difflib...')
    df_features['z_match_ratio'] = df_features.apply(
        lambda r: diff_ratios(r.question1, r.question2), axis=1)  # takes long

    print('word match...')
    # raw=True hands each row to word_match_share as a plain ndarray.
    df_features['z_word_match'] = df_features.apply(
        word_match_share, axis=1, raw=True)

    print('tfidf...')
    # Transform each question once and derive all three statistics from the
    # same weight vector instead of re-running the transform per statistic.
    q1_stats = df_features.question1.map(_tfidf_weight_stats)
    q2_stats = df_features.question2.map(_tfidf_weight_stats)
    df_features['z_tfidf_sum1'] = q1_stats.map(lambda s: s[0])
    df_features['z_tfidf_sum2'] = q2_stats.map(lambda s: s[0])
    df_features['z_tfidf_mean1'] = q1_stats.map(lambda s: s[1])
    df_features['z_tfidf_mean2'] = q2_stats.map(lambda s: s[1])
    df_features['z_tfidf_len1'] = q1_stats.map(lambda s: s[2])
    df_features['z_tfidf_len2'] = q2_stats.map(lambda s: s[2])

    return df_features.fillna(0.0)
# [web page residue] Comment list
# [web page residue] Table of contents