# baseline.py source file - Python code snippet

def _print_timestamp():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS' for progress logs."""
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def _stem_tokens(text, stem):
    """Lower-case *text*, split it on single spaces and stem each token.

    Empty tokens (produced by consecutive spaces) are passed through
    untouched so the token list keeps the same shape as a plain split.
    """
    return [stem(token) if token else token
            for token in str(text).lower().split(' ')]


def _noun_tokens(text):
    """Return the lower-cased tokens of *text* whose POS tag starts with 'N'."""
    tagged = nltk.pos_tag(nltk.word_tokenize(str(text).lower()))
    return [word for word, tag in tagged if tag[:1] == 'N']


def _shared_count(words, other_words):
    """Count entries of *words* (duplicates included) present in *other_words*."""
    lookup = set(other_words)  # O(1) membership instead of scanning a list
    return sum(1 for word in words if word in lookup)


def get_features(df_features):
    """Add hand-crafted similarity features for a pair of question columns.

    ``df_features`` must expose ``question1`` and ``question2`` columns.
    The feature columns are added to ``df_features`` in place; the value
    returned is ``df_features.fillna(0.0)``, i.e. a copy with missing
    values replaced by 0.0.

    Columns added, in this order: ``q1_stem``, ``q2_stem``,
    ``z_stem_tfidf``, ``question1_nouns``, ``question2_nouns``,
    ``z_noun_match``, ``z_len1``, ``z_len2``, ``z_word_len1``,
    ``z_word_len2``, ``z_match_ratio``, ``z_word_match``,
    ``z_tfidf_sum1``, ``z_tfidf_sum2``, ``z_tfidf_mean1``,
    ``z_tfidf_mean2``, ``z_tfidf_len1``, ``z_tfidf_len2``.

    Relies on names defined elsewhere in the module:
    ``tfidf_word_match_share``, ``diff_ratios``, ``word_match_share`` and
    ``tfidf`` (presumably a fitted TF-IDF vectorizer -- only
    ``tfidf.transform([...]).data`` is used here; confirm against its
    definition).

    Progress messages and timestamps are printed to stdout.
    """
    _print_timestamp()
    print("stem_tfidf")
    # Build the stemmer once and stem token by token.  Stemming the whole
    # sentence as a single "word" and splitting afterwards leaves the
    # individual tokens effectively unstemmed.
    stem = nltk.PorterStemmer().stem
    df_features['q1_stem'] = df_features.question1.map(
        lambda x: _stem_tokens(x, stem))
    df_features['q2_stem'] = df_features.question2.map(
        lambda x: _stem_tokens(x, stem))
    df_features['z_stem_tfidf'] = df_features.apply(
        lambda r: tfidf_word_match_share(r.q1_stem, r.q2_stem), axis=1)

    _print_timestamp()
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(_noun_tokens)
    df_features['question2_nouns'] = df_features.question2.map(_noun_tokens)
    # POS tagging above is the slow part of this stage.
    df_features['z_noun_match'] = df_features.apply(
        lambda r: _shared_count(r.question1_nouns, r.question2_nouns), axis=1)

    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(
        lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(
        lambda x: len(str(x).split()))

    _print_timestamp()
    print('difflib...')
    # Row-wise string comparison; slow on large frames.
    df_features['z_match_ratio'] = df_features.apply(
        lambda r: diff_ratios(r.question1, r.question2), axis=1)

    print('word match...')
    # raw=True hands each row to word_match_share as a plain ndarray, so the
    # column order built up above must stay as it is.
    df_features['z_word_match'] = df_features.apply(
        word_match_share, axis=1, raw=True)

    print('tfidf...')
    # Transform each question once and derive all three statistics from the
    # same non-zero weight array instead of re-running the vectorizer.
    weights1 = [tfidf.transform([str(x)]).data for x in df_features.question1]
    weights2 = [tfidf.transform([str(x)]).data for x in df_features.question2]
    df_features['z_tfidf_sum1'] = [np.sum(w) for w in weights1]
    df_features['z_tfidf_sum2'] = [np.sum(w) for w in weights2]
    # np.mean of an empty weight array is NaN; the fillna below maps it to 0.0
    # in the returned frame.
    df_features['z_tfidf_mean1'] = [np.mean(w) for w in weights1]
    df_features['z_tfidf_mean2'] = [np.mean(w) for w in weights2]
    df_features['z_tfidf_len1'] = [len(w) for w in weights1]
    df_features['z_tfidf_len2'] = [len(w) for w in weights2]
    return df_features.fillna(0.0)