# string_base.py source file - Python code snippet

def _print_timestamp():
    """Print the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # A single-argument print(...) call behaves identically under the
    # Python 2 print statement and the Python 3 print function.
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def _add_pair_feature(df_features, column, func, left, right):
    """Set df_features[column] to func(row[left], row[right]) for each row."""
    df_features[column] = df_features.apply(
        lambda row: func(row[left], row[right]), axis=1)


def makeFeature(df_features):
    """Add sentence-vector and overlap feature columns to ``df_features``.

    The frame is modified in place and the same object is returned.

    Columns read: ``question1``/``question2``, ``q1_expand``/``q2_expand``,
    and the ``question{1,2}_nouns``, ``question{1,2}_verbs`` and
    ``question{1,2}_adjs`` pairs.

    Columns written, in this order:

    * ``vec1``, ``vec2`` -- ``getVec`` applied to the expanded questions.
    * ``f_cosine``, ``f_manhatton``, ``f_euclidean``, ``f_pearson``,
      ``f_spearman``, ``f_kendall`` -- computed from ``vec1``/``vec2``.
    * ``f_cosine_w2v``, ``f_euclidean_w2v``, ``f_manhatton_w2v`` --
      ``getfromw2v`` on the expanded questions with the given metric.
    * ``f_<group>_<metric>`` for group in raw/expand/nouns/verbs/adjs and
      metric in jaccarc/dice/ochiai.
    * ``f_weighted_overlap`` -- ``weighted_Overlap`` on the expanded
      questions using the result of ``word_weights(df_features)``.

    Finally every NaN in the frame (input columns included) is replaced
    with 0.0.

    NOTE: loads 'GoogleNews-vectors-negative300.bin' from the current
    working directory on every call.
    """
    _print_timestamp()
    print('get sentence vector')
    # Alternatives previously tried here: 'glove.6B.300d.txt' loaded with
    # binary=False, and Word2Vec(brown.sents()).
    model = KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True)
    df_features['vec1'] = df_features.q1_expand.map(lambda x: getVec(x, model))
    df_features['vec2'] = df_features.q2_expand.map(lambda x: getVec(x, model))

    _print_timestamp()
    print('get six kinds of coefficient about vector')
    # Tuples (not dicts) so the column creation order is preserved.
    vector_metrics = (
        ('f_cosine', Cosine),
        ('f_manhatton', Manhatton),
        ('f_euclidean', Euclidean),
        ('f_pearson', PearsonSimilar),
        ('f_spearman', SpearmanSimilar),
        ('f_kendall', KendallSimilar),
    )
    for column, metric in vector_metrics:
        _add_pair_feature(df_features, column, metric, 'vec1', 'vec2')

    _print_timestamp()
    print('get 3 kinds of coefficient about from w2c 2 document')
    w2v_metrics = (
        ('f_cosine_w2v', Cosine),
        ('f_euclidean_w2v', Euclidean),
        ('f_manhatton_w2v', Manhatton),
    )
    for column, metric in w2v_metrics:
        # Bind metric as a default so the lambda never sees a later value.
        df_features[column] = df_features.apply(
            lambda row, metric=metric: getfromw2v(
                row['q1_expand'], row['q2_expand'], metric, model),
            axis=1)

    _print_timestamp()
    print('get three kinds of coefficient about nouns, verb, adj')
    column_groups = (
        ('raw', 'question1', 'question2'),
        ('expand', 'q1_expand', 'q2_expand'),
        ('nouns', 'question1_nouns', 'question2_nouns'),
        ('verbs', 'question1_verbs', 'question2_verbs'),
        ('adjs', 'question1_adjs', 'question2_adjs'),
    )
    overlap_metrics = (
        ('jaccarc', Jaccarc),
        ('dice', Dice),
        ('ochiai', Ochiai),
    )
    for group, left, right in column_groups:
        for suffix, metric in overlap_metrics:
            _add_pair_feature(
                df_features, 'f_%s_%s' % (group, suffix), metric, left, right)

    _print_timestamp()
    print('get weighted overlap about expand')
    weights = word_weights(df_features)
    df_features['f_weighted_overlap'] = df_features.apply(
        lambda row: weighted_Overlap(
            row['q1_expand'], row['q2_expand'], weights),
        axis=1)

    print('all done')
    _print_timestamp()
    # fillna returns a new frame unless inplace=True; the original call
    # discarded that result, so NaNs were never actually replaced.
    df_features.fillna(0.0, inplace=True)
    return df_features