transact.py 文件源码-python代码片段

transact.py 文件源码

python

阅读 29 收藏 0 点赞 0 评论 0

项目：banking-class 作者: eli-goodfriend 项目源码文件源码

def make_word_feature(df,embeddings):
    """Vectorize each merchant description by averaging its word embeddings.

    Each merchant string is tokenized, lower-cased and looked up word by
    word in ``embeddings``.  The vectors of all words that are found are
    averaged; words that are missing from ``embeddings`` are skipped
    individually, so one unknown word no longer discards the rest of the
    description.  Averaging is only one way of combining word vectors, see
    http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

    Args:
        df: object with a ``merchant`` attribute exposing ``tolist()``
            (e.g. a pandas DataFrame with a ``merchant`` column).
        embeddings: word -> vector lookup supporting ``embeddings[word]``.
            It must contain the word ``'food'``, which is used to discover
            the vector length.  Unknown words are assumed to raise
            ``KeyError`` -- TODO confirm for the embedding type in use.

    Returns:
        A ``(number of merchants, vector length)`` float array.  Rows for
        merchants that are not text (e.g. None/NaN) or that contain no
        known word are left as all zeros.

    Raises:
        KeyError: if ``'food'`` is not present in ``embeddings``.
        Exception: errors raised by the tokenizer itself (for example a
            missing tokenizer resource) now propagate instead of silently
            producing all-zero features.
    """
    merchants = df.merchant.tolist()
    # 'food' is probed only to learn the dimensionality of the embeddings
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants),veclen))
    # type('') and type(u'') are both str on Python 3, and str/unicode on
    # Python 2, so text is recognised on either interpreter
    text_types = (type(''), type(u''))
    for idx, merchant in enumerate(merchants):
        if not isinstance(merchant, text_types):
            # missing values (None/NaN) cannot be tokenized; keep zero row
            continue
        num_known = 0
        for word in tokenize.word_tokenize(merchant):
            try:
                wordvec = embeddings[word.lower()]
            except KeyError:
                # out-of-vocabulary word: skip it, keep the remaining words
                continue
            word_feature[idx,:] += wordvec
            num_known += 1
        if num_known:
            # turn the running sum into the mean of the known word vectors
            word_feature[idx,:] /= float(num_known)

    return word_feature