def make_word_feature(df, embeddings):
    """Vectorize each merchant description by averaging its word embeddings.

    Every entry of ``df.merchant`` is tokenized and lower-cased; the vectors
    of the tokens found in ``embeddings`` are summed and divided by the
    number of known tokens. Rows with no known token, or whose merchant is
    missing / not text, are left as all zeros.

    Averaging is only one way to combine word vectors; other options:
    http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

    Relies on ``np`` and ``tokenize`` (providing ``word_tokenize``) being
    available at module level -- presumably numpy and nltk.tokenize; confirm
    against the file's imports.

    Args:
        df: Object exposing a ``merchant`` attribute with ``.tolist()``.
        embeddings: Word -> vector lookup. Must contain the word 'food'
            (used to discover the vector length) and is expected to raise
            ``KeyError`` for unknown words.

    Returns:
        Array of shape ``(number of merchants, vector length)``.
    """
    merchants = df.merchant.tolist()
    # 'food' is only a probe word used to learn the embedding dimension.
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants), veclen))
    for idx, merchant in enumerate(merchants):
        # Missing descriptions show up as None or float NaN; keep a zero row.
        if merchant is None or isinstance(merchant, float):
            continue
        try:
            words = tokenize.word_tokenize(merchant)
        except TypeError:
            # Non-text value: best-effort, leave this row as zeros.
            continue
        num_known = 0
        for word in words:
            # Look up word by word so that one out-of-vocabulary token does
            # not discard the remaining tokens of the same merchant.
            try:
                wordvec = embeddings[word.lower()]
            except KeyError:
                continue
            word_feature[idx, :] += wordvec
            num_known += 1
        if num_known:
            word_feature[idx, :] /= float(num_known)
    return word_feature
# Comment list
# Table of contents