def build_dictionary(words, max_df=5):
    """Build vocabulary lookup tables from a sequence of word tokens.

    Index 0 is reserved for ``unkown_token`` and index 1 for ``pad_token``.
    Every word whose corpus frequency is at least ``max_df`` receives its own
    index (assigned in descending-frequency order, starting at 2); every
    other word is mapped to index 0, i.e. treated as unknown.

    Args:
        words: Iterable of hashable word tokens.
        max_df: Frequency threshold. Despite the name, it acts as a *minimum*:
            words occurring fewer than ``max_df`` times are mapped to the
            unknown token. The name is kept for backward compatibility.

    Returns:
        A ``(word2idx, idx2word, word_freq)`` tuple where

        * ``word2idx`` maps every seen word (and both reserved tokens) to an
          integer index,
        * ``idx2word`` maps each assigned index back to its word,
        * ``word_freq`` is an ``OrderedDict`` of word -> corpus frequency with
          the reserved tokens first; the entry for ``unkown_token`` is
          increased by one for each distinct word that was mapped to the
          unknown index.
    """
    reserved = (unkown_token, pad_token)

    # chain() over a single argument simply iterates `words`.
    counts = nltk.FreqDist(itertools.chain(words)).most_common()

    # Reserved tokens come first and start with a zero count.
    word_freq = OrderedDict((token, 0) for token in reserved)
    word_freq.update(counts)

    word2idx = {unkown_token: 0, pad_token: 1}
    idx2word = {0: unkown_token, 1: pad_token}

    idx = 2
    for w, f in counts:
        if w in reserved:
            # Never re-index the reserved tokens: doing so would overwrite
            # the pad index (1) with the unknown index (0), or hand out a
            # second id for a reserved token when the threshold is small.
            continue
        if f >= max_df:
            word2idx[w] = idx
            idx2word[idx] = w
            idx += 1
        else:
            word2idx[w] = 0  # map the rare word onto the unknown token
            word_freq[unkown_token] += 1  # one more distinct unknown word
    return word2idx, idx2word, word_freq
1-train-CBOW.py 文件源码
python
阅读 39
收藏 0
点赞 0
评论 0
评论列表
文章目录