1-train-CBOW.py 文件源码-python代码片段

1-train-CBOW.py 文件源码

python

阅读 52 收藏 0 点赞 0 评论 0

项目：Deep-Learning-with-Theano 作者: PacktPublishing 项目源码文件源码

def build_dictionary(words, max_df=5):
    """Build word/index lookup tables, folding rare words into the unknown token.

    Index 0 is reserved for ``unkown_token`` and index 1 for ``pad_token``.
    Every other word whose frequency is at least ``max_df`` receives the next
    free index, in order of decreasing frequency. Words below the threshold
    are mapped to the unknown index in ``word2idx`` and get no entry of their
    own in ``idx2word``.

    Args:
        words: iterable of tokens to count (presumably a flat sequence of
            strings -- ``itertools.chain`` over a single iterable simply
            iterates it; confirm against the caller).
        max_df: minimum frequency a word needs to keep its own index.
            Despite the name it acts as a lower bound, not an upper one.

    Returns:
        A ``(word2idx, idx2word, word_freq)`` tuple. ``word_freq`` is an
        ``OrderedDict`` whose first two keys are the reserved tokens, followed
        by the counted words from most to least common. The unknown token's
        entry is increased by the number of distinct words that fell below
        the threshold.
    """
    # The reserved entries go first so they lead the ordering. They start at 0
    # rather than a -1 placeholder so the unknown count ends up exact.
    word_freq = [[unkown_token, 0], [pad_token, 0]]
    word_freq.extend(nltk.FreqDist(itertools.chain(words)).most_common())
    word_freq = OrderedDict(word_freq)

    word2idx = {unkown_token: 0, pad_token: 1}
    idx2word = {0: unkown_token, 1: pad_token}
    reserved = (unkown_token, pad_token)
    unknown_idx = word2idx[unkown_token]

    next_idx = len(word2idx)
    rare_words = 0
    for word, freq in word_freq.items():
        # Reserved tokens are not corpus words: running them through the
        # threshold test would remap the pad token onto the unknown index
        # (or hand them a second index when max_df <= 0).
        if word in reserved:
            continue
        if freq >= max_df:
            word2idx[word] = next_idx
            idx2word[next_idx] = word
            next_idx += 1
        else:
            # Map the rare word onto the unknown token.
            word2idx[word] = unknown_idx
            rare_words += 1

    # Record how many distinct words were folded into the unknown token.
    # Done after the loop so the mapping is not written to while iterating.
    word_freq[unkown_token] += rare_words

    return word2idx, idx2word, word_freq