def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
    """Build a word -> index mapping ordered by descending frequency.

    Words are ranked by their count in ``word_freq`` (highest first) and
    assigned consecutive indices starting at ``start_idx``. Words with equal
    counts keep the iteration order of ``word_freq`` because the sort is
    stable.

    Args:
        word_freq: Mapping of word -> occurrence count.
        threshold: Minimum count a word needs to be kept. Only used when
            ``topn`` is falsy (``None`` or ``0``).
        topn: If truthy, keep only the ``topn`` most frequent words and
            ignore ``threshold``.
        start_idx: Index assigned to the most frequent word.

    Returns:
        A dict mapping each kept word to its integer index. Empty when
        ``word_freq`` is empty or no word satisfies the selection rule.
    """
    # ``items()`` exists on both Python 2 and 3 dicts (``iteritems`` does not).
    ranked = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)

    if topn:
        # Slice first, then enumerate: avoids subscripting a ``zip`` object
        # (an iterator on Python 3) and copes with an empty ``word_freq``.
        return {
            word: idx
            for idx, (word, _) in enumerate(ranked[:topn], start_idx)
        }

    vocab_dict = {}
    for idx, (word, freq) in enumerate(ranked, start_idx):
        if freq < threshold:
            # ``ranked`` is in descending order, so every remaining word is
            # below the threshold as well.
            break
        vocab_dict[word] = idx
    return vocab_dict
评论列表
文章目录