def build_vocab(words, vocab_size):
    """Build a vocabulary of the ``vocab_size`` most frequent words.

    Index 0 is reserved for the ``'UNK'`` placeholder; the remaining
    ``vocab_size - 1`` indices are assigned to the most common entries of
    ``words``, most frequent first. As a side effect, the first 1000
    vocabulary entries are written, one per line, to
    ``processed/vocab_1000.tsv``.

    Args:
        words: Iterable of word tokens. Tokens are concatenated with a
            newline when written to the file, so they must be strings.
        vocab_size: Total number of vocabulary entries, including 'UNK'.

    Returns:
        A ``(dictionary, index_dictionary)`` tuple: ``dictionary`` maps each
        word to its integer index and ``index_dictionary`` is the inverse
        mapping from index to word.
    """
    counts = Counter(words)
    # A literal 'UNK' token in the input would otherwise be assigned a second
    # index, overwriting the reserved id 0 in `dictionary` and leaving
    # `index_dictionary` without an entry for 0. Fold it into the placeholder.
    counts.pop('UNK', None)

    vocab = ['UNK']
    vocab.extend(word for word, _ in counts.most_common(vocab_size - 1))

    # NOTE(review): presumably creates the output directory if it is missing;
    # utils.make_dir is defined outside this block -- confirm.
    utils.make_dir('processed')
    # Explicit encoding so non-ASCII words are written identically on every
    # platform instead of depending on the locale's default codec.
    with open('processed/vocab_1000.tsv', "w", encoding="utf-8") as f:
        # Only the first 1000 entries go to the file (matches the file name).
        f.writelines(word + "\n" for word in vocab[:1000])

    dictionary = {word: index for index, word in enumerate(vocab)}
    index_dictionary = dict(enumerate(vocab))
    return dictionary, index_dictionary
# Comment list
# Article table of contents