def save_vocab(self, path_count, path_vocab, word_limit=100000):
    """Save the master vocabulary to disk as a count file and an index file.

    The special tokens come first (in ``SPECIAL_TOKENS`` order, with a
    placeholder count of -1), followed by the entries of
    ``self.master_vocab`` sorted by descending count. Only the first
    ``word_limit`` entries of that combined ordering are written.

    Args:
        path_count: Destination handed to ``np.savez_compressed``; the
            structured (word, count) array is stored under the key
            ``counts``.
        path_vocab: Path of the JSON file that maps each word to its
            position in the ordering.
        word_limit: Maximum number of entries (special tokens included)
            written to either file.
    """
    # special tokens take the first positions of the vocabulary
    words = OrderedDict()
    for token in SPECIAL_TOKENS:
        # store -1 instead of np.inf, which can overflow
        words[token] = -1

    # sort words by frequency, most frequent first; updating an existing
    # key keeps its original position, so special tokens stay in front
    desc_order = sorted(self.master_vocab.items(),
                        key=lambda kv: kv[1], reverse=True)
    words.update(desc_order)

    # The 'word' field is a fixed-width byte string, so text is encoded
    # explicitly as UTF-8 rather than relying on an implicit ASCII
    # conversion, which fails on non-ASCII tokens.
    records = []
    for word, count in words.items():
        if not isinstance(word, bytes):
            word = word.encode('utf-8')
        records.append((word, count))

    # words are stored in at most 30 bytes (NumPy truncates longer ones);
    # use float to store large numbers (we don't care about precision loss).
    # A real list is required here: a dict view is not a sequence of records.
    np_vocab = np.array(records,
                        dtype=[('word', '|S30'), ('count', 'float')])

    # output count for debugging
    np.savez_compressed(path_count, counts=np_vocab[:word_limit])

    # output the index of each word for easy lookup; materialise the keys
    # as a list because dict key views cannot be sliced
    final_words = OrderedDict(
        (word, index)
        for index, word in enumerate(list(words)[:word_limit]))
    with open(path_vocab, 'w') as f:
        f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
# NOTE: web-page navigation residue ("Comment list", "Table of contents");
# not part of the source code.