def build_vocabulary(words, max_size):
    """Build forward and reverse word/id lookup tables from an iterable of words.

    The ``cfg.vocabulary_size - 2`` most frequent words are kept and given
    consecutive integer ids starting at 2, in order of decreasing frequency.
    Id 0 is assigned to ``cfg.padding_char`` and id 1 to
    ``cfg.placeholder_char``.

    NOTE(review): ``max_size`` is never read in this function; the cutoff is
    taken from ``cfg.vocabulary_size`` instead -- confirm whether callers
    expect ``max_size`` to be honored.

    Returns:
        A 4-tuple ``(n_unique, vocab_instances, vocabulary, rev_vocabulary)``:
        the number of distinct words in ``words``, the summed occurrence count
        of the kept words, the word -> id ``OrderedDict``, and the id -> word
        ``dict``.
    """
    unique_counts = Counter(words)
    kept_counts = dict(unique_counts.most_common(cfg.vocabulary_size - 2))

    # Order the kept (word, count) pairs by decreasing count; the sort is
    # stable, so ties keep the order produced by most_common().
    ranked_pairs = sorted(
        kept_counts.items(), key=lambda pair: pair[1], reverse=True
    )
    vocabulary = OrderedDict(ranked_pairs)

    progress = Progress_bar(len(kept_counts) - 1)
    vocab_instances = 0
    next_id = 2  # ids 0 and 1 are reserved for padding & unknown
    for word, count in ranked_pairs:
        vocab_instances += count
        # Overwrite the stored count with the word's integer id.
        vocabulary[word] = next_id
        next_id += 1
        progress.tick()

    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1

    # Reverse the vocabulary (for id -> word lookup).
    rev_vocabulary = dict(zip(vocabulary.values(), vocabulary.keys()))

    return (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
preprocess_data.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录