import collections
import os

import nltk


def maybe_build_vocab(reuters_dir, vocab_file):
    # Load the word -> id mapping from vocab_file if it exists;
    # otherwise build it from the Reuters corpus and cache it there.
    # Unseen words map to id 0 because vocab is a defaultdict(int).
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        # text mode, not "rb": the file holds tab-separated strings
        with open(vocab_file, "r") as fvoc:
            for line in fvoc:
                word, idx = line.strip().split("\t")
                vocab[word] = int(idx)
    else:
        # count word frequencies across all labeled documents;
        # stream_reuters_documents() and VOCAB_SIZE are defined
        # elsewhere in sent-thoughts-parse.py
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs".format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                # skip documents without topic labels
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            num_docs_read += 1
        # assign ids 1..VOCAB_SIZE to the most frequent words, once,
        # after the full pass; rebuilding inside the loop (as the
        # original did) leaves stale ids from earlier snapshots in
        # the defaultdict
        for i, (word, _) in enumerate(counter.most_common(VOCAB_SIZE)):
            vocab[word] = i + 1
        print("vocab built from {:d} docs, complete".format(num_docs_read))
        with open(vocab_file, "w") as fvoc:
            for word, idx in vocab.items():
                fvoc.write("{:s}\t{:d}\n".format(word, idx))
    return vocab
Source file: sent-thoughts-parse.py (Python)
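A quick way to exercise the function, as a minimal sketch: the corpus and vocab paths below are placeholders, and VOCAB_SIZE and stream_reuters_documents() are assumed to be defined earlier in sent-thoughts-parse.py.

import nltk

# build (or load) the vocabulary, then encode one sentence as word ids
vocab = maybe_build_vocab("data/reuters-21578", "data/vocab.tsv")
sent = "Crude oil prices rose sharply."
word_ids = [vocab[w] for w in nltk.word_tokenize(sent.lower())]
# any word outside the top VOCAB_SIZE comes back as 0 (out-of-vocabulary)
print(word_ids)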