import time

from gensim import corpora
from nltk.corpus import stopwords

# Build the stopword set once at import time; set membership keeps the
# per-token lookup O(1).
cachedStopWords = set(stopwords.words("english"))


def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
    t0 = time.time()
    print(" [*] Creating a new vocabulary...")

    # Fall back to the module-level default tokenizer (assumed to be
    # defined elsewhere in the module) when none is given.
    if not sentence_to_tokens_fn:
        sentence_to_tokens_fn = default_sentence_to_tokens

    docs = []
    lines = []
    for line in input_stream:
        rline = line.strip()
        tokens = sentence_to_tokens_fn(rline)
        if '##########' in tokens:
            # A '##########' line marks a document boundary: flush the
            # tokens collected so far as one document.
            docs.append(lines)
            lines = []
        elif len(rline) > 0:
            # Lowercase the tokens and drop stopwords.
            lines += [token.lower() for token in tokens
                      if token.lower() not in cachedStopWords]
    if lines:
        # Flush the last document if the stream lacks a trailing delimiter.
        docs.append(lines)

    # Keep vocab_size - 4 ids; the remaining 4 are presumably reserved for
    # special tokens. max() guards against a vocab_size smaller than 4.
    limit = max(vocab_size - 4, 0)
    vocab = corpora.Dictionary(docs)
    vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)

    print(" [*] Tokenize : %.4fs" % (time.time() - t0))
    return vocab
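
# --- Usage sketch (illustrative, not from the original) ----------------
# Assumes a corpus file whose documents are separated by '##########'
# lines. The file name, tokenizer, and vocab_size below are hypothetical.
def whitespace_tokenizer(sentence):
    return sentence.split()

if __name__ == "__main__":
    with open("corpus.txt") as f:
        vocab = create_vocabulary(f, vocab_size=10000,
                                  sentence_to_tokens_fn=whitespace_tokenizer)
    print(len(vocab))        # number of kept ids (at most vocab_size - 4)
    print(vocab.token2id)    # gensim's token -> integer id mapping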