import time

from gensim import corpora
from nltk.corpus import stopwords

# Build the stopword set once at import time; set membership keeps the
# per-token lookup O(1).
cachedStopWords = set(stopwords.words("english"))


def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
    t0 = time.time()
    print(" [*] Creating a new vocabulary...")

    # Fall back to the module-level default tokenizer (assumed to be
    # defined elsewhere in the module) when none is given.
    if not sentence_to_tokens_fn:
        sentence_to_tokens_fn = default_sentence_to_tokens

    docs = []
    lines = []
    for line in input_stream:
        rline = line.strip()
        tokens = sentence_to_tokens_fn(rline)
        if '##########' in tokens:
            # A '##########' line marks a document boundary: flush the
            # tokens collected so far as one document.
            docs.append(lines)
            lines = []
        elif len(rline) > 0:
            # Lowercase the tokens and drop stopwords.
            lines += [token.lower() for token in tokens
                      if token.lower() not in cachedStopWords]
    if lines:
        # Flush the last document if the stream lacks a trailing delimiter.
        docs.append(lines)

    # Keep vocab_size - 4 ids; the remaining 4 are presumably reserved for
    # special tokens. max() guards against a vocab_size smaller than 4.
    limit = max(vocab_size - 4, 0)
    vocab = corpora.Dictionary(docs)
    vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)

    print(" [*] Tokenize : %.4fs" % (time.time() - t0))
    return vocab
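
# --- Usage sketch (illustrative, not from the original) ----------------
# Assumes a corpus file whose documents are separated by '##########'
# lines. The file name, tokenizer, and vocab_size below are hypothetical.
def whitespace_tokenizer(sentence):
    return sentence.split()

if __name__ == "__main__":
    with open("corpus.txt") as f:
        vocab = create_vocabulary(f, vocab_size=10000,
                                  sentence_to_tokens_fn=whitespace_tokenizer)
    print(len(vocab))        # number of kept ids (at most vocab_size - 4)
    print(vocab.token2id)    # gensim's token -> integer id mapping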