phrases.py 文件源码-python代码片段

phrases.py 文件源码

python

阅读 27 收藏 0 点赞 0 评论 0

def build_phrase_models(content, base_path, settings):
    """ Build and save the phrase models
    """

    ngram_level = int(settings['level'])

    # According to tee() docs, this may be inefficient in terms of memory.
    # We need to do this because we need multiple passes through the
    # content stream.
    content = chain.from_iterable(doc.tokenized_text for doc in content)
    cs1, cs2 = tee(content, 2)

    for i in range(ngram_level-1):
        phrases = Phrases(cs1)
        path = "%s.%s" % (base_path, i + 2)     # save path as n-gram level
        logger.info("Phrase processor: Saving %s", path)
        phrases.save(path)
        # TODO: gensim complains about not using Phraser(phrases)
        content = phrases[cs2]  # tokenize phrases in content stream
        cs1, cs2 = tee(content, 2)