convert.py 文件源码-python代码片段

convert.py 文件源码
python
阅读 37 收藏 0 点赞 0 评论 0
def to_text(passage, sentences=True, *args, **kwargs):
    """Converts from a Passage object to tokenized strings.

    :param passage: the Passage object to convert
    :param sentences: whether to break the Passage to sentences (one for string)
                      or leave as one string. Defaults to True

    :return a list of strings - 1 if sentences=False, # of sentences otherwise
    """
    del args, kwargs
    tokens = [x.text for x in sorted(passage.layer(layer0.LAYER_ID).all,
                                     key=operator.attrgetter('position'))]
    # break2sentences return the positions of the end tokens, which is
    # always the index into tokens incremented by ones (tokens index starts
    # with 0, positions with 1). So in essence, it returns the index to start
    # the next sentence from, and we should add index 0 for the first sentence
    if sentences:
        starts = [0] + textutil.break2sentences(passage)
    else:
        starts = [0, len(tokens)]
    return [' '.join(tokens[starts[i]:starts[i + 1]])
            for i in range(len(starts) - 1)]