def to_text(passage, sentences=True, *args, **kwargs):
    """Render a Passage as a list of space-joined token strings.

    The units of the layer identified by layer0.LAYER_ID are ordered by their
    ``position`` attribute, and their ``text`` values are joined with single
    spaces.

    :param passage: the Passage object to convert
    :param sentences: if true (the default), emit one string per sentence,
                      using the boundaries returned by
                      textutil.break2sentences; otherwise emit a single
                      string containing every token
    :param args: extra positional arguments; accepted and ignored
    :param kwargs: extra keyword arguments; accepted and ignored
    :return: a list of strings - exactly one if sentences is false, otherwise
             one per boundary returned by break2sentences
    """
    del args, kwargs  # deliberately discarded, never read
    ordered_units = sorted(passage.layer(layer0.LAYER_ID).all,
                           key=operator.attrgetter('position'))
    words = [unit.text for unit in ordered_units]
    if not sentences:
        # A single slice spanning the whole token list.
        boundaries = [0, len(words)]
    else:
        # break2sentences reports the position of each sentence-final token.
        # Positions are 1-based while list indices are 0-based, so every
        # reported position is also the index at which the next sentence
        # begins; prepending 0 supplies the start of the first sentence.
        boundaries = [0] + textutil.break2sentences(passage)
    # Each pair of consecutive boundaries delimits one output string.
    return [' '.join(words[begin:end])
            for begin, end in zip(boundaries, boundaries[1:])]
评论列表
文章目录