def split_into_sentences(text):
    """Split *text* into a list of sentence strings.

    A candidate boundary is a word ending in ``.``, ``?`` or ``!``,
    optionally followed by closing quotes/brackets, then whitespace that is
    not followed by a lowercase ASCII letter or a dash. A candidate is kept
    as a real boundary only when ``is_sentence_ender`` (defined elsewhere)
    returns a truthy value for the punctuated word.

    Args:
        text: The string to split.

    Returns:
        The sentences in their original order, each stripped of surrounding
        whitespace. Empty pieces are omitted, so empty or whitespace-only
        input yields ``[]`` and trailing whitespace after the final sentence
        does not produce a trailing ``""`` entry.
    """
    potential_end_pat = re.compile(
        r"".join([
            r"([\w\.'’&\]\)]+[\.\?!])",  # A word that ends with punctuation
            r"([‘’“”'\"\)\]]*)",  # Followed by optional quote/parens/etc
            r"(\s+(?![a-z\-–—]))",  # Followed by whitespace + non-(lowercase or dash)
        ]),
        re.U,
    )

    # Group 2 always participates in a match (it may be empty), so its end
    # offset is the position just past the punctuation and any closing
    # quotes/brackets -- i.e. where the sentence stops.
    end_indices = [
        match.end(2)
        for match in potential_end_pat.finditer(text)
        if is_sentence_ender(match.group(1))
    ]

    # Pair each boundary with the next one to get (start, stop) slices that
    # cover the whole text.
    starts = [0] + end_indices
    stops = end_indices + [len(text)]
    pieces = (text[start:stop].strip() for start, stop in zip(starts, stops))

    # The pattern also matches when the whitespace runs to the end of the
    # text, which leaves an empty final piece; drop such empties.
    return [piece for piece in pieces if piece]
评论列表
文章目录