def ie_preprocess(self, document):
    """Split raw text into a flat list of sentences.

    The text is first split on newlines and asterisks (bullet markers),
    then each fragment is run through NLTK's sentence tokenizer.

    Args:
        document: Raw input text as a single string.

    Returns:
        list[str]: All detected sentences, in document order. Empty
        fragments contribute nothing, so the result may be empty for
        blank input.
    """
    # Expand the abbreviation so the trailing "." in "e.g." is not
    # mistaken for a sentence boundary by the tokenizer.
    document = document.replace("e.g.", "exempli gratia")
    # Raw string: '\*' in a plain string is an invalid escape sequence.
    fragments = re.split(r"\n|\*", document)
    # sent_tokenize returns a (possibly empty) list per fragment, so a
    # single extend replaces the original 0/1/N branching — the bare
    # `next` statement in the old code was a no-op, not `continue`.
    sentences = []
    for fragment in fragments:
        sentences.extend(nltk.sent_tokenize(fragment))
    return sentences
# 评论列表 (comments list) — page-scrape artifact, not code
# 文章目录 (article table of contents) — page-scrape artifact, not code