def nltk_extract_claims(text):
    """
    Attempt to extract claims as a list from a large text string.

    The text is split with the nltk ``sent_tokenize`` function. The
    resulting list is read as alternating items: a claim-number item
    followed by the claim text item (this is how a test string was split).

    param string text: string containing several claims
    return: list of ``(number, claim_text)`` tuples, where ``number`` is the
        integer found before the first "." of the number item, or 0 when
        that prefix is not an integer
    """
    sent_list = sent_tokenize(text)
    claims_list = []
    # Pair even-indexed items (numbers) with odd-indexed items (texts).
    # zip stops at the shorter slice, so a trailing number item that has no
    # text after it is skipped instead of raising IndexError.
    for number_item, claim_text in zip(sent_list[::2], sent_list[1::2]):
        try:
            number = int(number_item.split(".")[0])
        except ValueError:
            # The item does not start with an integer; keep the claim text
            # but mark its number as unknown.
            number = 0
        claims_list.append((number, claim_text))
    return claims_list
# 评论列表 (comment list)
# 文章目录 (article table of contents)