import functools
import logging

import nltk

import pdfutil  # assumed project-local package exposing pdfutil.pdfutil.MalformedPDF
import re_util  # assumed project-local helper exposing re_util.search_any

# Module-level logger assumed by the `logger=logger` default argument below.
logger = logging.getLogger(__name__)


def extract(text, paper=None, logger=logger):
    """Search `text` (or the paper's extracted text) for a data/text-mining
    statement and return the identifier it mentions, with provenance."""
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF:  # unreadable PDF: nothing to extract
            return None
    for sentence in nltk.sent_tokenize(text):
        # Only consider sentences mentioning data/text mining with a source/shared keyword.
        if search_any([r'data mine.*?source', r'text mine.*?shared'], sentence):
            # yapf: disable
            match = search_any([
                r"data mine.*?(\w*\d[\w\d/-]*)",
                r"text mine.*?(\w*\d[\w\d/-]*)"
            ], sentence)
            # yapf: enable
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            try:
                value_result = match.group(1).strip()
                return (value_text, value_result, source_type, source_detail)
            except AttributeError:  # search_any found no identifier, so match is None
                return None
    return None
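

# Minimal usage sketch (an assumption, not part of the original module): it
# presumes this module is importable and that NLTK's "punkt" sentence tokenizer
# data is available. The sample sentence and identifier are illustrative only.
if __name__ == "__main__":
    sample = ("We data mine the shared source dataset GSE12345 "
              "to identify candidate genes.")
    result = extract(sample)
    if result:
        value_text, value_result, source_type, source_detail = result
        print(f"found {value_result!r} ({source_type}, {source_detail})")
    else:
        print("no data/text-mining identifier found")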