def update_ner_pubtator(self):
    ''' Replace CoreNLP NER tags with PubTator NER classes where they match.

    For every sentence that has PubTator entity mentions, each CoreNLP
    token is compared against those mentions:

    1. A mention whose ``corenlp_offsets`` equal the token's
       ``characterOffsetBegin`` / ``characterOffsetEnd`` is an exact match.
       All mentions of the sentence are checked for an exact match before
       any fuzzy comparison, so an exact match can never be shadowed by
       an earlier mention that merely looks similar.
    2. Only if no mention matches exactly, and both ``fuzz`` and
       ``self.fuzzy_ner_match`` are truthy, the first mention whose
       lowercased ``token`` has a ``fuzz.ratio`` against the lowercased
       token text strictly greater than ``self.fuzzy_ner_match`` is used.

    On a match, the token's ``'ner'`` value is set to the mention's
    ``ner_type`` and the mention's ``matched_corenlp_token`` is set to the
    token's ``'index'``. Tokens and mentions are modified in place.

    Returns:
        ``self.pubtator_ner_updated``. It is set to True once the sentences
        have been processed; when ``self.pubtator`` is falsy nothing is
        processed and the flag is returned unchanged.
    '''
    if not self.pubtator:
        return self.pubtator_ner_updated

    for sent in self.sentences:
        # Look the sentence's PubTator mentions up once; skip sentences
        # that have none.
        biothings = self.pubtator.sentence_ner[sent['index']]
        if not biothings:
            continue

        for t in sent['tokens']:
            match = None

            # Pass 1: exact character-offset match against every mention.
            for biothing in biothings:
                start, end = biothing.corenlp_offsets
                if t['characterOffsetBegin'] == start and t['characterOffsetEnd'] == end:
                    match = biothing
                    break

            # Pass 2: fuzzy text match, only as a fallback and only when
            # fuzzy matching is configured and the fuzz module is available.
            if match is None and self.fuzzy_ner_match and fuzz:
                token_text = t['originalText'].lower()
                for biothing in biothings:
                    if fuzz.ratio(token_text, biothing.token.lower()) > self.fuzzy_ner_match:
                        match = biothing
                        break

            if match is not None:
                # update CoreNLP NER with PubTator NER
                match.matched_corenlp_token = t['index']
                t['ner'] = match.ner_type

    self.pubtator_ner_updated = True
    return self.pubtator_ner_updated
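# Page-navigation text left over from the web page this code was copied from;
# not part of the program (original: "评论列表" = "comment list", "文章目录" = "table of contents").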