# corenlp_parse.py -- Python source code snippet

def update_ner_pubtator(self):

        ''' Replace CoreNLP token NER labels with PubTator entity classes.

            For every sentence in self.sentences that has PubTator entity
            mentions in self.pubtator.sentence_ner, each token is compared
            against those mentions:

              1. Exact match: the token's characterOffsetBegin/End equal the
                 mention's corenlp_offsets.
              2. Fuzzy match (fallback): only tried when no mention aligns
                 exactly, self.fuzzy_ner_match is set and the module-level
                 `fuzz` name is truthy. The lower-cased token text must score
                 strictly above self.fuzzy_ner_match in fuzz.ratio().
                 (`fuzz` is presumably fuzzywuzzy's module -- confirm against
                 this file's imports.)

            Exact matches are checked against ALL mentions of the sentence
            before any fuzzy comparison, so a similar-looking mention listed
            earlier can no longer steal a token that another mention aligns
            with exactly.

            On a match the token's 'ner' value is overwritten with the
            mention's ner_type and the mention's matched_corenlp_token is set
            to the token's 'index'. At most one mention is applied per token.

            Returns self.pubtator_ner_updated, which is set to True whenever
            self.pubtator is truthy (even if no token matched); otherwise the
            attribute is returned unchanged.
        '''

        if self.pubtator:
            for sent in self.sentences:
                # PubTator mentions recorded for this sentence (looked up once)
                biothings = self.pubtator.sentence_ner[sent['index']]

                # are there any PubTator NER tags for this sentence?
                if not biothings:
                    continue

                for t in sent['tokens']:
                    matched = None

                    # pass 1: exact character-offset alignment always wins
                    for biothing in biothings:
                        start, end = biothing.corenlp_offsets
                        if t['characterOffsetBegin'] == start and t['characterOffsetEnd'] == end:
                            matched = biothing
                            break

                    # pass 2: fuzzy text similarity, only as a fallback.
                    # The threshold is tested before `fuzz` so the matcher is
                    # only consulted when fuzzy matching was requested.
                    if matched is None and self.fuzzy_ner_match and fuzz:
                        token_text = t['originalText'].lower()
                        for biothing in biothings:
                            if fuzz.ratio(token_text, biothing.token.lower()) > self.fuzzy_ner_match:
                                matched = biothing
                                break

                    if matched is not None:
                        # update CoreNLP NER with PubTator NER
                        matched.matched_corenlp_token = t['index']
                        t['ner'] = matched.ner_type
            self.pubtator_ner_updated = True

        return self.pubtator_ner_updated