crf_entity_extractor.py 文件源码-python代码片段

crf_entity_extractor.py 文件源码

python

阅读 27 收藏 0 点赞 0 评论 0

def _from_json_to_crf(self, message, entity_offsets):
        # type: (Message, List[Tuple[int, int, Text]]) -> List[Tuple[Text, Text, Text, Text]]
        """Takes the json examples and switches them to a format which crfsuite likes."""
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
        if '-' in ents:
            logger.warn("Misaligned entity annotation in sentence '{}'. ".format(doc.text) +
                        "Make sure the start and end values of the annotated training " +
                        "examples end at token boundaries (e.g. don't include trailing whitespaces).")
        if not self.BILOU_flag:
            for i, entity in enumerate(ents):
                if entity.startswith('B-') or \
                        entity.startswith('I-') or \
                        entity.startswith('U-') or \
                        entity.startswith('L-'):
                    ents[i] = entity[2:]        # removes the BILOU tags

        return self._from_text_to_crf(message, ents)