def _from_json_to_crf(self, message, entity_offsets):
    # type: (Message, List[Tuple[int, int, Text]]) -> List[Tuple[Text, Text, Text, Text]]
    """Convert a JSON training example into the format crfsuite expects.

    The message's ``"spacy_doc"`` entry is combined with ``entity_offsets``
    through spaCy's ``GoldParse`` to obtain one entity tag per annotation
    entry. Those tags, optionally stripped of their BILOU prefix, are handed
    to ``_from_text_to_crf`` together with the message.

    :param message: training example; its ``"spacy_doc"`` entry is read.
    :param entity_offsets: entity annotations, passed to ``GoldParse`` as
        its ``entities`` argument.
    :return: the result of ``self._from_text_to_crf(message, tags)``.
    """
    # Function-scope import: spaCy is only imported when this method runs.
    from spacy.gold import GoldParse

    doc = message.get("spacy_doc")
    gold = GoldParse(doc, entities=entity_offsets)
    # Index 5 of every ``orig_annot`` entry is used as the entity tag.
    ents = [annotation[5] for annotation in gold.orig_annot]

    # A bare '-' tag is reported as an annotation that does not line up
    # with the token boundaries.
    if '-' in ents:
        # ``warning`` replaces the deprecated ``warn`` alias; the sentence is
        # passed as a lazy %-argument, the rendered text is unchanged.
        logger.warning(
            "Misaligned entity annotation in sentence '%s'. "
            "Make sure the start and end values of the annotated training "
            "examples end at token boundaries (e.g. don't include trailing whitespaces).",
            doc.text)

    if not self.BILOU_flag:
        # Drop the two-character BILOU prefix; tags without one (including
        # the '-' marker) are kept as they are.
        bilou_prefixes = ('B-', 'I-', 'U-', 'L-')
        ents = [tag[2:] if tag.startswith(bilou_prefixes) else tag
                for tag in ents]

    return self._from_text_to_crf(message, ents)
评论列表
文章目录