def __init__(self, entity_list):
    """Build a name-to-entities index and a jieba tokenizer over its keys.

    ``entity_list`` is an iterable of entity records, for example
    ``[{"@id": "1", "name": "..."}, {"@id": "2", "name": "..."}]``.
    Each record is passed through ``any2unicode`` before being indexed.

    ``"name"`` is read by subscript, so every record has to provide it.
    ``"alternateName"`` is optional; when present it is iterated and each
    item becomes an additional lookup key for the same record.
    """
    # Entity index: lookup key -> list of records registered under it.
    self.entities = index = collections.defaultdict(list)
    normalized = [any2unicode(raw) for raw in entity_list]

    # Pass 1: primary names. This stays a separate pass so that, under any
    # given key, records matched by primary name come before records that
    # were matched through an alternate name.
    for record in normalized:
        index[record["name"]].append(record)

    # Pass 2: alternate names.
    # NOTE(review): presumably "alternateName" holds a list of strings; a
    # plain string here would be iterated character by character - confirm.
    for record in normalized:
        for alias in record.get("alternateName", []):
            index[alias].append(record)

    # ``stat`` is defined outside this block; it is handed the normalized
    # records together with the "name" field.
    stat(normalized, ["name"])

    # Dedicated jieba tokenizer, fed with every key of the index.
    self.tokenizer = tokenizer = jieba.Tokenizer()
    for key in index:
        tokenizer.add_word(key)
评论列表
文章目录