def __init__(self, entity_list):
    """
    entity_list example: [{"@id":"1","name":"??"},{"@id":"2","name":"??"}]

    All input text is assumed to be (or will be converted into) unicode.
    """
    # Build the entity index: map every name and alternateName to the
    # entities that carry it. any2unicode and stat are helpers from the
    # surrounding module (not shown here).
    self.entities = collections.defaultdict(list)
    entity_list_unicode = []
    for entity in entity_list:
        entity_list_unicode.append(any2unicode(entity))
    for entity in entity_list_unicode:
        name = entity["name"]
        self.entities[name].append(entity)
    for entity in entity_list_unicode:
        for name in entity.get("alternateName", []):
            self.entities[name].append(entity)
    stat(entity_list_unicode, ["name"])
    # Init jieba with a dedicated Tokenizer and register every indexed name,
    # so entity names stay whole tokens during segmentation.
    self.tokenizer = jieba.Tokenizer()
    for name in self.entities:
        self.tokenizer.add_word(name)
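A minimal usage sketch (the constructor above is assumed to belong to an entity-index class, called EntityIndex here purely for illustration): segment a text with the customized tokenizer and look each token up in the name index.

# EntityIndex is a hypothetical name for the class owning the __init__ above
index = EntityIndex([
    {"@id": "1", "name": u"北京", "alternateName": [u"北京市"]},
    {"@id": "2", "name": u"上海"},
])
text = u"我住在北京市"
# every token that is a registered name (or alias) maps back to its entities
matches = [index.entities[tok]
           for tok in index.tokenizer.cut(text)
           if tok in index.entities]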
Example source code for the Python Tokenizer() class
def __init__(self, tokenizer=None):
    # reuse the caller's tokenizer, or fall back to a fresh default jieba.Tokenizer
    self.tokenizer = tokenizer or jieba.Tokenizer()
    self.load_word_tag(self.tokenizer.get_dict_file())

def lcut(self, *args, **kwargs):
    # list-returning variant of cut()
    return list(self.cut(*args, **kwargs))

# default Tokenizer instance
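These two methods match the public surface of jieba.posseg's POSTokenizer (part-of-speech segmentation), and the trailing "# default Tokenizer instance" comment is where jieba's own source creates the shared module-level instance. A small usage sketch through the module-level wrappers, assuming that is indeed the class being excerpted:

import jieba.posseg as pseg

# lcut returns a list of pair objects carrying the token and its POS flag
for p in pseg.lcut(u"我来到北京清华大学"):
    print("%s %s" % (p.word, p.flag))   # e.g. 北京 ns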