import re  # standard-library regex module used for sentence splitting

# punct_re, Tok, Sentence, and Doc are assumed to be module-level helpers
# defined elsewhere in this file (a punctuation regex and three light
# container classes for tokens, sentences, and whole documents).

def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # Split on CJK sentence-ending punctuation; the capturing group keeps
        # the delimiters so they can be glued back onto each sentence below.
        sent_splits = iter(re.split(r'(。|！|？|」)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            # Re-attach the trailing punctuation captured by the split group.
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
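For context, here is a minimal, hypothetical usage sketch rather than part of the original code. It assumes jieba is installed and uses jieba.cut as the tokenizer, as the commented-out line inside the loop hints, and it assumes the surrounding module provides punct_re, Tok, Sentence, and Doc. With entity_type and tag_type left as None, every token gets empty ent_type and tag values.

import jieba

text = '我来到北京。清华大学很有名！'
# Returns a Doc built from Sentence objects, each holding the Tok entries
# produced by jieba's word segmentation of that sentence.
parsed = _asian_tokenization(text, entity_type=None, tag_type=None, tokenizer=jieba.cut)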