import re  # standard-library regex module used for sentence splitting

# punct_re, Tok, Sentence, and Doc are assumed to be module-level helpers
# defined elsewhere in this file (a punctuation regex and three light
# container classes for tokens, sentences, and whole documents).

def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # Split on CJK sentence-ending punctuation; the capturing group keeps
        # the delimiters so they can be glued back onto each sentence below.
        sent_splits = iter(re.split(r'(。|！|？|」)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            # Re-attach the trailing punctuation captured by the split group.
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
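For context, here is a minimal, hypothetical usage sketch rather than part of the original code. It assumes jieba is installed and uses jieba.cut as the tokenizer, as the commented-out line inside the loop hints, and it assumes the surrounding module provides punct_re, Tok, Sentence, and Doc. With entity_type and tag_type left as None, every token gets empty ent_type and tag values.

import jieba

text = '我来到北京。清华大学很有名！'
# Returns a Doc built from Sentence objects, each holding the Tok entries
# produced by jieba's word segmentation of that sentence.
parsed = _asian_tokenization(text, entity_type=None, tag_type=None, tokenizer=jieba.cut)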