normalization.py 文件源码-python代码片段

normalization.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

项目：text-analytics-with-python 作者: dipanjanS 项目源码文件源码

def normalize_corpus(corpus, lemmatize=True, 
                     only_text_chars=False,
                     tokenize=False):

    normalized_corpus = []    
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)

        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)

    return normalized_corpus