def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    """Run every document in *corpus* through the normalization pipeline.

    Each document is processed in this fixed order:

    1. ``html_parser.unescape``
    2. ``expand_contractions`` with ``CONTRACTION_MAP``
    3. ``lemmatize_text`` when *lemmatize* is true, otherwise ``str.lower``
    4. ``remove_special_characters``
    5. ``remove_stopwords``
    6. ``keep_text_characters`` (only when *only_text_chars* is true)
    7. ``tokenize_text`` (only when *tokenize* is true)

    All helpers and ``CONTRACTION_MAP`` are defined elsewhere in this
    module; their exact behaviour is not visible from this function.

    Args:
        corpus: Iterable of documents to normalize.
        lemmatize: If true, pass each document through ``lemmatize_text``;
            if false, lowercase it with ``text.lower()`` instead.
        only_text_chars: If true, additionally apply
            ``keep_text_characters``.
        tokenize: If true, each output entry is the return value of
            ``tokenize_text`` rather than the processed text itself.

    Returns:
        A new list with one entry per input document, in input order.
        An empty corpus yields an empty list.
    """
    normalized_corpus = []
    for text in corpus:
        # NOTE(review): if html_parser is an HTMLParser instance, its
        # unescape() method was removed in Python 3.9 (html.unescape is the
        # replacement) -- confirm against the module-level definition.
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)

        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()

        text = remove_special_characters(text)
        text = remove_stopwords(text)

        if only_text_chars:
            text = keep_text_characters(text)

        if tokenize:
            text = tokenize_text(text)

        # Single append: the original duplicated this call in both arms of
        # the tokenize branch.
        normalized_corpus.append(text)

    return normalized_corpus
normalization.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录