def getBOW(token_pattern=token_pattern,
           max_df=bow__max_df,
           min_df=bow__min_df,
           ngram_range=(1, 1),
           vocabulary=None,
           stop_words='english'):
    """Construct and return a bag-of-words ``CountVectorizer``.

    The vectorizer is only instantiated here; nothing is fitted.
    NOTE(review): ``CountVectorizer`` is presumably scikit-learn's class;
    confirm against the file's imports.

    Args:
        token_pattern: Forwarded as ``token_pattern``; defaults to the
            module-level ``token_pattern``.
        max_df: Forwarded as ``max_df``; defaults to the module-level
            ``bow__max_df``.
        min_df: Forwarded as ``min_df``; defaults to the module-level
            ``bow__min_df``.
        ngram_range: Forwarded as ``ngram_range``; defaults to ``(1, 1)``.
        vocabulary: Forwarded as ``vocabulary``; defaults to ``None``.
        stop_words: Forwarded as ``stop_words``; defaults to ``'english'``.

    Returns:
        The newly created ``CountVectorizer`` instance.
    """
    vectorizer_options = {
        # Settings fixed by this factory; callers cannot override them.
        'max_features': None,
        'strip_accents': 'unicode',
        'analyzer': 'word',
        # Settings passed straight through from the arguments.
        'min_df': min_df,
        'max_df': max_df,
        'token_pattern': token_pattern,
        'ngram_range': ngram_range,
        'stop_words': stop_words,
        'vocabulary': vocabulary,
    }
    return CountVectorizer(**vectorizer_options)
########################################################
# ------------------------------
# Simple text cleaning using either
#
#   - a replacement dict
#
# or
#
#   - a WordReplacer object
# ------------------------------
# (web page residue) Comment list
# (web page residue) Article table of contents