def word_bigrams():
    """Build the named word-bigram feature pipeline.

    Returns:
        A ``(name, pipeline)`` tuple: ``name`` is the string
        ``'word_bigrams'`` and ``pipeline`` is a ``Pipeline`` with three
        steps -- ``'vect'`` (count vectorizer restricted to 2-grams),
        ``'tfidf'`` (TF-IDF weighting with ``sublinear_tf=True``) and
        ``'scale'`` (a default-configured ``Normalizer``).
    """
    # Cleaning switches forwarded verbatim to TextCleaner; the resulting
    # object is handed to the vectorizer as its ``preprocessor``.
    cleaner_options = {
        'lowercase': True,
        'filter_urls': True,
        'filter_mentions': True,
        'filter_hashtags': True,
        'alphabetic': True,
        'strip_accents': True,
        'filter_rt': True,
    }
    cleaner = TextCleaner(**cleaner_options)

    # ngram_range=(2, 2) keeps only 2-grams (no unigrams, no trigrams).
    bigram_counter = CountVectorizer(preprocessor=cleaner, ngram_range=(2, 2))
    tfidf_weighting = TfidfTransformer(sublinear_tf=True)
    normalizer = Normalizer()

    steps = [
        ('vect', bigram_counter),
        ('tfidf', tfidf_weighting),
        ('scale', normalizer),
    ]
    return 'word_bigrams', Pipeline(steps)
评论列表
文章目录