def word_unigrams():
    """Build the named word-unigram feature pipeline.

    The pipeline chains three steps:

    * ``'vect'``  -- a ``CountVectorizer`` restricted to single-word
      n-grams (``ngram_range=(1, 1)``) with ``min_df=2``, using the stop
      words returned by ``get_stopwords()`` and a ``TextCleaner`` instance
      as its preprocessor.
    * ``'tfidf'`` -- a ``TfidfTransformer`` with ``sublinear_tf=True``.
    * ``'scale'`` -- a ``Normalizer`` with default settings.

    NOTE(review): the vectorizer, transformer, normalizer and pipeline
    classes look like scikit-learn's, and ``TextCleaner`` /
    ``get_stopwords`` look project-local; the imports are not visible
    here -- confirm against the top of the file.

    Returns:
        tuple: ``('word_unigrams', pipeline)`` -- the feature name paired
        with the freshly constructed (unfitted) pipeline.
    """
    # Every cleaning option is switched on explicitly; the exact meaning of
    # each flag is defined by TextCleaner itself.
    cleaner = TextCleaner(
        lowercase=True,
        filter_urls=True,
        filter_mentions=True,
        filter_hashtags=True,
        alphabetic=True,
        strip_accents=True,
        filter_rt=True,
    )

    unigram_counter = CountVectorizer(
        min_df=2,
        stop_words=get_stopwords(),
        preprocessor=cleaner,
        ngram_range=(1, 1),
    )

    steps = [
        ('vect', unigram_counter),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('scale', Normalizer()),
    ]
    return ('word_unigrams', Pipeline(steps))
评论列表
文章目录