def construct_pipeline(classifier, *, n_components=1000, stats_weight=0.15,
                       bow_weight=0.85):
    """
    Build a feature-extraction pipeline that ends in ``classifier``.

    Two feature branches are combined in a weighted ``FeatureUnion`` and the
    supplied estimator is appended as the final step. The original author
    describes the input as data coming from a CorpusLoader; that loader is
    not visible here, so confirm the expected input against the caller.

    Parameters
    ----------
    classifier :
        Estimator placed at the end of the pipeline under the step name
        ``'classifier'``.
    n_components : int, keyword-only, default 1000
        Passed to ``TruncatedSVD(n_components=...)`` in the bag-of-words
        branch. NOTE(review): recent scikit-learn versions reject a value
        larger than the number of TF-IDF features, so lower this for small
        corpora.
    stats_weight : float, keyword-only, default 0.15
        ``FeatureUnion`` weight applied to the ``'stats'`` branch.
    bow_weight : float, keyword-only, default 0.85
        ``FeatureUnion`` weight applied to the ``'bow'`` branch.

    Returns
    -------
    Pipeline
        A newly constructed pipeline with steps ``'union'`` and
        ``'classifier'``. Nothing is fitted by this function.
    """
    # Branch 1: document structure features. TextStats is a project
    # transformer; presumably it yields one dict per document, which is why
    # its output is fed to DictVectorizer -- verify against TextStats.
    stats_branch = Pipeline([
        ('stats', TextStats()),
        ('vect', DictVectorizer()),
    ])

    # Branch 2: bag-of-words TF-IDF vector reduced with truncated SVD.
    # Tokenizing, preprocessing and lowercasing are switched off in the
    # vectorizer (identity tokenizer, no preprocessor, lowercase=False),
    # presumably because TextNormalizer already emits token lists.
    bow_branch = Pipeline([
        ('tokens', TextNormalizer()),
        ('tfidf', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        )),
        ('best', TruncatedSVD(n_components=n_components)),
    ])

    # Combine both branches; the weight keys must match the branch names
    # used in transformer_list.
    union = FeatureUnion(
        transformer_list=[
            ('stats', stats_branch),
            ('bow', bow_branch),
        ],
        transformer_weights={
            'stats': stats_weight,
            'bow': bow_weight,
        },
    )

    # Append the estimator to the end of the pipeline.
    return Pipeline([
        ('union', union),
        ('classifier', classifier),
    ])
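# NOTE: stray navigation text from the web page this snippet was copied from
# ("Comment list", "Table of contents"); not part of the code.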