def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
# First, restrict our DictVectorizer or DataFrameVectorizer
# This goes through and has DV only output the items that have passed our support mask
# This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
# It also significantly reduces the size of dv.vocabulary_ which can get quite large
dv = transformation_pipeline.named_steps['dv']
try:
feature_selection = transformation_pipeline.named_steps['feature_selection']
feature_selection_mask = feature_selection.support_mask
dv.restrict(feature_selection_mask)
except KeyError:
pass
# We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
# In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
return trained_pipeline_without_feature_selection
评论列表
文章目录