def test_random_grid():
    """Smoke-test a randomized hyper-parameter search over a full pipeline.

    Builds a preprocessing + random-forest pipeline, samples two parameter
    settings with ``RandomizedSearchCV`` (2-fold CV, accuracy scoring),
    fits on the module-level ``X_train`` / ``y_train`` and finally passes the
    fitted search to ``report_grid_score_detail`` with charts disabled.

    The test makes no assertions; it passes as long as nothing raises.
    """
    # Pipeline stages, in execution order. Trailing notes are the original
    # author's expectations for this particular data set.
    steps = [
        ('retainer', FeatureRetainer()),  # expected to retain every feature
        ('dropper', FeatureDropper()),  # expected to drop nothing
        ('mapper', FunctionMapper()),  # expected to pass data through
        # no object dtypes in the data, so this should pass through as well
        ('encoder', OneHotCategoricalEncoder()),
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # expected to pass data through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipeline = Pipeline(steps)

    # Distributions (or explicit candidate lists) to sample from, keyed by
    # ``<step name>__<parameter name>``.
    param_distributions = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # Only two sampled candidates: the goal is to check the search runs at
    # all, not to find a good model.
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=2,
        scoring='accuracy',
        cv=2,
        random_state=42,
    )
    random_search.fit(X_train, y_train)

    # Exercise the reporting helper without producing any charts.
    report_grid_score_detail(random_search, charts=False)
# Comment list
# Table of contents