test_pipe.py 文件源码-python代码片段

def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer',       FeatureRetainer()),  # will retain all
        ('dropper',        FeatureDropper()),  # won't drop any
        ('mapper',         FunctionMapper()),  # pass through
        ('encoder',        OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity',   MulticollinearityFilterer(threshold=0.85)),
        ('imputer',        SelectiveImputer()),  # pass through
        ('scaler',         SelectiveScaler()),
        ('boxcox',         BoxCoxTransformer()),
        ('nzv',            NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',            SelectivePCA(n_components=0.9)),
        ('model',          RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold':    uniform(loc=.8, scale=.15),
        'collinearity__method':       ['pearson', 'kendall', 'spearman'],
        'scaler__scaler':             [StandardScaler(), RobustScaler()],
        'pca__n_components':          uniform(loc=.75, scale=.2),
        'pca__whiten':                [True, False],
        'model__n_estimators':        randint(5, 10),
        'model__max_depth':           randint(2, 5),
        'model__min_samples_leaf':    randint(1, 5),
        'model__max_features':        uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes':      randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)