def randomsearch_xgboost(df):
    param_distributions = {'max_depth': sp.stats.randint(1, 11),
                           'subsample': sp.stats.uniform(0.25, 0.75),
                           'colsample_bytree': sp.stats.uniform(0.25, 0.75)
                           }
    xgb_model = XGBClassifier()
    rs = RandomizedSearchCV(xgb_model,
                            param_distributions,
                            cv=10,
                            n_iter=20,
                            scoring="log_loss",
                            n_jobs=1,
                            verbose=2)
    rs.fit(train_X, train_y.transpose()[0])
    predict = rs.predict_proba(test_X)
    return predict[:, 1]
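The snippet above relies on module-level imports and data that are not shown; a minimal sketch of the assumed context follows (the arrays are placeholders, and note that newer scikit-learn releases spell the scoring string 'neg_log_loss' rather than 'log_loss'):
import scipy as sp
import scipy.stats  # exposes sp.stats.randint / sp.stats.uniform used above
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV  # sklearn >= 0.18; older code imported from sklearn.grid_search

# Placeholder module-level data the function closes over (shapes are assumptions):
train_X = np.random.rand(100, 5)
train_y = np.random.randint(0, 2, size=(100, 1))  # (n, 1) so train_y.transpose()[0] is a 1-D label vector
test_X = np.random.rand(20, 5)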
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))
    features, labels, cv = model.getFeaturesLabel()
    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                                       scoring=mean_absolute_percentage_error_scoring, verbose=500, n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                                 fit_params=model.get_fit_params(),
                                 scoring=mean_absolute_percentage_error_scoring, verbose=500)
    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()
    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params()))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))
    return
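mean_absolute_percentage_error_scoring is project-specific and not shown in the listing; a plausible sketch follows, assuming it wraps MAPE with make_scorer (greater_is_better=False flips the sign, which is why the code logs -estimator.best_score_):
import numpy as np
from sklearn.metrics import make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    # MAPE in percent; assumes y_true contains no zeros
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0

mean_absolute_percentage_error_scoring = make_scorer(mean_absolute_percentage_error,
                                                     greater_is_better=False)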
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far supersede the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])
    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }
    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)
    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)
    # fit the grid
    grid.fit(X_train, y_train)
    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)
    # coverage:
    assert grid._estimator_type == 'classifier'
    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)
    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)
    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})
    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})
    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def random_search_cv(clf, param_distribution, n_iter_search, X_train, y_train):
    '''
    random search optimization with nested resampling
    @return: random search object
    '''
    rnd_search = RandomizedSearchCV(clf, param_distributions=param_distribution,
                                    n_iter=n_iter_search, pre_dispatch='2*n_jobs', n_jobs=4)
    rnd_search.fit(X_train, y_train)
    return rnd_search
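A usage example for the wrapper above (the dataset and estimator are only illustrative, not part of the original project):
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
param_distribution = {'n_estimators': randint(10, 100), 'max_depth': randint(2, 8)}
search = random_search_cv(RandomForestClassifier(), param_distribution,
                          n_iter_search=10, X_train=X, y_train=y)
print(search.best_params_)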
def rf_cv(fv_train, target_train, fv_test, target_test):
    ####---- cross-validation on the train dataset: random-search the best parameters for a random forest
    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto", "log2", None],
                        "class_weight": [None, 'balanced']}
    scores = ['recall_macro']
    n_iter_search = 20
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        mycv = StratifiedKFold(target_train, n_folds=5)
        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=mycv, n_iter=n_iter_search,
                                 scoring='%s' % score)
        clf.fit(fv_train, target_train)
        report_cv(clf, fv_test, target_test)
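report_cv is a helper that is not part of the listing; a minimal sketch under the assumption that it reports the tuned model's held-out performance:
from sklearn.metrics import classification_report

def report_cv(clf, fv_test, target_test):
    # clf is the fitted RandomizedSearchCV object from rf_cv above
    print("Best parameters found: %s" % clf.best_params_)
    print(classification_report(target_test, clf.predict(fv_test)))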
def train_classifier(self, trainvectors, labels, c='', kernel='', gamma='', degree='', class_weight='', iterations=10):
    if len(self.label_encoder.classes_) > 2: # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__kernel', 'estimator__gamma', 'estimator__degree']
        multi = True
    else: # only two classes to distinguish
        parameters = ['C', 'kernel', 'gamma', 'degree']
        multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    kernel_values = ['linear', 'rbf', 'poly'] if kernel == '' else [k for k in kernel.split()]
    gamma_values = [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048] if gamma == '' else [float(x) for x in gamma.split()]
    degree_values = [1, 2, 3, 4] if degree == '' else [int(x) for x in degree.split()]
    grid_values = [c_values, kernel_values, gamma_values, degree_values]
    if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
        if class_weight == '':
            class_weight = 'balanced'
    else:
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = svm.SVC(probability=True)
        if multi:
            model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train an SVC classifier with the settings that led to the best performance
    self.model = svm.SVC(
        probability=True,
        C=settings[parameters[0]],
        kernel=settings[parameters[1]],
        gamma=settings[parameters[2]],
        degree=settings[parameters[3]],
        class_weight=class_weight,
        cache_size=1000,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    #     trainvectors = trainvectors.todense()
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def train_classifier(self, trainvectors, labels, c='', solver='', dual='', penalty='', multiclass='', max_iterations=1000, iterations=10):
    if len(self.label_encoder.classes_) > 2: # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__solver', 'estimator__penalty', 'estimator__dual', 'estimator__multi_class']
        # multi = True
    else: # only two classes to distinguish
        parameters = ['C', 'solver', 'penalty', 'dual', 'multi_class']
        # multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag'] if solver == '' else [s for s in solver.split()]
    if penalty == '':
        if not set(['newton-cg', 'lbfgs', 'sag']) & set(solver_values):
            penalty_values = ['l1', 'l2']
        else:
            penalty_values = ['l2']
    else:
        penalty_values = [penalty]
    if dual == '':
        if len(solver_values) == 1 and solver_values[0] == 'liblinear':
            if len(penalty_values) == 1 and penalty_values[0] == 'l2':
                dual_values = [True, False]
            else:
                dual_values = [False]
    else:
        dual_values = [int(dual)] # 1 or 0
    if multiclass == '':
        if 'liblinear' not in solver_values:
            multiclass_values = ['ovr', 'multinomial']
        else:
            multiclass_values = ['ovr']
    else:
        multiclass_values = [multiclass]
    grid_values = [c_values, solver_values, penalty_values, dual_values, multiclass_values]
    max_iterations = int(max_iterations)
    if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
    else: # try different parameter combinations
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = LogisticRegression(max_iter=max_iterations)
        # if multi:
        #     model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train a logistic regression classifier with the settings that led to the best performance
    self.model = LogisticRegression(
        C=settings[parameters[0]],
        solver=settings[parameters[1]],
        penalty=settings[parameters[2]],
        dual=settings[parameters[3]],
        multi_class=settings[parameters[4]],
        max_iter=max_iterations,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
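Both train_classifier methods above assume they are attached to a class that already carries a fitted label encoder; a minimal sketch of that surrounding context (the class name is an assumption):
from sklearn import svm                              # used by the SVC variant
from sklearn.linear_model import LogisticRegression  # used by the logistic regression variant
from sklearn.multiclass import OutputCodeClassifier
from sklearn.preprocessing import LabelEncoder

class SKClassifier:
    def __init__(self, labels):
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        self.model = None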
def search_best_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)
    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])
    ############# initialize the search
    parameters = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))
    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)
    ############# search
    print "#################### search cv begins"
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print "#################### search cv ends"
    print "best {}: {}".format(scoring_method, searchcv.best_score_)
    print "best parameters: ", searchcv.best_params_
    ############# check the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", bestpipeline)
    rf = bestpipeline.steps[-1][1]
    print "RF's OOB score: {}".format(rf.oob_score_)
    # words = bestpipeline.steps[0][1].get_feature_names()
    # feat_importances = zip(words, rf.feature_importances_)
    # feat_importances.sort(key=lambda t: -t[1])
    # print feat_importances
    ############# training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)
    ############# test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
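make_train_validate_split is not shown in the listing; a sketch of what it plausibly returns, assuming a single predefined validation fold (PredefinedSplit treats -1 as "always train" and 0 as "validation fold 0"):
import numpy as np

def make_train_validate_split(n_samples, validate_fraction=0.2, seed=42):
    test_fold = np.full(n_samples, -1, dtype=int)   # -1: always kept in the training set
    rng = np.random.RandomState(seed)
    validate_idx = rng.choice(n_samples, size=int(n_samples * validate_fraction), replace=False)
    test_fold[validate_idx] = 0                     # 0: member of the single validation fold
    return test_fold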
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    # split our data into training and test datasets
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.33, random_state=8)
    classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1)
    # for simplicity's sake, we could train a single random forest:
    # classifier.fit(xTrain, yTrain)
    # print classifier.score(xTest, yTest)
    # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV
    parametersToTry = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }
    # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient
    # and comprehensive than GridSearchCV: run on all cores, fail gracefully if a combination of
    # hyperparameters fails to converge, try 10 different combinations of hyperparameters, train on all
    # the training data when finished, and use a third of the dataset for cross-validation while
    # searching for the best hyperparameters
    searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0, n_iter=10, refit=True, cv=3)
    print 'shape of this training data set:'
    print xTrain.shape
    searchCV.fit(xTrain, yTrain)
    print 'the best hyperparameters from this search are:'
    print searchCV.best_params_
    print 'best score from hyperparameter search is: ' + str(searchCV.best_score_)
    print 'score on the holdout portion of the training set: ' + str(searchCV.score(xTest, yTest))
    print 'score on the ensemble data: ' + str(searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n'
    testPredictions = searchCV.predict_proba(testTweetsAll)
    ensemblePredictions = searchCV.predict_proba(ensembleTweets)
    def singlePrediction(predictions):
        cleanedPredictions = []
        for predictionRow in predictions:
            cleanedPredictions.append(predictionRow[1])
        return cleanedPredictions
    # the classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're
    # mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case)
    testPredictions = singlePrediction(testPredictions)
    ensemblePredictions = singlePrediction(ensemblePredictions)
    return testPredictions, ensemblePredictions
def tune(insights, x_train, y_train, x_test, y_test, models='all', requirements=None, maximize=False):
    if requirements is None:
        requirements = requirements_bare_minimum(y_train)
    # do vanilla models satisfy the requirements?
    # assuming decision tree is the most intuitive, then logistic regression and then random forest
    # TODO: extend this to metrics other than accuracy using the confusion matrix
    for model_name in ['dt', 'lr', 'rf']:
        model_insights = insights[model_name]
        model_variation = np.std(model_insights['accuracy_folds'])
        if check_requirements(model_insights, requirements) and not maximize:
            pass
            # TODO: turn this back on
            # return model_name
    # model selection and tuning loop
    models_to_train = []
    if models == 'all':
        models_to_train += models_linear + models_nonlinear_cheap + models_nonlinear_expensive
    elif models == 'linear':
        models_to_train += models_online
    elif models == 'cheap':
        models_to_train += models_linear + models_nonlinear_cheap
    # TODO: using all of the training data, need to use less data if runtime for insights models is large (how large?)
    for model in models_to_train:
        # TODO: add the looping logic
        if model == LogisticRegression:
            # product of the number of candidate values per hyperparameter
            number_configurations = np.prod(np.array([len(v) for v in hyperparameters[model].values()]))
            random_search_iterations = np.min([random_search_iterations_max, number_configurations])
            random_search = RandomizedSearchCV(model(n_jobs=-1, random_state=random_state),
                                               param_distributions=hyperparameters[model], n_iter=random_search_iterations, n_jobs=-1, random_state=0)
            runtime = time()
            random_search.fit(x_train, y_train)
            runtime = time() - runtime
            info = dict()
            info['runtime'] = runtime
            # info['accuracy'] = min(scores)
            # info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
            # info['accuracy_folds'] = scores
            # info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
            # clf.fit(x_train, y_train)
            # fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
            # info['fpr'] = fpr
            # info['tpr'] = tpr
            # info['auc'] = auc(fpr, tpr)
            return random_search
    return None
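tune() references several module-level names defined elsewhere in its project; a minimal sketch of a few plausible definitions (every value below is an assumption):
from time import time
import numpy as np
from sklearn.linear_model import LogisticRegression

random_state = 0
random_search_iterations_max = 30
models_linear = [LogisticRegression]
models_nonlinear_cheap = []
models_nonlinear_expensive = []
models_online = [LogisticRegression]
hyperparameters = {
    LogisticRegression: {'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'penalty': ['l2']},
}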
def generate_model(data, classes, args):
    # Define the parameters
    tuned_parameters = {'C': C_RANGE,
                        'class_weight': CLASS_WEIGHTS}
    # Define the classifier
    if args.kernel == 'rbf':
        clf = svm.SVC(cache_size=CACHE_SIZE)
        tuned_parameters['gamma'] = GAMMA_RANGE
    else:
        clf = svm.LinearSVC(dual=False)
    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)
    # Generate the K-fold development split
    skf = cross_validation.StratifiedKFold(classes, n_folds=K_FOLD, shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)
    # Generate the grid search
    if args.search == 'grid':
        gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                        n_jobs=1, verbose=get_verbose_level())
    else:
        gscv = grid_search.RandomizedSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                              n_jobs=1, verbose=get_verbose_level(), n_iter=args.iter)
    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)
    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose("%0.6f (+/-%0.06f) for %r"
                      % (mean_score, scores.std() / 2, params), 5)
    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)
    return gscv
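The constants and logging helpers used above come from the surrounding project; a sketch of assumed definitions (all values are illustrative only):
import numpy as np

C_RANGE = np.logspace(-2, 3, 6)          # candidate C values
GAMMA_RANGE = np.logspace(-4, 1, 6)      # candidate gamma values for the RBF kernel
CLASS_WEIGHTS = [None, 'balanced']
K_FOLD = 5
CACHE_SIZE = 1000                        # kernel cache for svm.SVC, in MB

def get_verbose_level():
    return 1

def print_verbose(message, level):
    # print only if the message's level does not exceed the configured verbosity
    if level <= get_verbose_level():
        print(message)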