def randomsearch_xgboost(df):
    param_distributions = {'max_depth': sp.stats.randint(1, 11),
                           'subsample': sp.stats.uniform(0.25, 0.75),
                           'colsample_bytree': sp.stats.uniform(0.25, 0.75)
                           }
    xgb_model = XGBClassifier()
    rs = RandomizedSearchCV(xgb_model,
                            param_distributions,
                            cv=10,
                            n_iter=20,
                            scoring="log_loss",
                            n_jobs=1,
                            verbose=2)
    rs.fit(train_X, train_y.transpose()[0])
    predict = rs.predict_proba(test_X)
    return predict[:, 1]
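The snippet above relies on module-level imports and data that are not shown; a minimal sketch of the assumed context follows (the arrays are placeholders, and note that newer scikit-learn releases spell the scoring string 'neg_log_loss' rather than 'log_loss'):
import scipy as sp
import scipy.stats  # exposes sp.stats.randint / sp.stats.uniform used above
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV  # sklearn >= 0.18; older code imported from sklearn.grid_search

# Placeholder module-level data the function closes over (shapes are assumptions):
train_X = np.random.rand(100, 5)
train_y = np.random.randint(0, 2, size=(100, 1))  # (n, 1) so train_y.transpose()[0] is a 1-D label vector
test_X = np.random.rand(20, 5)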
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))
    features, labels, cv = model.getFeaturesLabel()
    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                                       scoring=mean_absolute_percentage_error_scoring, verbose=500, n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                                 fit_params=model.get_fit_params(),
                                 scoring=mean_absolute_percentage_error_scoring, verbose=500)
    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()
    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params()))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))
    return
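mean_absolute_percentage_error_scoring is project-specific and not shown in the listing; a plausible sketch follows, assuming it wraps MAPE with make_scorer (greater_is_better=False flips the sign, which is why the code logs -estimator.best_score_):
import numpy as np
from sklearn.metrics import make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    # MAPE in percent; assumes y_true contains no zeros
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0

mean_absolute_percentage_error_scoring = make_scorer(mean_absolute_percentage_error,
                                                     greater_is_better=False)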
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far supersede the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])
    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }
    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)
    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)
    # fit the grid
    grid.fit(X_train, y_train)
    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)
    # coverage:
    assert grid._estimator_type == 'classifier'
    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)
    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)
    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})
    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})
    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def random_search_cv(clf, param_distribution, n_iter_search, X_train, y_train):
    '''
    random search optimization with nested resampling
    @return: random search object
    '''
    rnd_search = RandomizedSearchCV(clf, param_distributions=param_distribution,
                                    n_iter=n_iter_search, pre_dispatch='2*n_jobs', n_jobs=4)
    rnd_search.fit(X_train, y_train)
    return rnd_search
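A usage example for the wrapper above (the dataset and estimator are only illustrative, not part of the original project):
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
param_distribution = {'n_estimators': randint(10, 100), 'max_depth': randint(2, 8)}
search = random_search_cv(RandomForestClassifier(), param_distribution,
                          n_iter_search=10, X_train=X, y_train=y)
print(search.best_params_)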
def rf_cv(fv_train, target_train, fv_test, target_test):
    ####---- cross-validation on the train dataset: random-search the best parameters for a random forest
    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto", "log2", None],
                        "class_weight": [None, 'balanced']}
    scores = ['recall_macro']
    n_iter_search = 20
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        mycv = StratifiedKFold(target_train, n_folds=5)
        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=mycv, n_iter=n_iter_search,
                                 scoring='%s' % score)
        clf.fit(fv_train, target_train)
        report_cv(clf, fv_test, target_test)
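report_cv is a helper that is not part of the listing; a minimal sketch under the assumption that it reports the tuned model's held-out performance:
from sklearn.metrics import classification_report

def report_cv(clf, fv_test, target_test):
    # clf is the fitted RandomizedSearchCV object from rf_cv above
    print("Best parameters found: %s" % clf.best_params_)
    print(classification_report(target_test, clf.predict(fv_test)))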
def train_classifier(self, trainvectors, labels, c='', kernel='', gamma='', degree='', class_weight='', iterations=10):
    if len(self.label_encoder.classes_) > 2: # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__kernel', 'estimator__gamma', 'estimator__degree']
        multi = True
    else: # only two classes to distinguish
        parameters = ['C', 'kernel', 'gamma', 'degree']
        multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    kernel_values = ['linear', 'rbf', 'poly'] if kernel == '' else [k for k in kernel.split()]
    gamma_values = [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048] if gamma == '' else [float(x) for x in gamma.split()]
    degree_values = [1, 2, 3, 4] if degree == '' else [int(x) for x in degree.split()]
    grid_values = [c_values, kernel_values, gamma_values, degree_values]
    if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
        if class_weight == '':
            class_weight = 'balanced'
    else:
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = svm.SVC(probability=True)
        if multi:
            model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train an SVC classifier with the settings that led to the best performance
    self.model = svm.SVC(
        probability=True,
        C=settings[parameters[0]],
        kernel=settings[parameters[1]],
        gamma=settings[parameters[2]],
        degree=settings[parameters[3]],
        class_weight=class_weight,
        cache_size=1000,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    #     trainvectors = trainvectors.todense()
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def train_classifier(self, trainvectors, labels, c='', solver='', dual='', penalty='', multiclass='', max_iterations=1000, iterations=10):
    if len(self.label_encoder.classes_) > 2: # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__solver', 'estimator__penalty', 'estimator__dual', 'estimator__multi_class']
        # multi = True
    else: # only two classes to distinguish
        parameters = ['C', 'solver', 'penalty', 'dual', 'multi_class']
        # multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag'] if solver == '' else [s for s in solver.split()]
    if penalty == '':
        if not set(['newton-cg', 'lbfgs', 'sag']) & set(solver_values):
            penalty_values = ['l1', 'l2']
        else:
            penalty_values = ['l2']
    else:
        penalty_values = [penalty]
    if dual == '':
        if len(solver_values) == 1 and solver_values[0] == 'liblinear':
            if len(penalty_values) == 1 and penalty_values[0] == 'l2':
                dual_values = [True, False]
            else:
                dual_values = [False]
    else:
        dual_values = [int(dual)] # 1 or 0
    if multiclass == '':
        if 'liblinear' not in solver_values:
            multiclass_values = ['ovr', 'multinomial']
        else:
            multiclass_values = ['ovr']
    else:
        multiclass_values = [multiclass]
    grid_values = [c_values, solver_values, penalty_values, dual_values, multiclass_values]
    max_iterations = int(max_iterations)
    if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
    else: # try different parameter combinations
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = LogisticRegression(max_iter=max_iterations)
        # if multi:
        #     model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train a logistic regression classifier with the settings that led to the best performance
    self.model = LogisticRegression(
        C=settings[parameters[0]],
        solver=settings[parameters[1]],
        penalty=settings[parameters[2]],
        dual=settings[parameters[3]],
        multi_class=settings[parameters[4]],
        max_iter=max_iterations,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
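Both train_classifier methods above assume they are attached to a class that already carries a fitted label encoder; a minimal sketch of that surrounding context (the class name is an assumption):
from sklearn import svm                              # used by the SVC variant
from sklearn.linear_model import LogisticRegression  # used by the logistic regression variant
from sklearn.multiclass import OutputCodeClassifier
from sklearn.preprocessing import LabelEncoder

class SKClassifier:
    def __init__(self, labels):
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        self.model = None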
def search_best_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)
    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])
    ############# initialize the search
    parameters = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))
    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)
    ############# search
    print "#################### search cv begins"
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print "#################### search cv ends"
    print "best {}: {}".format(scoring_method, searchcv.best_score_)
    print "best parameters: ", searchcv.best_params_
    ############# check the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", bestpipeline)
    rf = bestpipeline.steps[-1][1]
    print "RF's OOB score: {}".format(rf.oob_score_)
    # words = bestpipeline.steps[0][1].get_feature_names()
    # feat_importances = zip(words, rf.feature_importances_)
    # feat_importances.sort(key=lambda t: -t[1])
    # print feat_importances
    ############# training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)
    ############# test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
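make_train_validate_split is not shown in the listing; a sketch of what it plausibly returns, assuming a single predefined validation fold (PredefinedSplit treats -1 as "always train" and 0 as "validation fold 0"):
import numpy as np

def make_train_validate_split(n_samples, validate_fraction=0.2, seed=42):
    test_fold = np.full(n_samples, -1, dtype=int)   # -1: always kept in the training set
    rng = np.random.RandomState(seed)
    validate_idx = rng.choice(n_samples, size=int(n_samples * validate_fraction), replace=False)
    test_fold[validate_idx] = 0                     # 0: member of the single validation fold
    return test_fold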
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    # split our data into training and test datasets
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.33, random_state=8)
    classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1)
    # for simplicity's sake, we could train a single random forest:
    # classifier.fit(xTrain, yTrain)
    # print classifier.score(xTest, yTest)
    # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV
    parametersToTry = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }
    # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient
    # and comprehensive than GridSearchCV: run on all cores, fail gracefully if a combination of
    # hyperparameters fails to converge, try 10 different combinations of hyperparameters, train on all
    # the training data when finished, and use a third of the dataset for cross-validation while
    # searching for the best hyperparameters
    searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0, n_iter=10, refit=True, cv=3)
    print 'shape of this training data set:'
    print xTrain.shape
    searchCV.fit(xTrain, yTrain)
    print 'the best hyperparameters from this search are:'
    print searchCV.best_params_
    print 'best score from hyperparameter search is: ' + str(searchCV.best_score_)
    print 'score on the holdout portion of the training set: ' + str(searchCV.score(xTest, yTest))
    print 'score on the ensemble data: ' + str(searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n'
    testPredictions = searchCV.predict_proba(testTweetsAll)
    ensemblePredictions = searchCV.predict_proba(ensembleTweets)
    def singlePrediction(predictions):
        cleanedPredictions = []
        for predictionRow in predictions:
            cleanedPredictions.append(predictionRow[1])
        return cleanedPredictions
    # the classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're
    # mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case)
    testPredictions = singlePrediction(testPredictions)
    ensemblePredictions = singlePrediction(ensemblePredictions)
    return testPredictions, ensemblePredictions
def tune(insights, x_train, y_train, x_test, y_test, models='all', requirements=None, maximize=False):
    if requirements is None:
        requirements = requirements_bare_minimum(y_train)
    # do vanilla models satisfy the requirements?
    # assuming decision tree is the most intuitive, then logistic regression and then random forest
    # TODO: extend this to metrics other than accuracy using the confusion matrix
    for model_name in ['dt', 'lr', 'rf']:
        model_insights = insights[model_name]
        model_variation = np.std(model_insights['accuracy_folds'])
        if check_requirements(model_insights, requirements) and not maximize:
            pass
            # TODO: turn this back on
            # return model_name
    # model selection and tuning loop
    models_to_train = []
    if models == 'all':
        models_to_train += models_linear + models_nonlinear_cheap + models_nonlinear_expensive
    elif models == 'linear':
        models_to_train += models_online
    elif models == 'cheap':
        models_to_train += models_linear + models_nonlinear_cheap
    # TODO: using all of the training data, need to use less data if runtime for insights models is large (how large?)
    for model in models_to_train:
        # TODO: add the looping logic
        if model == LogisticRegression:
            # product of the number of candidate values per hyperparameter
            number_configurations = np.prod(np.array([len(v) for v in hyperparameters[model].values()]))
            random_search_iterations = np.min([random_search_iterations_max, number_configurations])
            random_search = RandomizedSearchCV(model(n_jobs=-1, random_state=random_state),
                                               param_distributions=hyperparameters[model], n_iter=random_search_iterations, n_jobs=-1, random_state=0)
            runtime = time()
            random_search.fit(x_train, y_train)
            runtime = time() - runtime
            info = dict()
            info['runtime'] = runtime
            # info['accuracy'] = min(scores)
            # info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
            # info['accuracy_folds'] = scores
            # info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
            # clf.fit(x_train, y_train)
            # fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
            # info['fpr'] = fpr
            # info['tpr'] = tpr
            # info['auc'] = auc(fpr, tpr)
            return random_search
    return None
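tune() references several module-level names defined elsewhere in its project; a minimal sketch of a few plausible definitions (every value below is an assumption):
from time import time
import numpy as np
from sklearn.linear_model import LogisticRegression

random_state = 0
random_search_iterations_max = 30
models_linear = [LogisticRegression]
models_nonlinear_cheap = []
models_nonlinear_expensive = []
models_online = [LogisticRegression]
hyperparameters = {
    LogisticRegression: {'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'penalty': ['l2']},
}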
def generate_model(data, classes, args):
    # Define the parameters
    tuned_parameters = {'C': C_RANGE,
                        'class_weight': CLASS_WEIGHTS}
    # Define the classifier
    if args.kernel == 'rbf':
        clf = svm.SVC(cache_size=CACHE_SIZE)
        tuned_parameters['gamma'] = GAMMA_RANGE
    else:
        clf = svm.LinearSVC(dual=False)
    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)
    # Generate the K-fold development split
    skf = cross_validation.StratifiedKFold(classes, n_folds=K_FOLD, shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)
    # Generate the grid search
    if args.search == 'grid':
        gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                        n_jobs=1, verbose=get_verbose_level())
    else:
        gscv = grid_search.RandomizedSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                              n_jobs=1, verbose=get_verbose_level(), n_iter=args.iter)
    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)
    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose("%0.6f (+/-%0.06f) for %r"
                      % (mean_score, scores.std() / 2, params), 5)
    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)
    return gscv
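The constants and logging helpers used above come from the surrounding project; a sketch of assumed definitions (all values are illustrative only):
import numpy as np

C_RANGE = np.logspace(-2, 3, 6)          # candidate C values
GAMMA_RANGE = np.logspace(-4, 1, 6)      # candidate gamma values for the RBF kernel
CLASS_WEIGHTS = [None, 'balanced']
K_FOLD = 5
CACHE_SIZE = 1000                        # kernel cache for svm.SVC, in MB

def get_verbose_level():
    return 1

def print_verbose(message, level):
    # print only if the message's level does not exceed the configured verbosity
    if level <= get_verbose_level():
        print(message)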