def train_model_with_cv(model, params, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Use the training data for parameter selection via grid search
    gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
    gs_clf = gs_clf.fit(X_train, y_train)
    model = gs_clf.best_estimator_

    # Use the best model and the held-out test data for the final evaluation
    y_pred = model.predict(X_test)
    _f1 = f1_score(y_test, y_pred, average='micro')
    _confusion = confusion_matrix(y_test, y_pred)
    # micro-averaging keeps precision/recall consistent with the F1 score
    # above and avoids errors on multiclass targets
    _precision = precision_score(y_test, y_pred, average='micro')
    _recall = recall_score(y_test, y_pred, average='micro')
    _statistics = {'f1_score': _f1,
                   'confusion_matrix': _confusion,
                   'precision': _precision,
                   'recall': _recall
                   }
    return model, _statistics
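A minimal usage sketch (the dataset and estimator here are illustrative assumptions, not part of the original snippet):

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (f1_score, confusion_matrix,
                             precision_score, recall_score)

X, y = load_iris(return_X_y=True)
best_model, stats = train_model_with_cv(SVC(), {'C': [0.1, 1, 10]}, X, y)
print(stats['f1_score'], stats['precision'], stats['recall'])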
Python GridSearchCV() example source code
def fit(self, X, y=None):
    """Fitting function on the data."""
    if self.data_normalizer is not None:
        X = self.normalize_data(X)
    if self.label_normalizer is not None:
        y = self.normalize_label(y)
    if self.force_classifier:
        clf = make_classifier(self.learner, params=self.learner_options)
    elif callable(self.learner):
        clf = self.learner(**self.learner_options)
    else:
        clf = self.learner
    self.gs_ = GridSearchCV(estimator=clf, **self.cv_options)
    self.gs_.fit(X, y)
    return self  # follow the scikit-learn convention of returning self from fit
def train(self, train_size=0.8, k_folds=5):
# retrieve data from DB and pre-process
self._get_data()
# perform train/test split
self._get_train_test_split(train_size=train_size)
# define text pre-processing pipeline
text_pipeline = Pipeline([
('extract_text', DFColumnExtractor(TEXT_FEATURES)),
('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
])
# define pipeline for pre-processing of numeric features
numeric_pipeline = Pipeline([
('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
('scaler', MinMaxScaler())
])
# combine both steps into a single pipeline
pipeline = Pipeline([
('features', FeatureUnion([
('text_processing', text_pipeline),
('num_processing', numeric_pipeline)
])),
('clf', self._estimator)
])
self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)
X = self.data.iloc[self.train_inds_, :]
y = self.data[LABEL].values[self.train_inds_]
gs.fit(X, y)
        self.logger.info('Best mean cross-validation score is {0}'.format(gs.best_score_))
self.gs_ = gs
self.model_ = gs.best_estimator_
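DFColumnExtractor is referenced in the pipelines above but not defined in this snippet; the following is a minimal sketch of what such a transformer presumably looks like (an assumption, not the original implementation):

from sklearn.base import BaseEstimator, TransformerMixin

class DFColumnExtractor(BaseEstimator, TransformerMixin):
    """Select a column (or list of columns) from a pandas DataFrame."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # returns a Series for a single column name, a DataFrame for a list
        return X[self.columns]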
def fit(self, X, y=None, groups=None):
"""Run fit with all sets of parameters.
Parameters
----------
X : array-like, shape=(n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
Target relative to X for classification or regression;
None for unsupervised learning.
groups : array-like, shape=(n_samples,), optional (default=None)
Group labels for the samples used while splitting the dataset into
train/test set.
"""
return super(GridSearchCV, self).fit(X, _as_numpy(y), groups)
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    global num_boost_round
    global params_sklearn
    for param_untuned in params_untuned:
        print('==========', param_untuned, '==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # return_train_score=True is needed for mean_train_score on newer scikit-learn
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10, return_train_score=True)
        # x, y are module-level training features/labels in the original source
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
        if len(params_untuned) == 1:
            return v
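An illustrative call (the starting params_sklearn dict and the module-level x, y training data are assumptions about the surrounding module):

params_sklearn = {'learning_rate': 0.1, 'n_estimators': 100,
                  'objective': 'binary:logistic'}
# with a single grid, the best value for that parameter is returned
best_max_depth = tune_xgb_cv([{'max_depth': [3, 5, 7]}], scoring='roc_auc')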
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    global num_boost_round
    global params_sklearn
    for param_untuned in params_untuned:
        print('==========', param_untuned, '==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # return_train_score=True is needed for mean_train_score on newer scikit-learn
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10, return_train_score=True)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
def tune_classifier(estimator, params, X_train, Y_train, scoring='roc_auc', n_jobs=3, cv=5):
    results = []
    for k, values in params.items():
        # build a single-parameter grid keyed by the parameter name;
        # dict(k=values) would wrongly create the literal key 'k'
        params_single = {k: values}
        print('==========', params_single, '==============')
        grid_search = GridSearchCV(estimator, param_grid=params_single, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=5, return_train_score=True)
        grid_search.fit(X_train, Y_train)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score : ', grid_search.best_score_)
        results.append(grid_search.best_params_)
    return results
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print('==========', param_untuned, '==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # return_train_score=True is needed for mean_train_score on newer scikit-learn
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose, return_train_score=True)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
def test_pipeline(get_models, get_transform, get_kernel):
alg, model = get_models
trans = get_transform()
kernel = get_kernel() + WhiteKernel()
pipe = Pipeline(steps=[(alg, model())])
param_dict = {}
if hasattr(model(), 'n_estimators'):
param_dict[alg + '__n_estimators'] = [5]
if hasattr(model(), 'kernel'):
param_dict[alg + '__kernel'] = [kernel]
param_dict[alg + '__target_transform'] = [trans]
    # iid was deprecated in scikit-learn 0.22 and removed in 0.24, and the
    # mean_train_score assertion below needs return_train_score=True on newer
    # versions; this test targets the older API
    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
np.random.seed(10)
estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
assert estimator.cv_results_['mean_train_score'][0] > -15.0
def test_svr_pipeline(get_transform, get_svr_kernel):
trans = get_transform()
pipe = Pipeline(steps=[('svr', svr())])
param_dict = {'svr__kernel': [get_svr_kernel]}
param_dict['svr__target_transform'] = [trans]
    # see the iid / return_train_score note in test_pipeline above
    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
np.random.seed(1)
estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
assert estimator.cv_results_['mean_train_score'][0] > -10.0
def test_krige_pipeline(get_krige_method, get_variogram_model):
pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
param_dict = {'krige__variogram_model': [get_variogram_model]}
    # see the iid / return_train_score note in test_pipeline above
    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                             )
np.random.seed(1)
X = np.random.randint(0, 400, size=(20, 2)).astype(float)
y = 5*np.random.rand(20)
estimator.fit(X=X, y=y)
assert estimator.cv_results_['mean_train_score'][0] > -1.0
def test_cv():
"""Simple CV check."""
# XXX: don't use scikit-learn for tests.
X, y = make_regression()
    # model_selection.KFold takes n_splits; the old two-argument
    # cross_validation.KFold(n, n_folds) form no longer exists
    cv = KFold(n_splits=5)
glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
# check that it returns 5 scores
scores = cross_val_score(glm_normal, X, y, cv=cv)
assert_equal(len(scores), 5)
param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
{'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
10, base=np.exp(1))}]
glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
glmcv.fit(X, y)
def setBestParameters(self):
    cv = StratifiedKFold(n_splits=self.conf.num_folds)
    param_grid = self.conf.getParamGrid()
    if param_grid is None:
        # No parameter value to select
        return
    if self.conf.families_supervision:
        scoring = 'f1_macro'
    else:
        scoring = 'roc_auc'
    # fit parameters are passed to fit() directly: the fit_params constructor
    # argument was deprecated in scikit-learn 0.19 and later removed
    grid_search = GridSearchCV(self.pipeline, param_grid=param_grid,
                               scoring=scoring,
                               cv=cv,
                               n_jobs=-1)
    grid_search.fit(self.datasets.train_instances.getFeatures(),
                    self.getSupervision(self.datasets.train_instances),
                    model__sample_weight=self.datasets.sample_weight)
    self.conf.setBestValues(grid_search)
    self.pipeline.set_params(**self.conf.getBestValues())
    return cv
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Select Model...')
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth': [4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth': [16, 15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
def xgb_model_select(train_file_name):
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    train_df.info()  # info() prints directly and returns None
    print(train_df.describe())
    print(train_df.isnull().sum())
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    train_np = train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Select Model...')
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9],
                  'gamma': [0.1, 0.3, 0.5, 0.7], 'min_child_weight': [1, 3, 5, 7]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
def grid(X, y):
'''
Adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py
Perform a grid search.
'''
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=8)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X, y)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
def fit(self, df, y, param_grid=None):
    from sklearn.model_selection import GridSearchCV
    # here y names the target column; it is then rebound to the target values
    X = df.drop(y, axis=1).values
    y = df[y].values
meta_X = self.get_meta(X)
if param_grid is not None:
model = self.stacked_model_class()
gridsearch = GridSearchCV(model, param_grid)
gridsearch.fit(meta_X, y)
self.stacked_model = self.stacked_model_class(**gridsearch.best_params_)
else:
self.stacked_model = self.stacked_model_class()
self.stacked_model.fit(meta_X, y)
def grid_search_cv(clf, x, y, params, cv=5):
    """
    :param clf: The classifier over which we want to perform
        grid search.
    :param x: Features
    :param y: Target
    :param params: Hyperparameters to perform grid search on
    :param cv: k-fold CV parameter
    """
    gs = GridSearchCV(clf, param_grid=params, cv=cv)
    gs.fit(x, y)
    print()
    print('BEST PARAMS:', gs.best_params_)
    print('BEST SCORE:', gs.best_score_)
    print()
    best_estimator = gs.best_estimator_
    return best_estimator
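A hedged usage sketch (the toy dataset and estimator are illustrative assumptions, not part of the source):

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = load_breast_cancer(return_X_y=True)
best_rf = grid_search_cv(RandomForestClassifier(), X, y,
                         {'n_estimators': [50, 100], 'max_depth': [None, 5]})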
######################
# PREPARING THE DATA #
######################
#get the last 4 images from each file
def LogisticRegression(X_train, y_train):
    # the local import shadows this function's own name inside its scope
    from sklearn.linear_model import LogisticRegression
    parameters = {
        'C': [0.6, 0.8, 1.0, 1.2],
        'class_weight': [None, 'balanced'],
    }
    LR = LogisticRegression()
    grid_search = GridSearchCV(estimator=LR, param_grid=parameters, cv=5,
                               scoring='neg_log_loss', n_jobs=4)
    print("logistic regression grid_search start at "
          + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    grid_search.fit(X_train, y_train)
    # re-read the clock; reusing the pre-fit timestamp would report the start time twice
    print("logistic regression grid_search done at "
          + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it
    for mean, params in zip(grid_search.cv_results_['mean_test_score'],
                            grid_search.cv_results_['params']):
        print(mean, params)
    print("\nBest score: %0.3f\n" % grid_search.best_score_)
    print("---------best parameters---------")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                     oob_score=True, n_jobs=-1, verbose=1))
    model_tuning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                n_jobs=-1, cv=10,
                                scoring=make_scorer(f1_score))
    model_tuning.fit(X, y)
    # best_score_ is the best mean cross-validation score, not a test score
    print('The best CV score: ', model_tuning.best_score_)
    # X_test is expected to be defined at module level in the original source
    y_score = model_tuning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tuning
def clean_params_for_sk(params: dict) -> dict:
"""
Given a dictionary of XGB parameters, return a copy without parameters that will cause issues with scikit-learn's grid or
randomized search estimators.
:param params:
A dictionary of XGB parameters.
:return:
A copy of the same dictionary without the aforementioned problematic parameters.
"""
# In the xgb.cv call, nthread should be equal to the CPU count, but this causes a hang when
# called through GridSearchCV - parallelism should be achieved through its n_jobs parameter.
# See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
params_copy = params.copy()
params_copy['nthread'] = 1
# In multiclass problems, this parameter is required for XGBoost, but is not a parameter of interest to be tuned.
if 'num_class' in params_copy.keys():
del params_copy['num_class']
return params_copy
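A quick illustration of what the function strips and rewrites (the parameter values here are made up for the example):

xgb_params = {'max_depth': 6, 'nthread': 8, 'num_class': 3,
              'objective': 'multi:softprob'}
sk_safe = clean_params_for_sk(xgb_params)
# sk_safe == {'max_depth': 6, 'nthread': 1, 'objective': 'multi:softprob'}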
def fit(self, X, *args, **kwargs):
    # pick the search wrapper (if any) and remember it in one pass,
    # instead of testing self._grid_search / self._random_search twice
    if self._grid_search is not None:
        model = GridSearchCV(self._model, **self._grid_search)
        self._grid = model
    elif self._random_search is not None:
        model = RandomizedSearchCV(self._model, **self._random_search)
        self._rnd = model
    else:
        model = self._model
    assert (self.target in X.columns.values), 'X must contain the target column'
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)
    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    # plain loop instead of a list comprehension used only for its side effects
    for t in self._columns_exclude:
        self._xcols.remove(t)
    x = X[self._xcols]
    y = X[self.target]
    model.fit(x, y, **kwargs)
    return self
def gs_numpy(method, X, Y, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1, disp=True):
    """
    Grid search with numpy arrays X and Y.
    Previously np.mat was used for compatibility with MATLAB notation.
    """
    if disp:
        print(X.shape, Y.shape)
    clf = getattr(linear_model, method)()
    params = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, Y)
    return gs
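A possible call, assuming linear_model and model_selection are the scikit-learn modules imported at the top of the original file:

import numpy as np
from sklearn import linear_model, model_selection

X = np.random.rand(50, 4)
Y = X @ np.array([1.0, 2.0, 0.5, -1.0]) + 0.1 * np.random.rand(50)
gs = gs_numpy('Ridge', X, Y, alphas_log=(-3, 1, 5))
print(gs.best_params_, gs.best_score_)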
def gs_classifier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classifier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example
    params = {"C": np.logspace(-2, 2, 5)}
    """
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    XX holds the concatenated linear descriptors; it can be None if unused.
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    X_concat holds the concatenated linear descriptors; it can be None if unused.
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param_grid = {"C": np.logspace(-2, 2, 5)}
    """
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    # return_train_score=True is required for mean_train_score on newer scikit-learn
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs,
                                      return_train_score=True)
    gs.fit(X, y)
    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()
    return gs