def build_ensemble(**kwargs):
    """Generate an mlens SuperLearner ensemble with per-case preprocessing."""
    ens = SuperLearner(**kwargs)
    # Map each preprocessing case to its list of transformers.
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}
    # Map each preprocessing case to the estimators trained on it.
    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}
    ens.add(est, prep)
    # Stack a gradient boosting meta-learner on top of the base layer.
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
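A brief usage sketch, assuming the mlens package and the imports and SEED above are in scope (the SuperLearner keyword arguments and the synthetic data are illustrative, not from the original source):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
ens = build_ensemble(scorer=mean_squared_error, random_state=SEED)
ens.fit(X, y)
preds = ens.predict(X)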
Python Lasso() class usage examples (source code)
def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def model_cross_valid(X, Y):
    seed = 7
    # shuffle=True is required for random_state to take effect.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    def build_model(model_name):
        model = model_name()
        return model
    scoring = 'neg_mean_squared_error'
    # TODO: also try random forest, boosting, LSTM, and GBDT models.
    #for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
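A minimal usage sketch for model_cross_valid, with scikit-learn's diabetes dataset as stand-in data (the dataset choice is illustrative):

from sklearn import model_selection
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, ElasticNet

X, Y = load_diabetes(return_X_y=True)
model_cross_valid(X, Y)  # prints the mean negative MSE for each model class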
def fit_regression(X, y, regression_class=LinearRegression, regularization_const=.001):
    '''
    Given a dataset and its targets (X, y), a regression class (from scikit-learn),
    and a regularization constant (required if the regression class is Lasso or Ridge),
    fit a regressor and report its cross-validated RMSE.
    X (pandas DataFrame): The data.
    y (pandas DataFrame or Series): The targets.
    regression_class (class): One of sklearn.linear_model.[LinearRegression, Ridge, Lasso]
    regularization_const: the regularization parameter for Ridge or Lasso.
        Called alpha by scikit-learn for interface reasons.
    Return:
        tuple, (the_fitted_regressor, mean(cross_val_score)).
    '''
    if regression_class is LinearRegression:
        predictor = regression_class()
    else:
        # NOTE: the normalize parameter was removed in scikit-learn 1.2;
        # on recent versions, standardize the data beforehand instead.
        predictor = regression_class(alpha=regularization_const, normalize=True)
    predictor.fit(X, y)
    cross_scores = cross_val_score(predictor, X, y=y, scoring='neg_mean_squared_error')
    # scikit-learn returns negative MSE values, so negate and take the root for RMSE.
    cross_scores_corrected = np.sqrt(-1 * cross_scores)
    return (predictor, np.mean(cross_scores_corrected))
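A short usage sketch with synthetic pandas data (the column names and data are illustrative; the default LinearRegression path avoids the removed normalize flag):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({'a': rng.normal(size=100), 'b': rng.normal(size=100)})
y = 3 * X['a'] - 2 * X['b'] + rng.normal(scale=0.1, size=100)
model, rmse = fit_regression(X, y)
print(rmse)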
def run(self):
    """ Run the experiment. """
    # Fetch the training data and fit the classifier.
    X, Y = self._fetch_data()
    clf = self.get_classifier(X, Y)
    # Evaluate on the test data.
    X, Y = self._fetch_test_data()
    res = []
    for item in range(11):
        hit_ratio = self.predict(clf, X, Y, item * 0.1)
        res.append([item * 0.1 * 100, hit_ratio * 100])
    # Plot the hit ratio against the prediction threshold.
    arr = np.array(res)
    plt.plot(arr[:, 0], arr[:, 1])        # line
    plt.plot(arr[:, 0], arr[:, 1], 'ro')  # points
    plt.xlabel('Threshold (%)')
    plt.ylabel('Hit ratio (%)')
    plt.title('Lasso model hit ratio vs. prediction threshold')
    plt.show()
def lasso_regularization(matrix_a, vector_y, lambda_parameter=0):
    """
    Lasso algorithm that solves min ||y - Ax||_2^2 + lambda ||x||_1
    :param matrix_a: design matrix A
    :param vector_y: observation vector y
    :param lambda_parameter: regularization weight lambda
    :return: estimated x
    """
    # Convert the regularization parameter (sklearn's objective carries a 1/(2m) factor).
    reg_parameter = lambda_parameter / (2 * len(vector_y))
    # Initialize the model (normalization off; this matches sklearn's default behavior).
    clf = linear_model.Lasso(alpha=reg_parameter, fit_intercept=False)
    # Fit it.
    clf.fit(matrix_a, vector_y)
    # Return the estimate.
    x = clf.coef_
    return x
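A quick numerical check of the 1/(2m) conversion, as a sketch with synthetic data (not from the original source): both objectives should be minimized by the same coefficients.

import numpy as np
from sklearn import linear_model

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 10))
y = A @ rng.normal(size=10) + 0.1 * rng.normal(size=50)

lam = 1.0
x_hat = lasso_regularization(A, y, lam)
# alpha = lam / (2m) makes sklearn's (1/(2m))||y - Ax||^2 + alpha||x||_1
# proportional to ||y - Ax||^2 + lam||x||_1, so the minimizers coincide.
direct = linear_model.Lasso(alpha=lam / (2 * len(y)), fit_intercept=False)
direct.fit(A, y)
assert np.allclose(x_hat, direct.coef_)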
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cv(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def mlr_val_vseq_lasso(RM, yE, v_seq, alpha=.5, disp=True, graph=True):
    """
    Validation is performed using the v_seq indexed values.
    """
    org_seq = list(range(len(yE)))
    t_seq = [x for x in org_seq if x not in v_seq]
    RMt, yEt = RM[t_seq, :], yE[t_seq, 0]
    RMv, yEv = RM[v_seq, :], yE[v_seq, 0]
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(RMt, yEt)
    if disp: print('Training result')
    mlr_show(clf, RMt, yEt, disp=disp, graph=graph)
    if disp: print('Validation result')
    r_sqr, RMSE = mlr_show(clf, RMv, yEv, disp=disp, graph=graph)
    # if r_sqr < 0:
    #     print('v_seq:', v_seq, '--> r_sqr =', r_sqr)
    return r_sqr, RMSE
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross-validation with SVR is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cv(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation is performed so as to generate prediction
    output for all input molecules.
    """
    n_splits = xM.shape[0]
    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # KFold with n_splits == n_samples is equivalent to leave-one-out.
    kf_n_c = model_selection.KFold(n_splits=n_splits)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
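scikit-learn also ships a dedicated splitter for this; a minimal equivalent sketch using LeaveOneOut (an alternative formulation, not the original author's code):

from sklearn import linear_model, model_selection

def cv_loo_alt(method, xM, yV, alpha, n_jobs=-1):
    # LeaveOneOut() behaves like KFold(n_splits=n_samples) without
    # needing to know the sample count up front.
    clf = getattr(linear_model, method)(alpha=alpha)
    loo = model_selection.LeaveOneOut()
    return model_selection.cross_val_predict(clf, xM, yV, cv=loo, n_jobs=n_jobs)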
def pd_gscv(pdr, method, xM, yV, alphas_log, colname='Predicted-RP', fname='sheet/rafa36795_cxcalc_prp1000.csv'):
    """
    This runs a grid search, performs cross-validation for plotting,
    and saves the predicted values.
    """
    print("1. Searching for the best hyperparameter by a grid method.")
    gr = jgrid.gs(method, xM, yV, alphas_log)
    print(gr.grid_scores_)  # NOTE: replaced by cv_results_ in scikit-learn >= 0.20
    print("Best alpha:", gr.best_params_['alpha'])
    print("2. Predicting the property using the best hyperparameter and showing an x-y plot.")
    yV_pred = jgrid.cv('Lasso', xM, yV, alpha=gr.best_params_['alpha'], grid_std=gr_beststd(gr))
    print("3. Saving the cross-validation predictions to", fname)
    pdw = pdr.copy()
    pdw[colname] = yV_pred.tolist()
    pdw.to_csv(fname, index=False)
    print("4. Saving the best estimator as a pkl file")
    print(gr.best_estimator_)
    # sklearn.externals.joblib was removed; use the standalone joblib package.
    joblib.dump(gr.best_estimator_, fname[:-3] + "pkl")
def cv_SVR(xM, yV, svr_params, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross-validation with SVR is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV.A1, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    Return
    --------
    yV_pred
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
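A minimal call sketch with synthetic data (illustrative; graph=False skips the project-specific jutil.cv_show plotting):

import numpy as np

xM = np.random.default_rng(0).normal(size=(100, 20))
yV = xM @ np.ones(20) + 0.1 * np.random.default_rng(1).normal(size=100)

yV_pred = cv('Lasso', xM, yV, alpha=0.1, graph=False)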
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation is performed so as to generate prediction
    output for all input molecules.
    """
    n_folds = xM.shape[0]
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # Note - shuffling is not applied because of LOO.
    kf_n_c = model_selection.KFold(n_splits=n_folds)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
Source file: jgrid (james-90X3A's conflicted copy 2016-04-21).py, from project jamespy_py3 by jskDr
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation is performed so as to generate prediction
    output for all input molecules.
    """
    n_folds = xM.shape[0]
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def test_Lasso(*data):
    '''
    Examine how alpha affects the sparsity of the fitted Lasso coefficients.
    :param data: X, y
    :return: None
    '''
    X, y = data
    alphas = np.logspace(-2, 2)
    zeros = []
    for alpha in alphas:
        regr = Lasso(alpha=alpha)
        regr.fit(X, y)
        # Count coefficients driven (numerically) to zero.
        num = 0
        for ele in regr.coef_:
            if abs(ele) < 1e-5:
                num += 1
        zeros.append(num)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, zeros)
    ax.set_xlabel(r"$\alpha$")
    ax.set_xscale("log")
    ax.set_ylim(0, X.shape[1] + 1)
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In Lasso")
    plt.show()
def test_Lasso_alpha(*data):
    '''
    Examine how the test score varies with alpha.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    scores = []
    for alpha in alphas:
        regr = linear_model.Lasso(alpha=alpha)
        regr.fit(X_train, y_train)
        scores.append(regr.score(X_test, y_test))
    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel(r"score")
    ax.set_xscale('log')
    ax.set_title("Lasso")
    plt.show()
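A usage sketch for both test functions, with the diabetes dataset as stand-in data (the dataset choice is illustrative):

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
test_Lasso(X, y)  # sparsity vs. alpha on the full data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
test_Lasso_alpha(X_train, X_test, y_train, y_test)  # test score vs. alpha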
def test_rank_deficient_design():
    # consistency test that checks that LARS Lasso is handling rank
    # deficient input data (with numerical rank < n_features) in the
    # same way as coordinate descent Lasso
    y = [5, 0, 5]
    for X in ([[5, 0],
               [0, 5],
               [10, 10]],
              [[10, 10, 0],
               [1e-32, 0, 0],
               [0, 0, 1]],
              ):
        # To be able to use the coefs to compute the objective function,
        # we need to turn off normalization
        lars = linear_model.LassoLars(.1, normalize=False)
        coef_lars_ = lars.fit(X, y).coef_
        obj_lars = (1. / (2. * 3.)
                    * linalg.norm(y - np.dot(X, coef_lars_)) ** 2
                    + .1 * linalg.norm(coef_lars_, 1))
        coord_descent = linear_model.Lasso(.1, tol=1e-6, normalize=False)
        coef_cd_ = coord_descent.fit(X, y).coef_
        obj_cd = ((1. / (2. * 3.)) * linalg.norm(y - np.dot(X, coef_cd_)) ** 2
                  + .1 * linalg.norm(coef_cd_, 1))
        assert_less(obj_lars, obj_cd * (1. + 1e-8))
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when early stopping is used.
    # (test : before, in the middle, and in the last part of the path)
    alphas_min = [10, 0.9, 1e-4]
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
    # same test, with normalization
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True,
                                      tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
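These two tests reference module-level X and y; in the scikit-learn test suite they come from the diabetes dataset, so a matching setup would be:

import numpy as np
from scipy import linalg
from sklearn import datasets, linear_model
# assert_less came from sklearn.utils.testing (since removed); a plain
# assert obj_lars < obj_cd * (1. + 1e-8) works the same way.

diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target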
def train(self):
    """Train a Lasso model on the outlier-truncated training data."""
    start = time.time()
    extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
    print('size before truncating outliers is %d ' % len(self.TrainData))
    self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
    #self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how='left')
    self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)
    print('size after truncating outliers is %d ' % len(self.TrainData))
    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)
    lr = Lasso(alpha=self._lr_alpha, max_iter=self._lr_iter, tol=1e-4, random_state=2017, selection=self._lr_sel)
    self._model = lr.fit(X, Y)
    end = time.time()
    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))
    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    # The with block closes the file; no explicit close() is needed.
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index resets the index so the indexes do not overlap
    return
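A short sketch for loading the pickled model back for inference (the variable names model_path and X_new are illustrative):

import pickle

with open(model_path, 'rb') as f:  # model_path: the .pkl file written by train()
    model = pickle.load(f)
preds = model.predict(X_new)       # X_new: float32 features with the training columns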
def define_model(self):
    #if self.modeltype == "AR":
    #    return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
    if self.modeltype == "RandomForest":
        return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
        #return ensemble.RandomForestClassifier(
        #    n_estimators=self.parameters['n_estimators'])
    elif self.modeltype == "LinearRegression":
        return linear_model.LinearRegression()
    elif self.modeltype == "Lasso":
        return linear_model.Lasso(
            alpha=self.parameters['alpha'])
    elif self.modeltype == "ElasticNet":
        return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
    elif self.modeltype == "SVR":
        return SVR(
            C=self.parameters['C'],
            epsilon=self.parameters['epsilon'],
            kernel=self.parameters['kernel'])
    #elif self.modeltype == 'StaticModel':
    #    return StaticModel(
    #        parameters=self.parameters
    #    )
    #elif self.modeltype == 'AdvancedStaticModel':
    #    return AdvancedStaticModel(
    #        parameters=self.parameters
    #    )
    # elif self.modeltype == 'SGDRegressor':
    #     print(self.parameters)
    #     return linear_model.SGDRegressor(
    #         loss=self.parameters['loss'],
    #         penalty=self.parameters['penalty'],
    #         l1_ratio=self.parameters['l1_ratio'])
    else:
        raise ConfigError("Unsupported model {0}".format(self.modeltype))
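A minimal usage sketch, assuming a host class that carries the modeltype and parameters attributes define_model expects (the ModelConfig class name here is hypothetical):

from sklearn import ensemble, linear_model
from sklearn.svm import SVR

class ModelConfig:                  # hypothetical host class
    def __init__(self, modeltype, parameters):
        self.modeltype = modeltype
        self.parameters = parameters
    define_model = define_model     # reuse the function above as a method

model = ModelConfig("Lasso", {"alpha": 0.1}).define_model()
print(model)  # Lasso(alpha=0.1)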