# These snippets assume: import numpy as np; from sklearn import linear_model
# (print_title is a project-local helper).
def train_lassolars_model(train_x, train_y, predict_x):
    print_title("LassoLars Regressor")
    reg = linear_model.LassoLarsCV(
        cv=10, n_jobs=3, max_iter=2000, normalize=False)
    reg.fit(train_x, train_y)
    print("alphas and cv_alphas: {0} and {1}".format(
        reg.alphas_.shape, reg.cv_alphas_.shape))
    print("alphas[%d]: %s" % (len(reg.cv_alphas_), reg.cv_alphas_))
    print("mse shape: {0}".format(reg.cv_mse_path_.shape))
    # Look up the cross-validation MSE at the selected alpha.
    index = np.where(reg.cv_alphas_ == reg.alpha_)[0]
    _mse_v = np.mean(reg.cv_mse_path_[index, :])
    print("mse value: %f" % _mse_v)
    print("best alpha: %f" % reg.alpha_)
    best_alpha = reg.alpha_
    # Refit a plain LassoLars with the cross-validated alpha.
    reg = linear_model.LassoLars(alpha=best_alpha)
    reg.fit(train_x, train_y)
    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zero coefs: %d" % n_nonzeros)
    predict_y = reg.predict(predict_x)
    return {'y': predict_y, 'coef': reg.coef_}
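A minimal smoke test for the function above (a hedged sketch: the synthetic data and the stub for the project's print_title helper are assumptions, and the snippet targets older scikit-learn releases where LassoLarsCV still accepted normalize= and exposed cv_alphas_/cv_mse_path_):

import numpy as np
from sklearn import linear_model

def print_title(title):
    # Stand-in for the project's own helper (assumed, not from the source).
    print('=== %s ===' % title)

rng = np.random.RandomState(0)
train_x = rng.randn(200, 20)
true_coef = np.zeros(20)
true_coef[:3] = [1.5, -2.0, 3.0]          # sparse ground truth
train_y = train_x.dot(true_coef) + 0.1 * rng.randn(200)
predict_x = rng.randn(50, 20)

result = train_lassolars_model(train_x, train_y, predict_x)
print((result['coef'] != 0).sum())        # should recover roughly 3 features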
def test_rank_deficient_design():
    # consistency test that checks that LARS Lasso is handling rank
    # deficient input data (with rank < n_features) in the same way
    # as coordinate descent Lasso
    y = [5, 0, 5]
    for X in ([[5, 0],
               [0, 5],
               [10, 10]],
              [[10, 10, 0],
               [1e-32, 0, 0],
               [0, 0, 1]],
              ):
        # To be able to use the coefs to compute the objective function,
        # we need to turn off normalization
        lars = linear_model.LassoLars(.1, normalize=False)
        coef_lars_ = lars.fit(X, y).coef_
        obj_lars = (1. / (2. * 3.)
                    * linalg.norm(y - np.dot(X, coef_lars_)) ** 2
                    + .1 * linalg.norm(coef_lars_, 1))
        coord_descent = linear_model.Lasso(.1, tol=1e-6, normalize=False)
        coef_cd_ = coord_descent.fit(X, y).coef_
        obj_cd = ((1. / (2. * 3.)) * linalg.norm(y - np.dot(X, coef_cd_)) ** 2
                  + .1 * linalg.norm(coef_cd_, 1))
        # LARS should reach an objective no worse than coordinate descent.
        assert_less(obj_lars, obj_cd * (1. + 1e-8))
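The quantity computed inline above is the standard Lasso objective, (1 / (2 * n_samples)) * ||y - Xw||^2 + alpha * ||w||_1; a small helper making that explicit (a sketch, not part of the original test):

import numpy as np
from scipy import linalg

def lasso_objective(X, y, coef, alpha):
    # The objective both estimators minimize; lower is better.
    n_samples = np.asarray(X).shape[0]
    residual = np.asarray(y) - np.dot(X, coef)
    return (linalg.norm(residual) ** 2 / (2. * n_samples)
            + alpha * linalg.norm(coef, 1))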
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when early stopping is used.
    # (test : before, in the middle, and in the last part of the path)
    alphas_min = [10, 0.9, 1e-4]
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
    # same test, with normalization
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True,
                                      tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
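For context, lars_path computes the whole regularization path at once: it returns the alpha value at each knot, the indices of active features, and a (n_features, n_knots) coefficient matrix. A quick sketch on made-up data:

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(42)
X_demo = rng.randn(50, 10)
y_demo = X_demo[:, 0] - 2 * X_demo[:, 1] + 0.01 * rng.randn(50)

alphas_demo, active_demo, path_demo = linear_model.lars_path(
    X_demo, y_demo, method='lasso')
print(alphas_demo.shape)  # (n_knots,): alphas decrease along the path
print(path_demo.shape)    # (n_features, n_knots): one coefficient column per knot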
def test_multitarget():
    # Assure that estimators receiving multidimensional y do the right thing
    X = diabetes.data
    Y = np.vstack([diabetes.target, diabetes.target ** 2]).T
    n_targets = Y.shape[1]
    for estimator in (linear_model.LassoLars(), linear_model.Lars()):
        estimator.fit(X, Y)
        Y_pred = estimator.predict(X)
        Y_dec = assert_warns(DeprecationWarning, estimator.decision_function, X)
        assert_array_almost_equal(Y_pred, Y_dec)
        alphas, active, coef, path = (estimator.alphas_, estimator.active_,
                                      estimator.coef_, estimator.coef_path_)
        for k in range(n_targets):
            estimator.fit(X, Y[:, k])
            y_pred = estimator.predict(X)
            assert_array_almost_equal(alphas[k], estimator.alphas_)
            assert_array_almost_equal(active[k], estimator.active_)
            assert_array_almost_equal(coef[k], estimator.coef_)
            assert_array_almost_equal(path[k], estimator.coef_path_)
            assert_array_almost_equal(Y_pred[:, k], y_pred)
def train(self):
    """Fit a LassoLars model on the outlier-truncated training data."""
    start = time.time()
    print('size before truncating outliers is %d ' % len(self.TrainData))
    TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) &
                               (self.TrainData['logerror'] < self._up)]
    print('size after truncating outliers is %d ' % len(TrainData))
    # Re-center the geographic coordinates.
    TrainData['longitude'] -= -118600000
    TrainData['latitude'] -= 34220000
    #extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
    #self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)
    # Build the training matrix from the truncated frame.
    X = TrainData.drop(self._l_drop_cols, axis=1)
    Y = TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)
    lr = LassoLars(alpha=self._lr_alpha, max_iter=self._lr_iter, verbose=True)
    self._model = lr.fit(X, Y)
    end = time.time()
    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))
    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    #with open(self._f_eval_train_model, 'wb') as o_file:
    #    pickle.dump(self._model, o_file, -1)
    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  # ignore_index resets the index so indices don't overlap
    return
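The truncation bounds self._low and self._up are set elsewhere in the class; a common way to pick them (an assumption here, not taken from the repository) is quantile clipping of the target:

import pandas as pd

# Hypothetical frame standing in for the competition data; the 1%/99%
# quantile levels are illustrative, not the project's actual choice.
train_df = pd.DataFrame({'logerror': [0.01, -0.02, 0.5, -0.7, 0.03]})
low = train_df['logerror'].quantile(0.01)   # lower truncation bound
up = train_df['logerror'].quantile(0.99)    # upper truncation bound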
def get_models4ensamble(conf):
    models = []
    #models = [RFRModel(conf), DLModel(conf), LRModel(conf)]
    #models = [LRModel(conf)]
    # see http://scikit-learn.org/stable/modules/linear_model.html
    # command 0 was too big to run with depth set to 1, and 1 was overfitting a bit
    if conf.command == 1:
        xgb_params = {"objective": "reg:linear", "booster": "gbtree", "max_depth": 3, "eta": 0.1,
                      "min_child_weight": 5, "subsample": 0.5, "nthread": 4,
                      "colsample_bytree": 0.5, "num_parallel_tree": 1, 'gamma': 0}
    else:
        xgb_params = {"objective": "reg:linear", "booster": "gbtree", "max_depth": 10, "eta": 0.1,
                      "min_child_weight": 8, "subsample": 0.5, "nthread": 4,
                      "colsample_bytree": 0.5, "num_parallel_tree": 1, 'gamma': 0}
    models = [
        #DLModel(conf),
        #LRModel(conf, model=linear_model.BayesianRidge()),
        #LRModel(conf, model=linear_model.LassoLars(alpha=.1)),
        #LRModel(conf, model=linear_model.Lasso(alpha=0.1)),
        #LRModel(conf, model=linear_model.Ridge(alpha=.5)),
        #LRModel(conf, model=Pipeline([('poly', PolynomialFeatures(degree=3)),
        #                              ('linear', LinearRegression(fit_intercept=False))])),
        XGBoostModel(conf, xgb_params, use_cv=True),
        LRModel(conf, model=linear_model.Lasso(alpha=0.3)),
        RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
        #LRModel(conf, model=linear_model.Lasso(alpha=0.2)),
        ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
        #AdaBoostRModel(conf, model=AdaBoostRegressor(loss='square'))
    ]
    return models
    #return [XGBoostModel(conf, xgb_params, use_cv=True)]
def test_regressor_cv(self):
    """
    Ensure only "CV" regressors are allowed
    """
    for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet):
        with self.assertRaises(YellowbrickTypeError):
            alphas = AlphaSelection(model())
    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            alphas = AlphaSelection(model())
        except YellowbrickTypeError:
            self.fail("could not instantiate RegressorCV on alpha selection")
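For reference, AlphaSelection (from Yellowbrick) wraps a *CV regressor and plots cross-validated error against the alpha grid, which is why the test rejects the non-CV variants. A hedged usage sketch (Yellowbrick 0.x API, where poof() renders the figure; the data is made up):

import numpy as np
from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 10)
y_demo = X_demo[:, 0] + 0.1 * rng.randn(200)

viz = AlphaSelection(LassoCV())
viz.fit(X_demo, y_demo)
viz.poof()  # draws the error-vs-alpha curve with the chosen alpha marked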
def getModels():
    result = []
    result.append("LinearRegression")
    result.append("BayesianRidge")
    result.append("ARDRegression")
    result.append("ElasticNet")
    result.append("HuberRegressor")
    result.append("Lasso")
    result.append("LassoLars")
    result.append("Ridge")
    result.append("SGDRegressor")
    result.append("SVR")
    result.append("MLPClassifier")
    result.append("KNeighborsClassifier")
    result.append("SVC")
    result.append("GaussianProcessClassifier")
    result.append("DecisionTreeClassifier")
    result.append("RandomForestClassifier")
    result.append("AdaBoostClassifier")
    result.append("GaussianNB")
    result.append("LogisticRegression")
    result.append("QuadraticDiscriminantAnalysis")
    return result
def compute_bench(alpha, n_samples, n_features, precompute):
    lasso_results = []
    lars_lasso_results = []
    it = 0
    for ns in n_samples:
        for nf in n_features:
            it += 1
            print('==================')
            print('Iteration %d of %d' % (it,
                                          len(n_samples) * len(n_features)))
            print('==================')
            n_informative = nf // 10
            X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
                                          n_informative=n_informative,
                                          noise=0.1, coef=True)
            X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data
            gc.collect()
            print("- benchmarking Lasso")
            clf = Lasso(alpha=alpha, fit_intercept=False,
                        precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lasso_results.append(time() - tstart)
            gc.collect()
            print("- benchmarking LassoLars")
            clf = LassoLars(alpha=alpha, fit_intercept=False,
                            normalize=False, precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lars_lasso_results.append(time() - tstart)
    return lasso_results, lars_lasso_results
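An illustrative driver for compute_bench (the grids below are assumptions chosen to run quickly; the function relies on gc, time.time, numpy, and scikit-learn's Lasso, LassoLars, and make_regression being imported):

import gc
from time import time

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, LassoLars

lasso_times, lars_times = compute_bench(alpha=0.01,
                                        n_samples=[100, 500],
                                        n_features=[50, 100],
                                        precompute=True)
print('Lasso timings:     %s' % lasso_times)
print('LassoLars timings: %s' % lars_times)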
def test_lars_lstsq():
    # Test that Lars gives least square solution at the end
    # of the path
    X1 = 3 * diabetes.data  # use un-normalized dataset
    clf = linear_model.LassoLars(alpha=0.)
    clf.fit(X1, y)
    coef_lstsq = np.linalg.lstsq(X1, y)[0]
    assert_array_almost_equal(clf.coef_, coef_lstsq)
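With alpha=0 the L1 penalty vanishes, so the end of the LARS path must coincide with the ordinary least-squares solution. A standalone version of the same check on synthetic data (a sketch; the design and noise level are assumptions):

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = X_demo.dot([1., 2., 0., 0., -1.]) + 0.01 * rng.randn(100)

clf = linear_model.LassoLars(alpha=0., fit_intercept=False)
clf.fit(X_demo, y_demo)
coef_lstsq = np.linalg.lstsq(X_demo, y_demo, rcond=None)[0]
print(np.allclose(clf.coef_, coef_lstsq, atol=1e-6))  # expect True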
def test_lasso_lars_vs_lasso_cd(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results.
    X = 3 * diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso')
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
    for c, a in zip(lasso_path.T, alphas):
        if a == 0:
            continue
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
    # similar test, with the classifiers
    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
        clf1 = linear_model.LassoLars(alpha=alpha, normalize=False).fit(X, y)
        clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8,
                                  normalize=False).fit(X, y)
        err = linalg.norm(clf1.coef_ - clf2.coef_)
        assert_less(err, 1e-3)
    # same test, with normalized data
    X = diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso')
    lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True,
                                  tol=1e-8)
    for c, a in zip(lasso_path.T, alphas):
        if a == 0:
            continue
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
def test_lasso_lars_vs_lasso_cd_ill_conditioned2():
    # Create an ill-conditioned situation in which the LARS has to go
    # far in the path to converge, and check that LARS and coordinate
    # descent give the same answers.
    # Note it used to be the case that Lars had to use the drop-for-good
    # strategy for this, but this is no longer the case with the
    # equality_tolerance checks.
    X = [[1e20, 1e20, 0],
         [-1e-32, 0, 0],
         [1, 1, 1]]
    y = [10, 10, 1]
    alpha = .0001

    def objective_function(coef):
        return (1. / (2. * len(X)) * linalg.norm(y - np.dot(X, coef)) ** 2
                + alpha * linalg.norm(coef, 1))

    lars = linear_model.LassoLars(alpha=alpha, normalize=False)
    assert_warns(ConvergenceWarning, lars.fit, X, y)
    lars_coef_ = lars.coef_
    lars_obj = objective_function(lars_coef_)
    coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4, normalize=False)
    cd_coef_ = coord_descent.fit(X, y).coef_
    cd_obj = objective_function(cd_coef_)
    assert_less(lars_obj, cd_obj * (1. + 1e-8))
def getSKLearnModel(modelName):
    if modelName == 'LinearRegression':
        model = linear_model.LinearRegression()
    elif modelName == 'BayesianRidge':
        model = linear_model.BayesianRidge()
    elif modelName == 'ARDRegression':
        model = linear_model.ARDRegression()
    elif modelName == 'ElasticNet':
        model = linear_model.ElasticNet()
    elif modelName == 'HuberRegressor':
        model = linear_model.HuberRegressor()
    elif modelName == 'Lasso':
        model = linear_model.Lasso()
    elif modelName == 'LassoLars':
        model = linear_model.LassoLars()
    elif modelName == 'Ridge':
        model = linear_model.Ridge()
    elif modelName == 'SGDRegressor':
        model = linear_model.SGDRegressor()
    elif modelName == 'SVR':
        model = SVR()
    elif modelName == 'MLPClassifier':
        model = MLPClassifier()
    elif modelName == 'KNeighborsClassifier':
        model = KNeighborsClassifier()
    elif modelName == 'SVC':
        model = SVC()
    elif modelName == 'GaussianProcessClassifier':
        model = GaussianProcessClassifier()
    elif modelName == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier()
    elif modelName == 'RandomForestClassifier':
        model = RandomForestClassifier()
    elif modelName == 'AdaBoostClassifier':
        model = AdaBoostClassifier()
    elif modelName == 'GaussianNB':
        model = GaussianNB()
    elif modelName == 'LogisticRegression':
        model = linear_model.LogisticRegression()
    elif modelName == 'QuadraticDiscriminantAnalysis':
        model = QuadraticDiscriminantAnalysis()
    else:
        # Fail fast instead of raising a NameError on the return below.
        raise ValueError('unknown model name: %s' % modelName)
    return model
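A minimal usage sketch for the lookup (the random data is illustrative):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(100, 4)
y_train = rng.randn(100)

model = getSKLearnModel('LassoLars')   # unfitted estimator with defaults
model.fit(X_train, y_train)
print(model.coef_)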
def test_lasso_lars_vs_lasso_cd_positive(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when using the positive option.
    # This test is basically a copy of the above with the additional positive
    # option. However, for the middle part, the comparison of coefficient
    # values for a range of alphas, we had to make an adaptation. See below.
    # not normalized data
    X = 3 * diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                   positive=True)
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
    for c, a in zip(lasso_path.T, alphas):
        if a == 0:
            continue
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
    # The range of alphas chosen for coefficient comparison here is restricted
    # as compared with the above test without the positive option. This is due
    # to the circumstance that the Lars-Lasso algorithm does not converge to
    # the least-squares solution for small alphas, see 'Least Angle Regression'
    # by Efron et al. 2004. The coefficients are typically in congruence up to
    # the smallest alpha reached by the Lars-Lasso algorithm and start to
    # diverge thereafter. See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff
    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        clf1 = linear_model.LassoLars(fit_intercept=False, alpha=alpha,
                                      normalize=False, positive=True).fit(X, y)
        clf2 = linear_model.Lasso(fit_intercept=False, alpha=alpha, tol=1e-8,
                                  normalize=False, positive=True).fit(X, y)
        err = linalg.norm(clf1.coef_ - clf2.coef_)
        assert_less(err, 1e-3)
    # normalized data
    X = diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                   positive=True)
    lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True,
                                  tol=1e-8, positive=True)
    for c, a in zip(lasso_path.T[:-1], alphas[:-1]):  # don't include alpha=0
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
def get_model_list(task_name):
    model_list, name_list = [], []
    model_list.append(linear_model.LinearRegression())
    name_list.append('LR')
    model_list.append(linear_model.SGDRegressor())
    name_list.append('LR_SGD')
    model_list.append(linear_model.Lasso(alpha=1.0))
    name_list.append('Lasso')
    model_list.append(linear_model.Ridge(alpha=1.0))
    name_list.append('Ridge')
    model_list.append(linear_model.LassoLars(alpha=.1))
    name_list.append('LassoLars')
    model_list.append(linear_model.BayesianRidge())
    name_list.append('BayesianRidge')
    model_list.append(KernelRidge(alpha=1.0))
    name_list.append('KernelRidge')
    model_list.append(gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1))
    name_list.append('GaussianProcess')
    model_list.append(KNeighborsRegressor(weights='uniform', n_neighbors=3))
    name_list.append('KNN_unif')
    model_list.append(KNeighborsRegressor(weights='distance', n_neighbors=3))
    name_list.append('KNN_dist')
    model_list.append(SVR(kernel='linear', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_linear')
    model_list.append(SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_poly')
    model_list.append(SVR(kernel='rbf', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_rbf')
    model_list.append(DecisionTreeRegressor())
    name_list.append('DT')
    model_list.append(RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0))
    name_list.append('RF')
    model_list.append(ExtraTreesRegressor(n_estimators=100, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
    name_list.append('ET')
    return model_list, name_list
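One way to consume the paired lists (a sketch; the scoring metric and synthetic data are assumptions, and note that several entries, e.g. gaussian_process.GaussianProcess, require the older scikit-learn releases this snippet was written against):

from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_regression(n_samples=200, n_features=20,
                                 noise=0.1, random_state=0)
models, names = get_model_list('demo')
for model, name in zip(models, names):
    scores = cross_val_score(model, X_demo, y_demo, cv=3,
                             scoring='neg_mean_squared_error')
    print('%-15s mean CV MSE: %.4f' % (name, -scores.mean()))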