def __remodel__(self, model_type, regr, __X_train, __Y_train):
    """
    Function to retrain certain models based on optimal alphas and/or ratios
    """
    if model_type == "ridge":
        alpha = regr.alpha_
        regr = linear_model.RidgeCV(alphas=self.__realpha__(alpha), cv=10)
    elif model_type == "lasso":
        alpha = regr.alpha_
        regr = linear_model.LassoCV(alphas=self.__realpha__(alpha), max_iter=5000, cv=10)
    elif model_type == "elasticnet":
        alpha = regr.alpha_
        ratio = regr.l1_ratio_
        regr = linear_model.ElasticNetCV(l1_ratio=self.__reratio__(ratio),
                                         alphas=self.__elasticnet_init["alpha"],
                                         max_iter=1000, cv=3)
    regr.fit(__X_train, __Y_train)
    return regr
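The helpers `__realpha__` and `__reratio__` (and the `__elasticnet_init` dict) are defined elsewhere in the class and not shown here. A minimal sketch of what they plausibly do, assuming each builds a refined search grid around the previous best value; the bodies below are inferred, not taken from the source:

import numpy as np

def __realpha__(self, alpha):
    # Inferred helper: a denser alpha grid centered on the previous best alpha.
    return np.linspace(alpha * 0.5, alpha * 1.5, 21)

def __reratio__(self, ratio):
    # Inferred helper: a refined l1_ratio grid, clipped to the valid (0, 1] range.
    return np.clip(np.linspace(ratio - 0.1, ratio + 0.1, 11), 0.01, 1.0)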
Python class LassoCV() example source code
def run_lasso(X, y, max_iter=3000, cv=5, n_threads=1):
    """Run sklearn's LassoCV to select the regularization strength.
    Args:
        X (np.array): scaled feature matrix.
        y (pd.DataFrame): four-column response table.
        max_iter (int): maximum number of iterations.
        cv (int): number of CV folds.
        n_threads (int): number of threads to use for parallel computing.
    Returns:
        float: trained alpha value.
    """
    logger.info('Running LassoCV with {} iter. and {}-fold CV'.format(max_iter, cv))
    # generate logit-transformed response
    y_logit = logit((y.nMut + 0.5) / (y.length * y.N))
    # sub-sample X and y to 300,000 rows
    use_ix = np.random.choice(y_logit.shape[0], 300000, replace=False)
    Xsub = X[use_ix, :]
    ysub = y_logit[use_ix]
    reg = LassoCV(max_iter=max_iter, cv=cv, copy_X=False, n_jobs=n_threads)
    lassocv = reg.fit(Xsub, ysub)
    logger.info('LassoCV alpha = {}'.format(lassocv.alpha_))
    return lassocv.alpha_
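A hypothetical driver for run_lasso; the module-level logger and the imports are assumed from the original file, and the synthetic table mirrors the shapes the docstring describes (the fourth column is a placeholder, since the source never names it):

import logging
import numpy as np
import pandas as pd
from scipy.special import logit
from sklearn.linear_model import LassoCV

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 400k rows so the hard-coded 300,000-row sub-sample is valid.
n, p = 400000, 20
X = np.random.randn(n, p)
y = pd.DataFrame({'nMut': np.random.poisson(3, n),
                  'length': np.random.randint(100, 1000, n),
                  'N': 100,
                  'extra': 0})  # placeholder fourth column
best_alpha = run_lasso(X, y, max_iter=1000, cv=5, n_threads=2)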
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    # Split the training data into an additional train/test split
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                            0.3, 0.6, 1],
                    max_iter=50000, cv=10)
    # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
    #                         0.3, 0.6, 1], cv=10)
    lasso.fit(x_train_split, y_train_split)
    y_predicted = lasso.predict(x_test_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    # reference line where predicted == actual
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
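The method relies on a self.rmse helper that is not shown. A minimal sketch, assuming it is the usual root-mean-squared error (signature matched to the call above, body inferred):

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(self, y_predicted, y_actual):
    # Inferred helper: plain RMSE between predicted and actual sale prices.
    return np.sqrt(mean_squared_error(y_actual, y_predicted))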
def get_logistic_regression_coefs_l1(self, category,
                                     clf=LassoCV(alphas=[0.1, 0.001],
                                                 max_iter=10000,
                                                 n_jobs=-1)):
    '''Computes an l1-penalized regression score for a category
    (logistic regression approximated with LassoCV on a continuous
    encoding of the boolean labels).
    Parameters
    ----------
    category : str
        category name to score
    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    # sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection.
    from sklearn.model_selection import cross_val_predict
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    # X = TfidfTransformer().fit_transform(self._X)
    X = self._X
    y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    # Refit on the full data to obtain the final coefficient array.
    clf.fit(X, y_continuous)
    return clf.coef_, acc, baseline
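A hypothetical call site, assuming self is a scattertext TermDocMatrix and that 'democrat' is one of its categories (both the variable name and the category are illustrative, not taken from the source):

# term_doc_matrix: a previously built scattertext TermDocMatrix (assumed).
coefs, acc, baseline = term_doc_matrix.get_logistic_regression_coefs_l1('democrat')
print('accuracy {:.3f} vs. majority-class baseline {:.3f}'.format(acc, baseline))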
def test_get_errors_param(self):
    """
    Test known models we can get the cv errors for alpha selection
    """
    # Test original CV models
    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            model = AlphaSelection(model())
            X, y = make_regression()
            model.fit(X, y)
            errors = model._find_errors_param()
            self.assertTrue(len(errors) > 0)
        except YellowbrickValueError:
            self.fail("could not find errors on {}".format(model.name))
def lasso_train(X, y):
    model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X, y)
    print('lasso mean cv rmse is', rmse_cv(model_lasso, X, y).mean())
    return model_lasso
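rmse_cv is defined elsewhere in the original kernel; a minimal sketch, assuming the conventional cross-validated RMSE helper (body inferred):

import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y, cv=5):
    # neg_mean_squared_error is negated MSE, so flip the sign before the sqrt.
    mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(mse)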
#%%
def train_lasso_model(_train_x, train_y, _predict_x):
    print_title("Lasso Regressor")
    train_x, predict_x = \
        standarize_feature(_train_x, _predict_x)
    reg = linear_model.LassoCV(
        precompute=True, cv=5, verbose=1, n_jobs=4)
    reg.fit(train_x, train_y)
    print("alphas: %s" % reg.alphas_)
    print("mse path: %s" % np.mean(reg.mse_path_, axis=1))
    # locate the index of the winning alpha in the alpha grid
    itemindex = np.where(reg.alphas_ == reg.alpha_)
    print("itemindex: %s" % itemindex)
    _mse = np.mean(reg.mse_path_[itemindex[0], :])
    print("Best alpha using built-in LassoCV: %f (mse: %f)" %
          (reg.alpha_, _mse))
    alpha = reg.alpha_
    # refit a plain Lasso with the selected alpha
    reg = linear_model.Lasso(alpha=alpha)
    reg.fit(train_x, train_y)
    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zero coefs: %d" % n_nonzeros)
    predict_y = reg.predict(predict_x)
    train_y_pred = reg.predict(train_x)
    return {"y": predict_y, "train_y": train_y_pred, "coef": reg.coef_}
def test_real_model(self):
    """
    Test that model name works for sklearn estimators
    """
    model1 = LassoCV()
    model2 = LSHForest()  # LSHForest was removed in scikit-learn 0.21; this test dates from an older release
    model3 = KMeans()
    model4 = RandomForestClassifier()
    self.assertEqual(get_model_name(model1), 'LassoCV')
    self.assertEqual(get_model_name(model2), 'LSHForest')
    self.assertEqual(get_model_name(model3), 'KMeans')
    self.assertEqual(get_model_name(model4), 'RandomForestClassifier')
def test_regressor_cv(self):
    """
    Ensure only "CV" regressors are allowed
    """
    for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet):
        with self.assertRaises(YellowbrickTypeError):
            alphas = AlphaSelection(model())
    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            alphas = AlphaSelection(model())
        except YellowbrickTypeError:
            self.fail("could not instantiate RegressorCV on alpha selection")
def test_get_alphas_param(self):
    """
    Assert that we can get the alphas from ridge, lasso, and elasticnet
    """
    alphas = np.logspace(-10, -2, 100)
    # Test original CV models
    for model in (RidgeCV, LassoCV, ElasticNetCV):
        try:
            model = AlphaSelection(model(alphas=alphas))
            malphas = model._find_alphas_param()
            self.assertTrue(np.array_equal(alphas, malphas))
        except YellowbrickValueError:
            self.fail("could not find alphas on {}".format(model.name))
def Lasso_regression():
    lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                            0.3, 0.6, 1],
                    max_iter=50000, cv=10)
    lasso.fit(train_split, y)
    alpha = lasso.alpha_
    print("Best alpha:", alpha)
    print("Trying again for more precision with alphas centered around " + str(alpha))
    lasso = LassoCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
                            alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05,
                            alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35,
                            alpha * 1.4],
                    max_iter=50000, cv=10)
    lasso.fit(train_split, y)
    alpha = lasso.alpha_
    print("Best alpha:", alpha)
    print("Lasso RMSE on training set:", rmse_cv(lasso, train_split, y).mean())
    y_train_las = lasso.predict(train_split)
    # Plot residuals
    plt.scatter(y_train_las, y_train_las - y, c="blue", marker="s", label="Training data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions vs. actual values
    plt.scatter(y_train_las, y, c="blue", marker="s", label="Training data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    # Plot important coefficients
    coefs = pd.DataFrame(lasso.coef_, index=X_train.columns, columns=['value'])
    # print("Lasso picked " + str((coefs['value'] != 0).sum()) + " features and eliminated the other " +
    #       str((coefs['value'] == 0).sum()) + " features")
    # imp_coefs = pd.concat([coefs['value'].sort_values().head(10),
    #                        coefs['value'].sort_values().tail(10)])
    # imp_coefs.plot(kind="barh")
    # plt.title("Coefficients in the Lasso Model")
    # plt.show()
    return coefs, lasso
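A hypothetical driver; train_split, y, X_train, rmse_cv, plt, and pd are module-level names the function closes over in the original script (see the rmse_cv sketch earlier):

# Assumes train_split/X_train come from prior feature engineering and
# y is the (typically log-transformed) sale price.
coefs, lasso_model = Lasso_regression()
n_kept = int((coefs["value"] != 0).sum())
print("Lasso kept {} of {} features".format(n_kept, coefs.shape[0]))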