def __remodel__(self, model_type, regr, __X_train, __Y_train):
    """
    Function to retrain certain models based on optimal alphas and/or ratios
    """
    if model_type == "ridge":
        alpha = regr.alpha_
        regr = linear_model.RidgeCV(alphas=self.__realpha__(alpha), cv=10)
    elif model_type == "lasso":
        alpha = regr.alpha_
        regr = linear_model.LassoCV(alphas=self.__realpha__(alpha), max_iter=5000, cv=10)
    elif model_type == "elasticnet":
        alpha = regr.alpha_
        ratio = regr.l1_ratio_
        regr = linear_model.ElasticNetCV(l1_ratio=self.__reratio__(ratio),
                                         alphas=self.__elasticnet_init["alpha"],
                                         max_iter=1000, cv=3)
    regr.fit(__X_train, __Y_train)
    return regr
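The __realpha__ and __reratio__ helpers called above are not shown in this snippet. A minimal sketch of what they might look like, assuming they build a refined search grid around the previously found cross-validated optimum (the names, grid spacings, and clipping below are assumptions, not the original code):

# Hypothetical helpers assumed by __remodel__ above; they would live on
# the same class. Grids here are illustrative assumptions.
def __realpha__(self, alpha):
    # geometric grid spanning roughly a decade around the previous best alpha
    return [alpha * f for f in (0.1, 0.3, 0.5, 0.8, 1.0, 1.25, 2.0, 3.0, 10.0)]

def __reratio__(self, ratio):
    # linear grid around the previous best l1_ratio, clipped to (0, 1]
    return [min(max(ratio + d, 0.01), 1.0)
            for d in (-0.2, -0.1, -0.05, 0.0, 0.05, 0.1, 0.2)]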
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    # Split the training data into an extra train/test split
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                            0.3, 0.6, 1],
                    max_iter=50000, cv=10)
    # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
    #                         0.3, 0.6, 1], cv=10)
    lasso.fit(x_train_split, y_train_split)
    y_predicted = lasso.predict(X=x_test_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
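The rmse method used above is not defined in this snippet; a minimal sketch, assuming it is the usual root-mean-squared-error helper on the same class:

import numpy as np

# Assumed implementation of the rmse helper referenced above (a method of
# the same class as predicted_vs_actual_sale_price).
def rmse(self, y_pred, y_true):
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))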
def train_ridge_linear_model(_train_x, train_y, _predict_x,
                             sample_weight=None):
    print_title("Ridge Regressor")
    train_x, predict_x = standarize_feature(_train_x, _predict_x)

    # Use RidgeCV's default (efficient leave-one-out) cross-validation,
    # which is required when store_cv_values=True
    alphas = [0.1, 1, 10, 100, 1e3, 1e4, 2e4, 5e4, 8e4, 1e5, 1e6, 1e7, 1e8]
    reg = linear_model.RidgeCV(alphas=alphas, store_cv_values=True)
    # reg.fit(train_x, train_y, sample_weight=sample_weight)
    reg.fit(train_x, train_y)
    cv_mse = np.mean(reg.cv_values_, axis=0)
    print("alphas: %s" % alphas)
    print("CV MSE: %s" % cv_mse)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)

    # Refit a plain Ridge model with the best alpha and generate predictions
    alpha = reg.alpha_
    reg = linear_model.Ridge(alpha=alpha)
    # reg.fit(train_x, train_y, sample_weight=sample_weight)
    reg.fit(train_x, train_y)
    predict_y = reg.predict(predict_x)
    train_y_pred = reg.predict(train_x)
    return {"y": predict_y, "train_y": train_y_pred, "coef": reg.coef_}
def test_get_errors_param(self):
    """
    Test known models we can get the cv errors for alpha selection
    """
    # Test original CV models
    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            model = AlphaSelection(model())
            X, y = make_regression()
            model.fit(X, y)
            errors = model._find_errors_param()
            self.assertTrue(len(errors) > 0)
        except YellowbrickValueError:
            self.fail("could not find errors on {}".format(model.name))
def test_clusterer_enforcement(self):
    """
    Assert that only clustering estimators can be passed to cluster viz
    """
    nomodels = [
        SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
    ]
    for nomodel in nomodels:
        with self.assertRaises(YellowbrickTypeError):
            visualizer = ClusteringScoreVisualizer(nomodel())

    models = [
        KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
    ]
    for model in models:
        try:
            visualizer = ClusteringScoreVisualizer(model())
        except YellowbrickTypeError:
            self.fail("could not pass clustering estimator to visualizer")
def build_signature_model(X, gidx, n_alphas=5):
    # Regress the full matrix onto the rows selected by gidx with a
    # cross-validated ridge model. Note: n_alphas is unused here; the
    # alpha grid is fixed.
    model = RidgeCV(alphas=(.1, 1, 10, 100, 1000, 10000, 100000), cv=5)
    model.fit(X[gidx].T, X.T)
    return model
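From the fit call, X's rows are features and its columns are samples, with the rows indexed by gidx serving as the predictive signature. A usage sketch under those assumed shapes (the data here is synthetic):

import numpy as np

# Usage sketch (shapes are assumptions inferred from the fit call above).
X = np.random.rand(50, 200)         # 50 features x 200 samples
gidx = np.arange(5)                 # first five rows form the signature
model = build_signature_model(X, gidx)
X_hat = model.predict(X[gidx].T).T  # reconstruction of all 50 rows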
def test_regressor_cv(self):
    """
    Ensure only "CV" regressors are allowed
    """
    for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet):
        with self.assertRaises(YellowbrickTypeError):
            alphas = AlphaSelection(model())

    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            alphas = AlphaSelection(model())
        except YellowbrickTypeError:
            self.fail("could not instantiate RegressorCV on alpha selection")
def test_store_cv_values(self):
    """
    Assert that store_cv_values is true on RidgeCV
    """
    model = AlphaSelection(RidgeCV())
    self.assertTrue(model.estimator.store_cv_values)

    model = AlphaSelection(RidgeCV(store_cv_values=True))
    self.assertTrue(model.estimator.store_cv_values)

    model = AlphaSelection(RidgeCV(store_cv_values=False))
    self.assertTrue(model.estimator.store_cv_values)
def test_get_alphas_param(self):
    """
    Assert that we can get the alphas from ridge, lasso, and elasticnet
    """
    alphas = np.logspace(-10, -2, 100)

    # Test original CV models
    for model in (RidgeCV, LassoCV, ElasticNetCV):
        try:
            model = AlphaSelection(model(alphas=alphas))
            malphas = model._find_alphas_param()
            self.assertTrue(np.array_equal(alphas, malphas))
        except YellowbrickValueError:
            self.fail("could not find alphas on {}".format(model.name))
def fit_thresholds(self, data, alpha, batch_size=128, verbose=0,
                   validation_data=None, cv=None, top_k=None):
    inputs = np.hstack([data[k] for k in self._graph_inputs])
    probs = self.predict(data, batch_size=batch_size)
    targets = {k: data[k] for k in self._graph_outputs}

    if isinstance(alpha, list):
        if validation_data is None and cv is None:
            warnings.warn("Neither validation data, nor the number of "
                          "cross-validation folds is provided. "
                          "The alpha parameter for threshold model will "
                          "be selected based on the default "
                          "cross-validation procedure in RidgeCV.")
        elif validation_data is not None:
            val_inputs = np.hstack([validation_data[k]
                                    for k in self._graph_inputs])
            val_probs = self.predict(validation_data)
            val_targets = {k: validation_data[k]
                           for k in self._graph_outputs}

    if verbose:
        sys.stdout.write("Constructing thresholds.")
        sys.stdout.flush()

    self.t_models = {}
    for k in self._graph_outputs:
        if verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        T = self._construct_thresholds(probs[k], targets[k])
        if isinstance(alpha, list):
            if validation_data is not None:
                # Select alpha by scoring plain Ridge fits on validation data
                val_T = self._construct_thresholds(val_probs[k],
                                                   val_targets[k],
                                                   top_k=top_k)
                score_best, alpha_best = -np.Inf, None
                for a in alpha:
                    model = lm.Ridge(alpha=a).fit(inputs, T)
                    score = model.score(val_inputs, val_T)
                    if score > score_best:
                        score_best, alpha_best = score, a
                alpha = alpha_best
            else:
                # Otherwise let RidgeCV pick alpha via cross-validation
                model = lm.RidgeCV(alphas=alpha, cv=cv).fit(inputs, T)
                alpha = model.alpha_
        self.t_models[k] = lm.Ridge(alpha=alpha)
        self.t_models[k].fit(inputs, T)

    if verbose:
        sys.stdout.write("Done.\n")
        sys.stdout.flush()
def ridge_regression():
    # Coarse search for the best alpha
    ridge = RidgeCV(alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
    ridge.fit(X_train, y_train)
    alpha = ridge.alpha_
    print("Best alpha:", alpha)

    # Refine the search with alphas centered around the coarse optimum
    print("Try again for more precision with alphas centered around " + str(alpha))
    ridge = RidgeCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
                            alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                            alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],
                    cv=10)
    ridge.fit(X_train, y_train)
    alpha = ridge.alpha_
    print("Best alpha:", alpha)

    print("Ridge RMSE on Training set:", rmse_cv(ridge, X_train, y_train).mean())
    print("Ridge RMSE on Test set:", rmse_cv(ridge, X_test, y_test).mean())
    y_train_rdg = ridge.predict(X_train)
    y_test_rdg = ridge.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_rdg, y_train_rdg - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_rdg, y_test_rdg - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear regression with Ridge regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_rdg, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_rdg, y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear regression with Ridge regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    # Plot important coefficients. Note: unlike lasso, ridge rarely drives
    # coefficients exactly to zero, so the "eliminated" count is usually 0.
    coefs = pd.Series(ridge.coef_, index=X_train.columns)
    print("Ridge picked " + str(sum(coefs != 0)) + " features and eliminated the other " +
          str(sum(coefs == 0)) + " features")
    imp_coefs = pd.concat([coefs.sort_values().head(10),
                           coefs.sort_values().tail(10)])
    imp_coefs.plot(kind="barh")
    plt.title("Coefficients in the Ridge Model")
    plt.show()
    return ridge
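The rmse_cv helper above is defined elsewhere in the original project; a common implementation consistent with how it is called (model, X, y, returning an array of per-fold RMSEs) would be:

import numpy as np
from sklearn.model_selection import cross_val_score

# Assumed rmse_cv implementation: cross-validated RMSE derived from
# scikit-learn's negated mean-squared-error scorer.
def rmse_cv(model, X, y):
    return np.sqrt(-cross_val_score(model, X, y,
                                    scoring="neg_mean_squared_error", cv=5))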
def online(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    # modern sklearn API; originally: skf = list(StratifiedKFold(y, n_folds))
    skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
    ]

    print("Creating train and test sets for blending.")
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    # Build out-of-fold predictions per base classifier for stacking
    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print("Blending.")
    # clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced', n_jobs=-1)
    # note: np.linspace(0, 200) includes alpha=0, which newer scikit-learn
    # versions reject; a strictly positive grid is safer.
    clf = linear_model.RidgeCV(alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    # clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    # y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
    print(clf.coef_, clf.intercept_)
    y_submission = clf.predict(dataset_blend_test)  # for RidgeCV

    print("Linear stretch of predictions to [0, 1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    print("blend result")
    save_submission(os.path.join(consts.SUBMISSION_PATH,
                                 MODEL_NAME + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                    test_uid, y_submission)