def lasso_regression_model(parameter_array):
alpha_value = parameter_array[0] # the alpha value is the first element of the parameter array
return linear_model.Lasso(alpha=alpha_value, fit_intercept=True, normalize=True, precompute=False, copy_X=True,
max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')
#Returns the Lasso regression model
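A minimal usage sketch for the factory above, assuming sklearn's linear_model is imported as in the snippet and using synthetic data; note that the normalize keyword used in the snippet exists only in scikit-learn releases before 1.2.
# Usage sketch (assumptions: linear_model is imported from sklearn; the data is synthetic)
import numpy as np
from sklearn import linear_model

X = np.random.rand(50, 3)
y = X @ np.array([1.5, -2.0, 0.5]) + 0.1 * np.random.randn(50)

model = lasso_regression_model([0.01])  # parameter_array[0] carries the alpha value
model.fit(X, y)
print(model.coef_)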
Example source code using Python's Lasso() class
def model_fit_and_test(TrainX,TrainY,TestX,TestY):
def build_model(model_name):
model = model_name()
return model
#for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR,RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
for model_name in [LinearRegression, ElasticNet]:
model = build_model(model_name)
model.fit(TrainX,TrainY)
print(model_name)
resid = model.predict(TestX) - TestY
#print resid
print("Residual sum of squares: %f"% np.mean(resid ** 2))
#print model.predict(TestX)
#print TestY
# Explained variance score: 1 is perfect prediction
plt.scatter(model.predict(TestX), resid);
plt.axhline(0, color='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
#plt.xlim([1, 50])
plt.show()
print('Variance score: %.2f' % model.score(TestX, TestY))
from statsmodels.stats.stattools import jarque_bera
_, pvalue, _, _ = jarque_bera(resid)
print ("Test Residuals Normal", pvalue)
from statsmodels import regression, stats
import statsmodels.api as sms
import statsmodels.stats.diagnostic as smd
# xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
xs_with_constant = sms.add_constant(TestX)
_, pvalue1, _, _ = stats.diagnostic.het_breuschpagan(resid, xs_with_constant)
print("Breusch-Pagan heteroskedasticity test p-value:", pvalue1)
ljung_box = smd.acorr_ljungbox(resid, lags=10)
#print "Lagrange Multiplier Statistics:", ljung_box[0]
print "Test Autocorrelation P-values:", ljung_box[1]
if any(ljung_box[1] < 0.05):
print "The residuals are autocorrelated."
else:
print "The residuals are not autocorrelated."
def main(dataset_size, test_proportion):
diabetes = load_diabetes()
X = diabetes.data[:dataset_size]
y = diabetes.target[:dataset_size]
fig, ax_list = plt.subplots(3, 1, figsize=(8, 6))
plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=Ridge, ax=ax_list[0])
plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=Lasso, ax=ax_list[1])
plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=LinearRegression, ax=ax_list[2])
plt.tight_layout()
plt.show()
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
start = time.time()
best_forecast = forecasts[:, best_model_index]
forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
forecasts = np.where(forecasts <=0, 0.1, forecasts)
data_train = []
for i in range(forecasts.shape[0]):
f_row = forecasts[i,]
min_diff_to_best = np.min([cal_rmsle(best_forecast[i], f) for f in f_row])
comb = list(itertools.combinations(f_row,2))
avg_error = scipy.stats.hmean([cal_rmsle(x,y) for (x,y) in comb])
data_train.append([min_diff_to_best, avg_error, scipy.stats.hmean(f_row), np.median(f_row), np.std(f_row)])
X_all = np.column_stack([np.row_stack(data_train), best_forecast])
if conf.target_as_log:
y_actual = transfrom_to_log(y_actual)
#use 25% of the data to train the ensemble and the remainder for evaluation
no_of_training_instances = int(round(len(y_actual)*0.25))
X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
y_actual_test = y_actual[no_of_training_instances:]
lr_model =linear_model.Lasso(alpha = 0.2)
lr_model.fit(X_train, y_train)
lr_forecast = lr_model.predict(X_test)
lr_forcast_revered = retransfrom_from_log(lr_forecast)
calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
print_time_took(start, "vote_with_lr")
return lr_forcast_revered
def get_models4ensamble(conf):
models = []
#models = [RFRModel(conf), DLModel(conf), LRModel(conf)]
#models = [LRModel(conf)]
# see http://scikit-learn.org/stable/modules/linear_model.html
#0 was too big to run with depth set to 1, and 1 was overfitting a bit
if conf.command == 1:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":3, "eta":0.1, "min_child_weight":5,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
else:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
#xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
# "subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
models = [
#DLModel(conf),
#LRModel(conf, model=linear_model.BayesianRidge()),
#LRModel(conf, model=linear_model.LassoLars(alpha=.1)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.1)),
#LRModel(conf, model=Pipeline([('poly', PolynomialFeatures(degree=3)),
# ('linear', LinearRegression(fit_intercept=False))])),
#LRModel(conf, model=linear_model.Ridge(alpha=.5)),
XGBoostModel(conf, xgb_params, use_cv=True),
LRModel(conf, model=linear_model.Lasso(alpha = 0.3)),
RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.2)),
ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
#AdaBoostRModel(conf, model=AdaBoostRegressor(loss='square'))
]
return models
#return [XGBoostModel(conf, xgb_params, use_cv=True)]
Source file: advanced_supvervised_model_trainer.py (project: healthcareai-py, author: HealthCatalyst)
def lasso_regression(self, scoring_metric='neg_mean_squared_error',
hyperparameter_grid=None,
randomized_search=True,
number_iteration_samples=2):
"""
A light wrapper for Sklearn's lasso regression that performs randomized search over an overridable default
hyperparameter grid.
Args:
scoring_metric (str): Any sklearn scoring metric appropriate for regression
hyperparameter_grid (dict): hyperparameters by name
randomized_search (bool): True for randomized search (default)
number_iteration_samples (int): Number of models to train during the randomized search for exploring the
hyperparameter space. More may lead to a better model, but will take longer.
Returns:
TrainedSupervisedModel:
"""
self.validate_regression('Lasso Regression')
if hyperparameter_grid is None:
hyperparameter_grid = {"fit_intercept": [True, False]}
number_iteration_samples = 2
algorithm = get_algorithm(Lasso,
scoring_metric,
hyperparameter_grid,
randomized_search,
number_iteration_samples=number_iteration_samples)
trained_supervised_model = self._create_trained_supervised_model(algorithm)
return trained_supervised_model
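A hedged usage sketch for the wrapper above; `trainer` is assumed to be an already-configured healthcareai-py trainer object exposing this method, and the alpha grid is an illustrative override rather than a documented default.
# Sketch only: `trainer` is an assumed, pre-built trainer instance exposing lasso_regression.
trained_lasso = trainer.lasso_regression(
    scoring_metric='neg_mean_squared_error',
    hyperparameter_grid={'alpha': [0.01, 0.1, 1.0], 'fit_intercept': [True, False]},
    randomized_search=True,
    number_iteration_samples=4)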
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
est = [ElasticNet(copy_X=False),
Lasso(copy_X=False)]
ens.add(est)
ens.add(KNeighborsRegressor())
return ens
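A minimal fit/predict sketch for the ensemble builder above, assuming an ML-Ensemble style SuperLearner and synthetic data.
# Sketch: assumes mlens-style SuperLearner behavior; data is synthetic.
import numpy as np

X = np.random.rand(100, 5)
y = np.random.rand(100)

ens = build_ensemble(folds=3)  # `folds` is assumed to be an accepted SuperLearner kwarg
ens.fit(X, y)
print(ens.predict(X)[:5])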
def lasso():
"""Fit Lasso."""
print("Fitting LAS...", end=" ", flush=True)
time.sleep(SLEEP)
t0 = time.time()
ls = Lasso()
ls.fit(X, y)
print_time(t0, "Done", end="")
def elasticnet():
"""Fit Elastic Net."""
print("Fitting ELN...", end=" ", flush=True)
time.sleep(SLEEP)
t0 = time.time()
ls = ElasticNet()
ls.fit(X, y)
print_time(t0, "Done", end="")
def build_ensemble(kls, **kwargs):
"""Generate ensemble of class kls."""
ens = kls(**kwargs)
ens.add([SVR(), RandomForestRegressor(),
GradientBoostingRegressor(), Lasso(copy_X=False),
MLPRegressor(shuffle=False, alpha=0.001)])
ens.add_meta(Lasso(copy_X=False))
return ens
def spot_check(X, y):
if type == 'regression':
models = [
(LinearRegression(), 'Ordinary Least Squares'),
(Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
(Ridge(), 'Ridge (alpha 1.0)'),
(Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
(Lasso(), 'Lasso (alpha 1.0)'),
(ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
(ElasticNet(), 'ElasticNet (alpha 1.0)'),
(DecisionTreeRegressor(), 'Decision Tree'),
(KNeighborsRegressor(), 'K-Nearest Neighbors'),
# (RandomForestRegressor(), 'Random Forest Regressor'),
# (BaggingRegressor(), 'Bagging Regressor'),
# (GradientBoostingRegressor(), 'Gradient Boosted Regression'),
# (SVR(), 'Support Vector Regression')
]
splits = 5
scores = []
for model, model_name in models:
score = check_model(model, splits, X, y)
# get average score
scores.append(score)
model_names = map(lambda x: x[1], models)
for name, score in zip(model_names, scores):
print('%s: %f' % (name, score))
def get_classifier(self, X, Y):
""" ??Lasso??
:param X: ????
:param Y: ??????
:return: ??
"""
clf = Lasso()
clf.fit(X, Y)
return clf
def train_lasso_model(_train_x, train_y, _predict_x):
print_title("Lasso Regressor")
train_x, predict_x = \
standarize_feature(_train_x, _predict_x)
reg = linear_model.LassoCV(
precompute=True, cv=5, verbose=1, n_jobs=4)
reg.fit(train_x, train_y)
print("alphas: %s" % reg.alphas_)
print("mse path: %s" % np.mean(reg.mse_path_, axis=1))
itemindex = np.where(reg.alphas_ == reg.alpha_)
print("itemindex: %s" % itemindex)
_mse = np.mean(reg.mse_path_[itemindex[0], :])
print("Best alpha using bulit-in LassoCV: %f(mse: %f)" %
(reg.alpha_, _mse))
alpha = reg.alpha_
reg = linear_model.Lasso(alpha=alpha)
reg.fit(train_x, train_y)
n_nonzeros = (reg.coef_ != 0).sum()
print("Non-zeros coef: %d" % n_nonzeros)
predict_y = reg.predict(predict_x)
train_y_pred = reg.predict(train_x)
return {"y": predict_y, "train_y": train_y_pred, "coef": reg.coef_}
def lasso(train, test, label, alpha=0.00099, max_iteration=50000):
lasso = Lasso(alpha=alpha, max_iter=max_iteration)
lasso.fit(train, label)
#prediction on the training data
y_prediction = lasso.predict(train)
y_train = label
print("Lasso score on training set: ", rmse(y_train, y_prediction))
y_prediction = lasso.predict(test)
y_prediction = np.exp(y_prediction)
return y_prediction
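Because the test predictions are passed through np.exp before being returned, the label is assumed to be on a log scale; a hedged sketch of that convention, where train_df, test_df, feature_cols and 'price' are hypothetical names.
# Sketch: hypothetical frames train_df/test_df and column names; the target is
# log-transformed to match the np.exp back-transform inside lasso() above.
import numpy as np

log_label = np.log(train_df['price'])
test_prediction = lasso(train_df[feature_cols], test_df[feature_cols], log_label,
                        alpha=0.00099, max_iteration=50000)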
def test_regressor_cv(self):
"""
Ensure only "CV" regressors are allowed
"""
for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet):
with self.assertRaises(YellowbrickTypeError):
alphas = AlphaSelection(model())
for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
try:
alphas = AlphaSelection(model())
except YellowbrickTypeError:
self.fail("could not instantiate RegressorCV on alpha selection")
def run():
data = load_binary()
# Extract features
user_feat_matrix = process_level2(data) # X
del user_feat_matrix['X']['user_id']
X = user_feat_matrix['X'].values
X[np.isnan(X)] = 0
Y = user_feat_matrix['Y']
Y.fillna(0, inplace=True)
del user_feat_matrix['X_all']['user_id']
X_all = user_feat_matrix['X_all'].values
X_all[np.isnan(X_all)] = 0
cols = list(Y.columns.values)
symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
with open("result.txt", 'w') as f:
f.write("user_id,day_in_cycle,symptom,probability\n")
for symptom in symptoms:
print(symptom)
pipeline = Pipeline([
('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
#('standard_scale', StandardScaler()),
('estimator', Lasso()),
])
param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
model = GridSearchCV(pipeline, param_grid = param_grid, n_jobs = 4,
verbose=2)
s_Y = Y[symptom]
model.fit(X, s_Y.values)
print("dumping...")
data_dir = 'data'
cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
c_length = {k:v for k,v in zip(cycles0.user_id.values, cycles0.expected_cycle_length)}
dump(symptom, model, X_all, c_length, data['users'].user_id)
def estimate(self, a, y, initial_x=None):
"""
:param a: MxN matrix A in the y=Ax equation
:type a: numpy.ndarray
:param y: M vector y in the y=Ax equation
:type y: numpy.ndarray
:param initial_x: N vector of an initial solution
:type initial_x: numpy.ndarray
:return: best estimation of the N vector x in the y=Ax equation
:rtype: numpy.ndarray
:Example:
>>> import numpy as np
>>> import linvpy as lp
>>> a = np.matrix([[1, 2], [3, 4], [5, 6]])
>>> y = np.array([1, 2, 3])
>>> m = lp.MEstimator()
>>> m.estimate(a,y)
array([ -2.95552481e-16, 5.00000000e-01])
>>> m_ = lp.MEstimator(loss_function=lp.Bisquare, clipping=2.23, \
regularization=lp.Lasso(), lamb=3)
>>> initial_solution = np.array([1, 2])
>>> m_.estimate(a, y, initial_x=initial_solution)
array([ 0., 0.])
"""
return self.irls(a, y, initial_x)
def build_model(train_file, attr_file, model_out, algorithm='ridge'):
classifiers = ['ridge', 'linear', 'lasso', 'rf', 'en']
if algorithm not in classifiers:
raise NotImplementedError("only implemented algorithms: " + str(classifiers))
train_data = pd.read_pickle(train_file)
attrs = read_attrs(attr_file)
target_attr = attrs[0]
usable_attrs = attrs[1:]
if algorithm == 'ridge':
clf = Ridge()
elif algorithm == 'linear':
clf = LinearRegression()
elif algorithm == 'lasso':
clf = Lasso()
elif algorithm == 'en':
clf = ElasticNet()
else:
clf = RandomForestRegressor()
logger.debug("Modeling '%s'", target_attr)
logger.debug(" train set (%d): %s", len(train_data), train_file)
logger.debug(" Algorithm: %s", algorithm)
clf.fit(train_data[usable_attrs], train_data[target_attr])
if hasattr(clf, 'coef_'):
logger.debug('Coefficients:')
for i, c in enumerate(clf.coef_):
logger.debug('  %-20s: %20.4f', usable_attrs[i], c)
pickle.dump(clf, open(model_out, 'wb'))
def gs_Lasso( xM, yV, alphas_log = (-1, 1, 9), n_splits=5, n_jobs = -1):
print(xM.shape, yV.shape)
clf = linear_model.Lasso()
#parmas = {'alpha': np.logspace(1, -1, 9)}
parmas = {'alpha': np.logspace( *alphas_log)}
kf5_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
#kf5 = kf5_c.split( xM)
gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = n_jobs)
gs.fit( xM, yV)
return gs
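A short hedged usage sketch for gs_Lasso above; the data is synthetic, and the alphas_log tuple is unpacked straight into np.logspace.
# Sketch: synthetic data; alphas_log=(-3, 1, 9) expands to np.logspace(-3, 1, 9).
import numpy as np

xM = np.random.rand(120, 8)
yV = np.random.rand(120)

gs = gs_Lasso(xM, yV, alphas_log=(-3, 1, 9), n_splits=5, n_jobs=-1)
print(gs.best_params_, gs.best_score_)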
def gs_Lasso_norm( xM, yV, alphas_log = (-1, 1, 9)):
print(xM.shape, yV.shape)
clf = linear_model.Lasso( normalize = True)
#parmas = {'alpha': np.logspace(1, -1, 9)}
parmas = {'alpha': np.logspace( *alphas_log)}
kf5_c = model_selection.KFold( n_splits = 5, shuffle=True)
#kf5 = kf5_c.split( xM)
gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = -1)
gs.fit( xM, yV)
return gs