def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
boston.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
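A minimal standalone sketch of the same bagged-SVR idea, on synthetic data rather than the Boston set used above (dataset and parameter choices here are illustrative only):
# Hedged sketch: bagging an SVR base estimator on synthetic regression data.
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
bag = BaggingRegressor(SVR(), n_estimators=10, max_samples=0.5, random_state=0)
print(bag.fit(X_tr, y_tr).score(X_te, y_te))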
Python SVR class: example source code
def predict_price(dates, prices, x):
dates = np.reshape(dates,(len(dates), 1)) # converting to matrix of n X 1
svr_rbf = SVR(kernel= 'rbf', C= 1e3, gamma= 0.1) # defining the support vector regression models
svr_lin = SVR(kernel= 'linear', C= 1e3)
svr_poly = SVR(kernel= 'poly', C= 1e3, degree= 2)
svr_rbf.fit(dates, prices) # fitting the data points in the models
svr_lin.fit(dates, prices)
svr_poly.fit(dates, prices)
plt.scatter(dates, prices, color= 'black', label= 'Data') # plotting the initial datapoints
plt.plot(dates, svr_rbf.predict(dates), color= 'red', label= 'RBF model') # plotting the line made by the RBF kernel
plt.plot(dates,svr_lin.predict(dates), color= 'green', label= 'Linear model') # plotting the line made by linear kernel
plt.plot(dates,svr_poly.predict(dates), color= 'blue', label= 'Polynomial model') # plotting the line made by polynomial kernel
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Support Vector Regression')
plt.legend()
plt.show()
    x = np.reshape(np.asarray(x), (-1, 1))  # predict() expects a 2-D array
    return svr_rbf.predict(x)[0], svr_lin.predict(x)[0], svr_poly.predict(x)[0]
def svrtrainsk(x, y, cost=1.0, epsilon=0.1):
model = SVR(C=cost, epsilon=epsilon)
model.fit(x, y)
return model
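A hedged usage sketch for svrtrainsk on toy 1-D data (assumes the SVR import used by the function is already in scope):
import numpy as np

x_toy = np.linspace(0, 10, 50).reshape(-1, 1)   # 50 samples, 1 feature
y_toy = np.sin(x_toy).ravel()
model = svrtrainsk(x_toy, y_toy, cost=10.0, epsilon=0.05)
print(model.predict([[2.5]]))                    # predict at a single point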
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
prep = {'Standard Scaling': [StandardScaler()],
'Min Max Scaling': [MinMaxScaler()],
'No Preprocessing': []}
est = {'Standard Scaling':
[ElasticNet(), Lasso(), KNeighborsRegressor()],
'Min Max Scaling':
[SVR()],
'No Preprocessing':
[RandomForestRegressor(random_state=SEED),
GradientBoostingRegressor()]}
ens.add(est, prep)
ens.add(GradientBoostingRegressor(), meta=True)
return ens
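A hedged sketch of how build_ensemble might be called; it assumes the mlens SuperLearner keyword arguments (folds, scorer, random_state) and that SEED and the estimator imports are defined at module level as in the original project:
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
ens = build_ensemble(folds=5, scorer=mean_squared_error, random_state=0)
ens.fit(X[:200], y[:200])
print(mean_squared_error(y[200:], ens.predict(X[200:])))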
def sample_pipelines(pca_kernels=None, svr_kernels=None):
"""
Pipelines that can't be fit in a reasonable amount of time on the whole
dataset
"""
# Model instances
model_steps = []
if pca_kernels is None:
pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
for pca_kernel in pca_kernels:
model_steps.append([
KernelPCA(n_components=2, kernel=pca_kernel),
LinearRegression(),
])
if svr_kernels is None:
svr_kernels = ['poly', 'rbf', 'sigmoid']
for svr_kernel in svr_kernels:
model_steps.append(SVR(kernel=svr_kernel, verbose=True, cache_size=1000))
# Pipelines
pipelines = []
for m in model_steps:
# Steps
common_steps = [
StandardScaler(),
]
        m_steps = m if isinstance(m, list) else [m]  # avoid rebinding the outer model_steps list
        steps = common_steps + m_steps
pipelines.append(make_pipeline(*steps))
return pipelines
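Since the docstring warns these pipelines are too slow for the full dataset, here is a hedged sketch of fitting them on a small synthetic slice (assumes the sklearn imports used by sample_pipelines are in scope):
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=8, noise=0.5, random_state=0)
for pipe in sample_pipelines(pca_kernels=['rbf'], svr_kernels=['rbf']):
    pipe.fit(X[:150], y[:150])
    print(pipe.steps[-1][0], pipe.score(X[150:], y[150:]))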
def model_cross_valid(X,Y):
seed = 7
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # random_state only takes effect with shuffle=True
    def build_model(model_name):
model = model_name()
return model
scoring = 'neg_mean_squared_error'
    # TODO: also try random forest, boosting, LSTM, and GBDT models
for model_name in [LinearRegression,ElasticNet]:
#for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(model_name,results.mean())
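A hedged sketch of calling model_cross_valid (assumes the model classes and model_selection import used above are in scope):
from sklearn.datasets import make_regression

X, Y = make_regression(n_samples=100, n_features=5, noise=1.0, random_state=7)
model_cross_valid(X, Y)   # prints the mean negative MSE for each model class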
def train_support_vector_regression():
# Picking model
return mp.ModelProperties(regression=True), svm.SVR()
# http://xgboost.readthedocs.io/en/latest/python/python_api.html
def test_support_vector_regressor(self):
for dtype in self.number_data_type.keys():
scikit_model = SVR(kernel='rbf')
data = self.scikit_data['data'].astype(dtype)
target = self.scikit_data['target'].astype(dtype)
scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
test_data = data[0].reshape(1, -1)
coreml_model = create_model(spec)
try:
self.assertEqual(scikit_model.predict(test_data)[0],
coreml_model.predict({'data': test_data})['target'],
msg="{} != {} for Dtype: {}".format(
scikit_model.predict(test_data)[0],
coreml_model.predict({'data': test_data})['target'],
dtype
)
)
except RuntimeError:
print("{} not supported. ".format(dtype))
SupportVectorRegression.py (project: job-salary-prediction, author: soton-data-mining)
def predict(self):
        svr_rbf = SVM.SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(self.x_train, self.y_train)  # fit once, then predict on both splits
        train_result = svr_rbf.predict(self.x_train)
        test_result = svr_rbf.predict(self.x_test)
BaseModel.export_prediction(test_result, 'SVR_RBF_C1e3_Gamma01')
return (train_result, test_result)
def test_pred_error(self):
"""
Assert no errors occur during Prediction Error Plots integration
"""
model = SVR()
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.5, random_state=42)
model.fit(X_train, y_train)
visualizer = PredictionError(model)
visualizer.score(X_test, y_test)
visualizer.poof()
visualizer.ax.grid(False)
self.assert_images_similar(visualizer)
##########################################################################
## Residuals Plots test case
##########################################################################
def test_clusterer_enforcement(self):
"""
Assert that only clustering estimators can be passed to cluster viz
"""
nomodels = [
SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
]
for nomodel in nomodels:
with self.assertRaises(YellowbrickTypeError):
visualizer = ClusteringScoreVisualizer(nomodel())
models = [
KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
]
for model in models:
try:
visualizer = ClusteringScoreVisualizer(model())
except YellowbrickTypeError:
self.fail("could not pass clustering estimator to visualizer")
def __init__(self, num_features, training_window, training_interval):
"""
num_features: the length of the feature vector
training_window: the number of previous data points to train on
training_interval: the number of data points between training periods
"""
self.num_features = num_features
self.training_interval = training_interval
self.training_window = training_window
# Init sample matrix, a deque of feature vectors
self.samples = deque(maxlen=training_window)
self.targets = deque(maxlen=training_window)
#self.model = SVR(kernel='rbf', C=1000)
self.model = BayesianRidge()
self.severity = blr.Severity()
self.alpha = 1.0
self.parameters = 0 # Training parameters
self.train_count = 0
self.have_trained = False
self.pred_range = [0.0, np.inf] # upper and lower bounds for predictions
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
"""
    Cross validation with SVR (svr_params are passed to sklearn.svm.SVR);
    prediction output is generated for all input molecules.
"""
print(xM.shape, yV.shape)
clf = svm.SVR( **svr_params)
kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split( xM)  # use the KFold object created above
yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
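A hedged usage sketch for cv_SVR; the surrounding project passes xM/yV as numpy matrices and uses jutil for plotting, but with graph=False a plain array target is enough to exercise the function:
import numpy as np

rng = np.random.RandomState(0)
xM_toy = rng.rand(100, 5)
yV_toy = xM_toy @ np.array([1.0, 2.0, 0.5, -1.0, 0.3])   # toy linear target
yV_pred = cv_SVR(xM_toy, yV_toy, svr_params={'C': 10.0, 'epsilon': 0.1}, graph=False)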
def svm_SVR_C( xM, yV, c_l, graph = True):
"""
SVR is performed iteratively with different C values
until all C in the list are used.
"""
r2_l, sd_l = [], []
for C in c_l:
print('sklearn.svm.SVR(C={})'.format( C))
clf = svm.SVR( C = C)
clf.fit( xM, yV.A1)
yV_pred = clf.predict(xM)
r2, sd = regress_show( yV, np.mat( yV_pred).T, graph = graph)
for X, x in [[r2_l, r2], [sd_l, sd]]:
X.append( x)
print('average r2, sd are', np.mean( r2_l), np.mean( sd_l))
if graph:
pdw = pd.DataFrame( { 'log10(C)': np.log10(c_l), 'r2': r2_l, 'sd': sd_l})
pdw.plot( x = 'log10(C)')
return r2_l, sd_l
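A hedged sketch of sweeping C with svm_SVR_C; the function expects yV as an np.matrix column vector (it calls yV.A1) and relies on the module's own regress_show helper, which is assumed to be importable:
import numpy as np

rng = np.random.RandomState(1)
xM_toy = rng.rand(60, 4)
yV_toy = np.mat(xM_toy @ np.array([1.0, 2.0, 0.5, -1.0])).T   # column-vector target
r2_list, sd_list = svm_SVR_C(xM_toy, yV_toy, c_l=[0.1, 1.0, 10.0, 100.0], graph=False)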
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
"""
    Cross validation with SVR (svr_params are passed to sklearn.svm.SVR);
    prediction output is generated for all input molecules.
"""
print(xM.shape, yV.shape)
clf = svm.SVR( **svr_params)
kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split( xM)  # use the KFold object created above
yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
kutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
def svm_SVR_C( xM, yV, c_l, graph = True):
"""
SVR is performed iteratively with different C values
until all C in the list are used.
"""
r2_l, sd_l = [], []
for C in c_l:
print('sklearn.svm.SVR(C={})'.format( C))
clf = svm.SVR( C = C)
clf.fit( xM, yV.A1)
yV_pred = clf.predict(xM)
r2, sd = regress_show( yV, np.mat( yV_pred).T, graph = graph)
for X, x in [[r2_l, r2], [sd_l, sd]]:
X.append( x)
print('average r2, sd are', np.mean( r2_l), np.mean( sd_l))
if graph:
pdw = pd.DataFrame( { 'log10(C)': np.log10(c_l), 'r2': r2_l, 'sd': sd_l})
pdw.plot( x = 'log10(C)')
return r2_l, sd_l
def cv_SVR(xM, yV, svr_params, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
"""
    Cross validation with SVR (svr_params are passed to sklearn.svm.SVR);
    prediction output is generated for all input molecules.
"""
print(xM.shape, yV.shape)
clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=shuffle)  # honor the shuffle argument
kf_n = kf_n_c.split(xM)
yV_pred = model_selection.cross_val_predict(
clf, xM, yV.A1, cv=kf_n, n_jobs=n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show(yV, yV_pred, grid_std=grid_std)
return yV_pred
def test_SVR_rbf(*data):
'''
test SVR with RBF kernel and different gamma
:param data: train_data,test_data, train_target, test_target
:return: None
'''
X_train,X_test,y_train,y_test=data
gammas=range(1,20)
train_scores=[]
test_scores=[]
for gamma in gammas:
regr=svm.SVR(kernel='rbf',gamma=gamma)
regr.fit(X_train,y_train)
train_scores.append(regr.score(X_train,y_train))
test_scores.append(regr.score(X_test, y_test))
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(gammas,train_scores,label="Training score ",marker='+' )
ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
ax.set_title( "SVR_rbf")
ax.set_xlabel(r"$\gamma$")
ax.set_ylabel("score")
ax.set_ylim(-1,1)
ax.legend(loc="best",framealpha=0.5)
plt.show()
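A hedged sketch of running the gamma sweep above on synthetic data; train_test_split returns the four arrays in the order the function expects (assumes the svm and plt imports used by the function are in scope):
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=4, noise=0.2, random_state=0)
test_SVR_rbf(*train_test_split(X, y, test_size=0.25, random_state=0))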
def regressorOp(x, y):
"""
This will optimize the parameters for the algo
"""
regr_rbf = svm.SVR(kernel="rbf")
C = [1000, 10, 1]
gamma = [0.005, 0.004, 0.003, 0.002, 0.001]
epsilon = [0.1, 0.01]
parameters = {"C":C, "gamma":gamma, "epsilon":epsilon}
    gs = grid_search.GridSearchCV(regr_rbf, parameters, scoring="r2")  # pre-0.18 API; newer sklearn uses model_selection.GridSearchCV
    gs.fit(x, y)
    print("Best Estimator:\n", gs.best_estimator_)
    print("Type: ", type(gs.best_estimator_))
return gs.best_estimator_
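A hedged usage sketch for regressorOp on toy data; note that grid_search here refers to the pre-0.18 sklearn.grid_search module, so on recent scikit-learn the equivalent import is sklearn.model_selection.GridSearchCV:
import numpy as np

rng = np.random.RandomState(0)
x_toy = rng.rand(80, 3)
y_toy = x_toy @ np.array([0.5, -1.0, 2.0]) + 0.01 * rng.randn(80)
best_svr = regressorOp(x_toy, y_toy)
print(best_svr.get_params())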
def __init__(self, model_type=DEFAULT_MODEL_TYPE):
"""
        Sets up the model and pipeline for learning and predicting.
:param model_type: only 'SVR' model is supported for now
"""
assert (model_type == 'SVR'), "Model '{}' is not supported. " \
"We support only SVR for now.".format(model_type)
self._model_type = model_type
self._model_params = BTCForecast.DEFAULT_SVR_MODEL_PARAMS
# set up SVR pipeline
self._scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
self._model = SVR(kernel=self._model_params['kernel'],
epsilon=self._model_params['epsilon'],
C=self._model_params['c'],
gamma=self._model_params['gamma'])
self._pipeline = make_pipeline(self._scaler, self._model)
self.has_learned = False
def test_ovr_single_label_predict_proba():
base_clf = MultinomialNB(alpha=1)
X, Y = iris.data, iris.target
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
# decision function only estimator. Fails in current implementation.
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
assert_raises(AttributeError, decision_only.predict_proba, X_test)
Y_pred = clf.predict(X_test)
Y_proba = clf.predict_proba(X_test)
assert_almost_equal(Y_proba.sum(axis=1), 1.0)
# predict assigns a label if the probability that the
# sample has the label is greater than 0.5.
pred = np.array([l.argmax() for l in Y_proba])
assert_false((pred - Y_pred).any())
def test_rfe_min_step():
n_features = 10
X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0)
n_samples, n_features = X.shape
estimator = SVR(kernel="linear")
# Test when floor(step * n_features) <= 0
selector = RFE(estimator, step=0.01)
sel = selector.fit(X, y)
assert_equal(sel.support_.sum(), n_features // 2)
# Test when step is between (0,1) and floor(step * n_features) > 0
selector = RFE(estimator, step=0.20)
sel = selector.fit(X, y)
assert_equal(sel.support_.sum(), n_features // 2)
# Test when step is an integer
selector = RFE(estimator, step=5)
sel = selector.fit(X, y)
assert_equal(sel.support_.sum(), n_features // 2)
def test_svr():
# Test Support Vector Regression
diabetes = datasets.load_diabetes()
for clf in (svm.NuSVR(kernel='linear', nu=.4, C=1.0),
svm.NuSVR(kernel='linear', nu=.4, C=10.),
svm.SVR(kernel='linear', C=10.),
svm.LinearSVR(C=10.),
svm.LinearSVR(C=10.),
):
clf.fit(diabetes.data, diabetes.target)
assert_greater(clf.score(diabetes.data, diabetes.target), 0.02)
# non-regression test; previously, BaseLibSVM would check that
# len(np.unique(y)) < 2, which must only be done for SVC
svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data)))
svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data)))
def test_svr_predict():
# Test SVR's decision_function
# Sanity check, test that predict implemented in python
# returns the same as the one in libsvm
X = iris.data
y = iris.target
# linear kernel
reg = svm.SVR(kernel='linear', C=0.1).fit(X, y)
dec = np.dot(X, reg.coef_.T) + reg.intercept_
assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel())
# rbf kernel
reg = svm.SVR(kernel='rbf', gamma=1).fit(X, y)
rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma)
dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_
assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel())
def train(self):
""""""
start = time.time()
print('size before truncated outliers is %d ' % len(self.TrainData))
self.TrainData = self.TrainData[
(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
print('size after truncated outliers is %d ' % len(self.TrainData))
X = self.TrainData.drop(self._l_drop_cols, axis=1)
Y = self.TrainData['logerror']
self._l_train_columns = X.columns
X = X.values.astype(np.float32, copy=False)
svr = SVR(C = self._C, epsilon= self._epsilon, tol= 1e-3, kernel= 'linear',max_iter= 100, verbose= True)
self._model = svr.fit(X, Y)
end = time.time()
print('time consumed %d ' % ((end - start)))
self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
datetime.now().strftime('%Y%m%d-%H:%M:%S'))
# with open(self._f_eval_train_model, 'wb') as o_file:
# pickle.dump(self._model, o_file, -1)
# o_file.close()
self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
                                   ignore_index=True)  ## ignore_index resets the index; otherwise the indices would overlap
return
def define_model(self):
#if self.modeltype == "AR" :
# return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
if self.modeltype == "RandomForest" :
return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
#return ensemble.RandomForestClassifier(
# n_estimators=self.parameters['n_estimators'])
elif self.modeltype == "LinearRegression" :
return linear_model.LinearRegression()
elif self.modeltype == "Lasso" :
return linear_model.Lasso(
alpha=self.parameters['alpha'])
elif self.modeltype == "ElasticNet" :
return linear_model.ElasticNet(
alpha=self.parameters['alpha'],
l1_ratio=self.parameters['l1_ratio'])
elif self.modeltype == "SVR" :
return SVR(
C=self.parameters['C'],
epsilon=self.parameters['epsilon'],
kernel=self.parameters['kernel'])
#elif self.modeltype == 'StaticModel':
# return StaticModel (
# parameters=self.parameters
# )
#elif self.modeltype == 'AdvancedStaticModel':
# return AdvancedStaticModel (
# parameters=self.parameters
# )
# elif self.modeltype == 'SGDRegressor' :
# print(self.parameters)
# return linear_model.SGDRegressor(
# loss=self.parameters['loss'],
# penalty=self.parameters['penalty'],
# l1_ratio=self.parameters['l1_ratio'])
else:
raise ConfigError("Unsupported model {0}".format(self.modeltype))
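For the 'SVR' branch, the parameters dictionary presumably carries the C, epsilon, and kernel keys; a hypothetical configuration sketch (names illustrative, not the project's actual config):
# Hypothetical parameter set for the SVR branch of define_model().
svr_parameters = {
    'C': 1.0,
    'epsilon': 0.1,
    'kernel': 'rbf',
}
# e.g. self.modeltype = 'SVR'; self.parameters = svr_parameters
# model = self.define_model()   # returns SVR(C=1.0, epsilon=0.1, kernel='rbf')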
def lasso_regression_model(parameter_array):
alpha_value = parameter_array[0] #alpha value index is first index
return linear_model.Lasso(alpha=alpha_value, fit_intercept=True, normalize=True, precompute=False, copy_X=True,
max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')
#Returns the SVR Linear Kernel model
def svr_linear_regression(parameter_array):
c_value = parameter_array[0]
# epsilon_value = parameter_array[1]
return svm.SVR(kernel='linear', degree=3, gamma='auto', coef0=0.0, tol=0.001, C=c_value, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)
#Returns the mlp regression model
def SVM(train, test, tunings=None, smoteit=True, bin=True, regress=False):
"SVM "
if not isinstance(train, pd.core.frame.DataFrame):
train = csv2DF(train, as_mtx=False, toBin=bin)
if not isinstance(test, pd.core.frame.DataFrame):
test = csv2DF(test, as_mtx=False, toBin=True)
if smoteit:
train = SMOTE(train, resample=True)
# except: set_trace()
    if not tunings:  # NOTE: tunings is currently unused; both branches build default models
if regress:
clf = SVR()
else:
clf = SVC()
else:
if regress:
clf = SVR()
else:
clf = SVC()
features = train.columns[:-1]
klass = train[train.columns[-1]]
# set_trace()
clf.fit(train[features], klass)
    actual = test[test.columns[-1]].values  # .as_matrix() is deprecated in newer pandas
try: preds = clf.predict(test[test.columns[:-1]])
except: set_trace()
return actual, preds
def setClf(self):
clf = SVR(C=100, epsilon=0.1, gamma = 0.0001,cache_size = 10240)
min_max_scaler = preprocessing.MinMaxScaler()
self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
return
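For reference, a standalone sketch of the same scaler-plus-SVR pipeline outside the class (synthetic data chosen for illustration):
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('estimator', SVR(C=100, epsilon=0.1, gamma=0.0001, cache_size=10240))])
pipe.fit(X[:150], y[:150])
print(pipe.score(X[150:], y[150:]))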