def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
boston.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
prep = {'Standard Scaling': [StandardScaler()],
'Min Max Scaling': [MinMaxScaler()],
'No Preprocessing': []}
est = {'Standard Scaling':
[ElasticNet(), Lasso(), KNeighborsRegressor()],
'Min Max Scaling':
[SVR()],
'No Preprocessing':
[RandomForestRegressor(random_state=SEED),
GradientBoostingRegressor()]}
ens.add(est, prep)
ens.add(GradientBoostingRegressor(), meta=True)
return ens
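# --- Usage sketch (not from the original source) ---
# A minimal, hedged example of driving build_ensemble() above; assumes the
# mlens SuperLearner API and that SEED is defined elsewhere in this module.
# The scorer/random_state kwargs are forwarded to SuperLearner by **kwargs.
def demo_build_ensemble():
    from sklearn.datasets import make_regression
    from sklearn.metrics import mean_squared_error
    X, y = make_regression(n_samples=200, n_features=10, random_state=SEED)
    ens = build_ensemble(scorer=mean_squared_error, random_state=SEED)
    ens.fit(X[:150], y[:150])
    print(mean_squared_error(y[150:], ens.predict(X[150:])))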
def fit(self, X, y):
    '''
    Fit one model per feature column: a per-category target mean for
    categorical columns and a single-feature KNN regressor for the rest.
    :param X: features, coercible to a DataFrame; may mix categorical and
              numerical columns
    :param y: target, coercible to a Series, aligned with X row by row
    :return: self
    '''
    X = pd.DataFrame(X.copy())
    X = X.reset_index(drop=True)
    y = pd.Series(y.copy())
    y = y.reset_index(drop=True)
    self.means = y.mean()
    self.models = {}
    for col in X.columns.tolist():
        if col in self.feature_cate:
            # Categorical column: store the per-category target mean.
            self.models[col] = y.groupby(X[col]).mean().to_dict()
        else:
            # Numerical column: fit a one-feature KNN regressor.
            knn = KNeighborsRegressor(n_neighbors=self.n_neighbors)
            knn.fit(X[[col]], y)
            self.models[col] = copy.deepcopy(knn)
    return self
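# --- Hypothetical companion transform (not from the original source) ---
# Sketches how the per-column models fitted above might be applied at
# prediction time; the enclosing class is not shown, so the attribute names
# (feature_cate, models, means) are taken from fit() itself.
def transform(self, X):
    X = pd.DataFrame(X.copy()).reset_index(drop=True)
    out = pd.DataFrame(index=X.index)
    for col in X.columns.tolist():
        if col in self.feature_cate:
            # Unseen categories fall back to the global target mean.
            out[col] = X[col].map(self.models[col]).fillna(self.means)
        else:
            out[col] = self.models[col].predict(X[[col]])
    return out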
def model_cross_valid(X,Y):
seed = 7
    # random_state only takes effect when shuffle=True
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    def build_model(model_name):
model = model_name()
return model
scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
for model_name in [LinearRegression,ElasticNet]:
#for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(model_name,results.mean())
def parameterChoosing(self):
# Set the parameters by cross-validation
tuned_parameters = [{'weights': ['uniform', 'distance'],
'n_neighbors': range(2,100)
}
]
    reg = GridSearchCV(neighbors.KNeighborsRegressor(), tuned_parameters, cv=5,
                       scoring='neg_mean_squared_error')  # 'mean_squared_error' was renamed in sklearn 0.18
    reg.fit(self.X_train, self.y_train)
    print("Best parameters set found on development set:\n")
    print(reg.best_params_)
    print("Grid scores on development set:\n")
    results = reg.cv_results_  # grid_scores_ was removed in sklearn 0.20
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("%0.3f (+/-%0.03f) for %r\n" % (mean, std * 2, params))
    print(reg.scorer_)
    print("MSE for test data set:")
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print(mean_squared_error(y_true, y_pred))
def test_kneighbors_regressor(n_samples=40,
n_features=5,
n_test_pts=10,
n_neighbors=3,
random_state=0):
# Test k-neighbors regression
rng = np.random.RandomState(random_state)
X = 2 * rng.rand(n_samples, n_features) - 1
y = np.sqrt((X ** 2).sum(1))
y /= y.max()
y_target = y[:n_test_pts]
weight_func = _weight_func
for algorithm in ALGORITHMS:
for weights in ['uniform', 'distance', weight_func]:
knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
weights=weights,
algorithm=algorithm)
knn.fit(X, y)
epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)
y_pred = knn.predict(X[:n_test_pts] + epsilon)
assert_true(np.all(abs(y_pred - y_target) < 0.3))
def test_KNeighborsRegressor_multioutput_uniform_weight():
# Test k-neighbors in multi-output regression with uniform weight
rng = check_random_state(0)
n_features = 5
n_samples = 40
n_output = 4
X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples, n_output)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):
knn = neighbors.KNeighborsRegressor(weights=weights,
algorithm=algorithm)
knn.fit(X_train, y_train)
neigh_idx = knn.kneighbors(X_test, return_distance=False)
y_pred_idx = np.array([np.mean(y_train[idx], axis=0)
for idx in neigh_idx])
y_pred = knn.predict(X_test)
assert_equal(y_pred.shape, y_test.shape)
assert_equal(y_pred_idx.shape, y_test.shape)
assert_array_almost_equal(y_pred, y_pred_idx)
def test_kneighbors_regressor_sparse(n_samples=40,
n_features=5,
n_test_pts=10,
n_neighbors=5,
random_state=0):
    # Test k-neighbors regression on sparse matrices
# Like the above, but with various types of sparse matrices
rng = np.random.RandomState(random_state)
X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(int)  # np.int was removed in NumPy 1.24
for sparsemat in SPARSE_TYPES:
knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
algorithm='auto')
knn.fit(sparsemat(X), y)
for sparsev in SPARSE_OR_DENSE:
X2 = sparsev(X)
assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)
def test_neighbors_iris():
# Sanity checks on the iris dataset
# Puts three points of each label in the plane and performs a
# nearest neighbor query on points near the decision boundary.
for algorithm in ALGORITHMS:
clf = neighbors.KNeighborsClassifier(n_neighbors=1,
algorithm=algorithm)
clf.fit(iris.data, iris.target)
assert_array_equal(clf.predict(iris.data), iris.target)
clf.set_params(n_neighbors=9, algorithm=algorithm)
clf.fit(iris.data, iris.target)
assert_true(np.mean(clf.predict(iris.data) == iris.target) > 0.95)
rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm)
rgs.fit(iris.data, iris.target)
assert_greater(np.mean(rgs.predict(iris.data).round() == iris.target),
0.95)
def learn(self, experiences, max_iter=20):
# experience is in (s, a, r, ns)
states = experiences[:, 0:self.domain.state_space_dims]
actions = experiences[:, self.domain.state_space_dims]
rewards = experiences[:, self.domain.state_space_dims+1]
next_states = experiences[:, self.domain.state_space_dims+2:]
X = self.representation.phi_sa("root", states, actions)
for i in range(0, max_iter):
#old_qs = np.reshape(self.representation.Q("root", states, actions), (-1, 1))
nqs = self.representation.Qs("root", next_states)
best_nqs = np.reshape(np.amax(nqs, axis=1), (-1, 1))
            y = np.reshape(rewards, (-1, 1)) + self.domain.discount_factor * best_nqs  # reshape so shapes align with best_nqs
#resd = np.mean(np.abs(y - old_qs))
model = KNeighborsRegressor(n_neighbors=2, n_jobs=-1)
model.fit(X, y)
self.representation.models["root"] = model
#print "Residual is " + str(resd)
def model_fit_and_test(TrainX,TrainY,TestX,TestY):
    def build_model(model_name):
        model = model_name()
        return model
#for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR,RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
model.fit(TrainX,TrainY)
print(model_name)
        resid = model.predict(TestX) - TestY
        #print(resid)
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print(model.predict(TestX))
        #print(TestY)
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
plt.axhline(0, color='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
#plt.xlim([1, 50])
plt.show()
print('Variance score: %.2f' % model.score(TestX, TestY))
from statsmodels.stats.stattools import jarque_bera
_, pvalue, _, _ = jarque_bera(resid)
print ("Test Residuals Normal", pvalue)
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
        xs_with_constant = sms.add_constant(TestX)
        # het_breushpagan was renamed het_breuschpagan in statsmodels 0.9
        _, pvalue1, _, _ = smd.het_breuschpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)
        # acorr_ljungbox returns a DataFrame in recent statsmodels
        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        print("Test Autocorrelation P-values:", ljung_box['lb_pvalue'].values)
        if any(ljung_box['lb_pvalue'] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, isTrain):
super(RegressionKNN, self).__init__(isTrain)
# data preprocessing
#self.dataPreprocessing()
# Create KNN regression object
# first parameter is the K neighbors
# 'uniform' assigns uniform weights to each neighbor
# 'distance' assigns weights proportional to the inverse of the distance from the query point
# default metric is euclidean distance
self.regr = neighbors.KNeighborsRegressor(86, weights='distance')
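# --- Illustration (not from the original source) ---
# A tiny sketch contrasting the two weighting schemes described in the
# comments above: with weights='distance', the prediction is pulled toward
# the nearer neighbour (0.25 here) instead of the plain average (0.5).
def _demo_knn_weights():
    import numpy as np
    from sklearn import neighbors
    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 1.0, 2.0, 3.0])
    for w in ('uniform', 'distance'):
        m = neighbors.KNeighborsRegressor(n_neighbors=2, weights=w).fit(X, y)
        print(w, m.predict([[0.25]]))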
def __init__(self, conf):
"""smpKNN.__init__
init
"""
smpModel.__init__(self, conf)
self.fwd = KNeighborsRegressor(n_neighbors = self.n_neighbors)
self.X_ = []
self.y_ = []
self.bootstrap()
def calculate(X, y):
best_p, best_score = 0, -float('inf')
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # old KFold(len(y), n_folds=...) API was removed
for p in numpy.linspace(1, 10, num=200):
knr = KNeighborsRegressor(n_neighbors=5, weights='distance', p=p)
        # neg_mean_squared_error: higher (closer to zero) is better
        score = max(cross_val_score(knr, X, y, cv=kf, scoring='neg_mean_squared_error'))
if score > best_score:
best_score = score
best_p = p
return best_p, best_score
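# --- Usage sketch (not from the original source) ---
# Calling calculate() on standardized synthetic data; the scaling step is an
# assumption added here, because KNN distances are scale-sensitive.
def _demo_calculate():
    from sklearn.datasets import make_regression
    from sklearn.preprocessing import scale
    X, y = make_regression(n_samples=100, n_features=5, random_state=42)
    best_p, best_score = calculate(scale(X), y)
    print(best_p, best_score)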
def knnPredictor(df):
dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
corelationCoefficiantDictionary = {}
corelationCoefficiantArray = []
for k in range(1, 200, 1):
knnModel = KNeighborsRegressor(n_neighbors=k)
knnModel.fit(dataTrainX, dataTrainY)
knnpredicted = knnModel.predict(dataTestX)
corelationCoefficient = pearsonr(dataTestY, knnpredicted)
corelationCoefficiantDictionary[k] = corelationCoefficient[0]
corelationCoefficiantArray.append(corelationCoefficient[0])
# plotter.plot(corelationCoefficiantArray)
bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
knnModelBest = KNeighborsRegressor(n_neighbors=bestK)
knnModelBest.fit(dataTrainX, dataTrainY)
print("K = ")
print(bestK)
print("Corelation Coeff:")
print(corelationCoefficiantDictionary[bestK])
knnpredictedBest = knnModelBest.predict(dataTestX)
fig, ax = plotter.subplots()
corelationCoefficient = pearsonr(dataTestY, knnpredictedBest)
print(corelationCoefficient[0])
ax.set_ylabel('Predicted KNN Weekly')
ax.scatter(dataTestY, knnpredictedBest)
ax.set_xlabel('Measured')
plotter.show()
def predictKnn(data, priceToPredict):
corelationCoefficiantDictionary = {}
corelationCoefficiantArray = []
openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
data["openingPriceTrain"], data["openingPriceTest"], data["closingPriceTrain"], data["closingPriceTest"]
    for k in range(1, 100):
neigh = KNeighborsRegressor(n_neighbors=k)
#n = 7 best fits
neigh.fit(openingPriceTrain, closingPriceTrain)
closingPriceTestArray = np.reshape(closingPriceTest,-1)
knnpr = neigh.predict(openingPriceTest)
predictedArray = np.reshape(knnpr,-1)
corelationCoefficient = pearsonr(closingPriceTestArray,predictedArray)
corelationCoefficiantDictionary[k] = corelationCoefficient[0]
corelationCoefficiantArray.append(corelationCoefficient[0])
plotter.plot(corelationCoefficiantArray)
# plotter.show()
bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
neighBest = KNeighborsRegressor(n_neighbors=bestK)
neighBest.fit(openingPriceTrain, closingPriceTrain)
    openingPriceToPredict = np.array([[priceToPredict]])  # sklearn expects a 2-D array for a single sample
print("K = ")
print(bestK)
print(neighBest.predict(openingPriceToPredict))
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
est = [ElasticNet(copy_X=False),
Lasso(copy_X=False)]
ens.add(est)
ens.add(KNeighborsRegressor())
return ens
def knn():
"""Fit KNN."""
print("Fitting KNN...", end=" ", flush=True)
time.sleep(SLEEP)
t0 = time.time()
knn = KNeighborsRegressor()
knn.fit(X, y)
print_time(t0, "Done", end="")
def knn_regression(K, training_data, labels, test_data, weights='distance'):
knn = neighbors.KNeighborsRegressor(K, weights=weights)
output = knn.fit(training_data, labels).predict(test_data)
return output
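# --- Usage sketch (not from the original source) ---
# Minimal call of knn_regression() above on synthetic data; K=5 and the
# default 'distance' weighting are arbitrary choices for illustration.
def _demo_knn_regression():
    import numpy as np
    rng = np.random.RandomState(0)
    X_train = rng.rand(100, 3)
    y_train = X_train.sum(axis=1)
    X_test = rng.rand(10, 3)
    print(knn_regression(5, X_train, y_train, X_test).shape)  # (10,)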
def generate_model(self, regressor, qty_neighbors, algorithm, distance_type):
""" Regressor Model Generation"""
if regressor == "knn":
return KNeighborsRegressor(n_neighbors=qty_neighbors, algorithm=algorithm, p=distance_type)
elif regressor == "linear":
return LinearRegression(fit_intercept=True) # copy_X=True, n_jobs=1, normalize=False
elif regressor == "logistic":
return LogisticRegression(class_weight='balanced')
def spot_check(X, y, task='regression'):
    # the original compared the builtin `type` to a string (always False);
    # the task is made an explicit parameter instead
    if task == 'regression':
models = [
(LinearRegression(), 'Ordinary Least Squares'),
(Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
(Ridge(), 'Ridge (alpha 1.0)'),
(Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
(Lasso(), 'Lasso (alpha 1.0)'),
(ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
(ElasticNet(), 'ElasticNet (alpha 1.0)'),
(DecisionTreeRegressor(), 'Decision Tree'),
(KNeighborsRegressor(), 'K-Nearest Neighbors'),
# (RandomForestRegressor(), 'Random Forest Regressor'),
# (BaggingRegressor(), 'Bagging Regressor'),
            # (GradientBoostingRegressor(), 'Gradient Boosted Regression'),
# (SVR(), 'Support Vector Regression')
]
splits = 5
scores = []
for model, model_name in models:
score = check_model(model, splits, X, y)
# get average score
scores.append(score)
model_names = map(lambda x: x[1], models)
for name, score in zip(model_names, scores):
print('%s: %f' % (name, score))
def get_classifier(self, X, Y):
    """ Train the model
    :param X: training data
    :param Y: training targets
    :return: fitted model
    """
clf = KNeighborsRegressor(weights='uniform')
clf.fit(X, Y)
return clf
def __init__(self, S, A, n_neighbors=5, weights='uniform', algorithm='auto', metric='minkowski', memory_fit=100, memory_size=100, **kwargs):
#assert self.lr_mode == 'constant', 'KNNQ is only compatible with constant learning rates.'
self.S = S
self.A = A
self.states = deque([])
self.targets = deque([])
self.memory_fit = memory_fit
self.memory_size = memory_size
self.count = 0
self.neigh = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, metric=metric)
super(KNNQ, self).__init__(**kwargs)
self.update_mode = 'set'
def calculatepRCA(data, y='year', c='ccode', p='pcode', x='x'):
'''
Returns the pRCA from data. pRCA is the probability that (RCA_{y+1} > 1) given the volume of exports (x_{cpy}),
and the 'baseline term' (\sum_c x_{cpy} \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}).
It is computed using k-nearest neighbors, in the space of log exports and log baseline term.
Parameters
----------
data : pandas.DataFrame
Raw data. It has source,target,volume (trade, number of people etc.).
y,c,p,x : str (optional)
Labels of the columns in data used for source,target,volume
Returns
-------
    df : pandas.DataFrame
        Output of calculateRCA_by_year plus a pRCA column holding the
        k-NN estimate of P(RCA_{y+1} > 1).
'''
    df = calculateRCA_by_year(data, y=y, c=c, p=p, x=x, log_terms=True)
    #Compute (RCA > 1) next year and merge it
    df_ = df.copy()
    df_[y] = df_[y] - 1
    df_['RCA_y+1'] = (df_['log(RCA)'] > 0).astype(int)
    df_ = df_[[y, c, p, 'RCA_y+1']]
    df = df.merge(df_)
#Prepare dataset for knn and fit
    M = df[['log(x)','T','RCA_y+1']].values  # .as_matrix() was removed in pandas 1.0
X, y = M[:,:2], M[:, 2]
knn = neighbors.KNeighborsRegressor(n_neighbors = 200, weights = 'uniform').fit(X, y)
    #To avoid memory errors, predict on chunks of X; the predictions are the pRCA
    pRCA = np.array([])
    for chunk in np.array_split(X, 10):
        pRCA = np.append(pRCA, knn.predict(chunk))
df['pRCA'] = pRCA
return df
def __init__(self, idim = 1, odim = 1):
self.fwd = KNeighborsRegressor(n_neighbors=5)
ActInfModel.__init__(self, idim, odim)
self.X_ = []
self.y_ = []
self.bootstrap()
def build_model(self):
return KNeighborsRegressor(**self.params)
def test_KNeighborsRegressor(*data):
'''
test the KNN regressor
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
regr=neighbors.KNeighborsRegressor()
regr.fit(X_train,y_train)
print("Training Score:{0}".format(regr.score(X_train,y_train)))
print("Testing Score:{0}".format(regr.score(X_test,y_test)))
def test_KNeighborsRegressor_k_w(*data):
'''
test the performance with different n_neighbors and weights
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int')
weights=['uniform','distance']
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
### graph
for weight in weights:
training_scores=[]
testing_scores=[]
for K in Ks:
regr=neighbors.KNeighborsRegressor(weights=weight,n_neighbors=K)
regr.fit(X_train,y_train)
testing_scores.append(regr.score(X_test,y_test))
training_scores.append(regr.score(X_train,y_train))
ax.plot(Ks,testing_scores,label="testing score:weight={0}".format(weight))
ax.plot(Ks,training_scores,label="training score:weight={0}".format(weight))
ax.legend(loc='best')
ax.set_xlabel("K")
ax.set_ylabel("score")
ax.set_ylim(0,1.05)
ax.set_title("KNeighborsRegressor")
plt.show()
def test_KNeighborsRegressor_k_p(*data):
'''
test the performance with different n_neighbors and p
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int')
Ps=[1,2,10]
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
### graph
for P in Ps:
training_scores=[]
testing_scores=[]
for K in Ks:
regr=neighbors.KNeighborsRegressor(p=P,n_neighbors=K)
regr.fit(X_train,y_train)
testing_scores.append(regr.score(X_test,y_test))
training_scores.append(regr.score(X_train,y_train))
ax.plot(Ks,testing_scores,label="testing score:p={0}".format(P))
ax.plot(Ks,training_scores,label="training score:p={0}".format(P))
ax.legend(loc='best')
ax.set_xlabel("K")
ax.set_ylabel("score")
ax.set_ylim(0,1.05)
ax.set_title("KNeighborsRegressor")
plt.show()
def knn(train_sample, validation_sample, features, seed):
log_base = np.e
knn_est = KNeighborsRegressor(n_neighbors=1, weights='distance', algorithm='auto', leaf_size=30,
p=1).fit(
train_sample[features], np.log1p(train_sample['volume']) / np.log(log_base))
knn_prob = np.power(log_base, knn_est.predict(validation_sample[features])) - 1
print_mape(validation_sample['volume'], knn_prob, 'KNN')
return knn_prob
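# --- Usage sketch (not from the original source) ---
# Driving knn() above with synthetic frames; assumes print_mape() is the
# MAPE helper defined elsewhere in this module, and that 'volume' is the
# positive target the log1p transform in knn() expects.
def _demo_knn_volume():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(7)
    df = pd.DataFrame(rng.rand(120, 3), columns=['f1', 'f2', 'f3'])
    df['volume'] = 100.0 * df['f1'] + rng.rand(120)
    return knn(df.iloc[:100], df.iloc[100:], ['f1', 'f2', 'f3'], seed=7)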