def main():
    """Train one global KNN regressor on ``train.csv`` and predict ``test2.csv``.

    Training and test rows are stacked so that the date / time-series merges
    and ``feature_transform_knn`` are applied to both in one pass, then split
    back apart using the number of training rows. The predictions are written
    to ``result/result_knn_<mean prediction>.csv`` and the mean prediction is
    printed.
    """
    # Only train.csv feeds the model. Earlier revisions also parsed
    # train1.csv..train3.csv but never used them; add further frames to this
    # list to train on more data.
    df_train = pd.concat([pd.read_csv("train.csv")])
    len_train = len(df_train)
    df_test = pd.read_csv("test2.csv")

    # Stack test rows under the training rows, keeping only the training
    # columns (in training order). DataFrame.append was removed in pandas 2.0.
    train_columns = df_train.columns.tolist()
    df_all = pd.concat([df_train, df_test])[train_columns]

    df_date = pd.read_csv("date.csv")
    df_ts = pd.read_csv("ts_feature2_simple.csv")
    print(df_test.head())

    df_all = df_all.merge(df_date, on="date", how="left")
    df_all = df_all.merge(
        df_ts, on=["tollgate_id", "hour", "miniute", "direction"], how="left")

    # Fresh 0..n-1 index so the train/test split below can rely on row labels
    data = df_all.reset_index(drop=True)
    print(data.head(1))
    data = feature_transform_knn(key=1, data=data)

    # Equivalent of the removed ``.ix`` indexer: rows by (inclusive) integer
    # label, columns by position. Feature columns start at position 8.
    train_rows = data.loc[:len_train - 1]
    test_rows = data.loc[len_train:]
    y = train_rows["volume"]
    x = train_rows.iloc[:, 8:]
    x1 = test_rows.iloc[:, 8:]

    regressor_cubic = KNeighborsRegressor(n_neighbors=15,)
    regressor_cubic.fit(x, y)
    yhat = regressor_cubic.predict(x1)

    df_test["volume"] = yhat
    df_test = df_test[['tollgate_id', 'time_window', 'direction', 'volume']]
    df_test.to_csv("result/result_knn_"+str(np.mean(yhat))+".csv", index=False)
    print(np.mean(yhat))
python类KNeighborsRegressor()的实例源码
def test_precomputed_cross_validation():
    """Cross-validation must split a precomputed distance matrix correctly.

    For each neighbors estimator, the scores obtained from the raw features
    must equal the scores obtained from the matching precomputed Euclidean
    distance matrix.
    """
    random_gen = np.random.RandomState(0)
    features = random_gen.rand(20, 2)
    distances = pairwise_distances(features, metric='euclidean')
    labels = random_gen.randint(3, size=20)

    estimator_classes = (
        neighbors.KNeighborsClassifier,
        neighbors.RadiusNeighborsClassifier,
        neighbors.KNeighborsRegressor,
        neighbors.RadiusNeighborsRegressor,
    )
    for estimator_cls in estimator_classes:
        score_on_features = cross_val_score(estimator_cls(), features, labels)
        score_on_distances = cross_val_score(
            estimator_cls(metric='precomputed'), distances, labels)
        assert_array_equal(score_on_features, score_on_distances)
def test_neighbors_regressors_zero_distance():
    """Check regressor predictions when a query coincides with a training sample.

    The second query point equals a training point exactly, so its distance to
    that sample is zero; both the radius-based and the k-nearest regressors are
    checked against hand-computed expectations for every algorithm.
    """
    train_X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]])
    train_y = np.array([1.0, 1.5, 2.0, 0.0])
    queries = np.array([[1.1, 1.1], [2.0, 2.0]])
    search_radius = 0.2

    expected_radius = np.array([1.25, 2.0])
    expected_knn = (
        ('uniform', np.array([1.25, 1.0])),
        ('distance', np.array([1.25, 2.0])),
    )

    for algorithm in ALGORITHMS:
        # Callable weights are not exercised here: a user-supplied weight
        # function is expected to deal with zero distances itself.
        for weighting in ('uniform', 'distance'):
            radius_model = neighbors.RadiusNeighborsRegressor(
                radius=search_radius, weights=weighting, algorithm=algorithm)
            radius_model.fit(train_X, train_y)
            assert_array_almost_equal(expected_radius,
                                      radius_model.predict(queries))

        for weighting, expected in expected_knn:
            knn_model = neighbors.KNeighborsRegressor(
                n_neighbors=2, weights=weighting, algorithm=algorithm)
            knn_model.fit(train_X, train_y)
            assert_array_almost_equal(expected, knn_model.predict(queries))
def test_predict_sparse_ball_kd_tree():
    """Tree-based models fitted on dense data must reject sparse predict input."""
    random_gen = np.random.RandomState(0)
    dense_data = random_gen.rand(5, 5)
    targets = random_gen.randint(0, 2, 5)

    tree_models = (
        neighbors.KNeighborsClassifier(1, algorithm='kd_tree'),
        neighbors.KNeighborsRegressor(1, algorithm='ball_tree'),
    )
    for estimator in tree_models:
        estimator.fit(dense_data, targets)
        # Predicting on a CSR matrix is expected to raise ValueError
        assert_raises(ValueError, estimator.predict, csr_matrix(dense_data))
def get_model_list():
    """Return the regressors to evaluate together with their display names.

    Only the RBF-kernel SVR is enabled; the other candidates are kept below as
    comments so they can be switched back on.

    Returns:
        tuple: ``(model_list, name_list)`` - two parallel lists.
    """
    candidates = [
        # Disabled candidates, kept for reference:
        # ('LR', linear_model.LinearRegression()),
        # ('GaussianProcess', gaussian_process.GaussianProcessRegressor(alpha=1e-10)),
        # ('KNN_unif', KNeighborsRegressor(weights = 'uniform',n_neighbors=28)),
        # ('KNN_dist', KNeighborsRegressor(weights = 'distance',n_neighbors=28)),
        # ('SVR_poly', SVR(kernel = 'poly', C = 1, gamma = 'auto', coef0 = 0, degree = 2)),
        ('SVR_rbf', SVR(kernel='rbf', C=0.3, gamma='auto')),
        # ('DT', DecisionTreeRegressor()),
        # ('RF', RandomForestRegressor(n_estimators=150, max_depth=None,min_samples_split=2, random_state=0)),
        # ('ET', ExtraTreesRegressor(n_estimators=150, max_depth=None, max_features='auto', min_samples_split=2, random_state=0)),
    ]
    model_list = [estimator for _, estimator in candidates]
    name_list = [label for label, _ in candidates]
    return model_list, name_list
#MAPE
def get_model_list():
    """Build the list of regression models under evaluation and their names.

    The RBF-kernel SVR is the only active entry. The remaining models are
    listed as comments, in their original order, for easy re-activation.

    Returns:
        tuple: ``(model_list, name_list)`` where ``name_list[i]`` labels
        ``model_list[i]``.
    """
    # Disabled: linear_model.LinearRegression()                                  -> 'LR'
    # Disabled: gaussian_process.GaussianProcessRegressor(alpha=1e-10)           -> 'GaussianProcess'
    # Disabled: KNeighborsRegressor(weights = 'uniform',n_neighbors=28)          -> 'KNN_unif'
    # Disabled: KNeighborsRegressor(weights = 'distance',n_neighbors=28)         -> 'KNN_dist'
    # Disabled: SVR(kernel = 'poly', C = 1, gamma = 'auto', coef0 = 0, degree = 2) -> 'SVR_poly'
    rbf_svr = SVR(kernel='rbf', C=0.3, gamma='auto')
    # Disabled: DecisionTreeRegressor()                                          -> 'DT'
    # Disabled: RandomForestRegressor(n_estimators=150, max_depth=None,min_samples_split=2, random_state=0) -> 'RF'
    # Disabled: ExtraTreesRegressor(n_estimators=150, max_depth=None, max_features='auto', min_samples_split=2, random_state=0) -> 'ET'
    model_list = [rbf_svr]
    name_list = ['SVR_rbf']
    return model_list, name_list
# Build and fit the KNN regressor (the original non-ASCII comment was lost to an encoding error)
def get_classifier(self, X, Y):
    """Fit a uniformly weighted k-nearest-neighbours regressor.

    :param X: training samples handed to ``fit``
    :param Y: training targets handed to ``fit``
    :return: the fitted ``KNeighborsRegressor``
    """
    knn_regressor = KNeighborsRegressor(weights='uniform')
    knn_regressor.fit(X, Y)
    return knn_regressor
def fit(self, X, y=None, **kwargs):
    """Iteratively impute the NaN entries of ``X`` with fitted models.

    Starting from ``self.initial_imputer``'s output, each iteration either
    re-predicts every feature's missing values from the other features (one
    regressor per feature: "RandomForest" or "KNN"), or reconstructs the
    missing values through a single PCA transform / inverse-transform
    ("PCA"). Iteration stops after ``self.max_iter`` rounds or once two
    consecutive ``gamma_`` values differ by less than ``self.tol``.

    Parameters
    ----------
    X : array-like
        Data to fit on; NaN marks missing values.
    y : ignored
        Not used by this method.
    **kwargs
        Forwarded to the constructor of the underlying estimator(s).

    Returns
    -------
    self
    """
    X = check_array(X, dtype=np.float64, force_all_finite=False)

    X_nan = np.isnan(X)
    # Feature indices ordered from most to fewest missing values
    most_by_nan = X_nan.sum(axis=0).argsort()[::-1]

    imputed = self.initial_imputer.fit_transform(X)
    new_imputed = imputed.copy()

    self.statistics_ = np.ma.getdata(X)
    self.gamma_ = []

    n_features = X.shape[1]
    # NOTE(review): an unrecognised f_model leaves estimators_ unset (or stale
    # from a previous fit); callers are expected to pass one of the three names.
    if self.f_model == "RandomForest":
        self.estimators_ = [
            RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=i, **kwargs)
            for i in range(n_features)
        ]
    elif self.f_model == "KNN":
        # Never ask for more neighbours than there are observed values
        self.estimators_ = [
            KNeighborsRegressor(
                n_neighbors=min(5, int((~X_nan[:, i]).sum())), **kwargs)
            for i in range(n_features)
        ]
    elif self.f_model == "PCA":
        self.estimators_ = [
            PCA(n_components=int(np.sqrt(min(X.shape))), whiten=True, **kwargs)
        ]

    for _ in range(self.max_iter):
        if len(self.estimators_) > 1:
            # One regressor per feature: predict feature i from the others
            for i in most_by_nan:
                X_s = np.delete(new_imputed, i, 1)
                y_nan = X_nan[:, i]

                X_train = X_s[~y_nan]
                y_train = new_imputed[~y_nan, i]
                X_unk = X_s[y_nan]

                estimator_ = self.estimators_[i]
                estimator_.fit(X_train, y_train)
                if len(X_unk) > 0:
                    new_imputed[y_nan, i] = estimator_.predict(X_unk)
        else:
            # Single transformer: overwrite the missing cells with their
            # reconstruction
            estimator_ = self.estimators_[0]
            estimator_.fit(new_imputed)
            reconstructed = estimator_.inverse_transform(
                estimator_.transform(new_imputed))
            new_imputed[X_nan] = reconstructed[X_nan]

        # Variance-scaled squared deviation from the *initial* imputation,
        # averaged over the number of missing cells
        gamma = ((new_imputed-imputed)**2/(1e-6+new_imputed.var(axis=0))).sum()/(1e-6+X_nan.sum())
        self.gamma_.append(gamma)

        # Compare the last two gammas explicitly. The former
        # np.diff(self.gamma_[-2:]) check produced an empty array on the first
        # iteration, whose truth value is deprecated / an error in NumPy.
        if (len(self.gamma_) >= 2
                and abs(self.gamma_[-1] - self.gamma_[-2]) < self.tol):
            break

    return self
def train_validate(self, df, validation_range, update_progress):
    """Walk-forward train and validate KNN regressors over the parameter grid.

    For every combination of ``sample_presentation``, ``exp_weight`` and ``k``
    in ``self.model_params``, and for every horizon in the module-level ``h``,
    a fresh ``KNeighborsRegressor`` is fitted for each index in
    ``validation_range`` on the samples that end ``horizon`` steps before that
    index, then evaluated on the sample at that index.

    :param df: data frame handed to ``prepare_samples``
    :param validation_range: iterable of sample indices to validate on
    :param update_progress: callback receiving the percentage of parameter
        combinations completed
    :return: ``(training_summary, make_presentable(training_summary, ...))``
    """
    training_summary = pd.DataFrame()
    first_sample, samples, labels = prepare_samples(df, self.indicators_samples)

    # Progress is reported once per parameter combination
    total_steps = (len(self.model_params['sample_presentation'])
                   * len(self.model_params['exp_weight'])
                   * len(self.model_params['k']))
    completed_steps = 0

    for sample_presentation in self.model_params['sample_presentation']:
        presented_samples, presented_labels, normalizer = set_presentation(
            samples, labels, sample_presentation, self.indicators_samples['Daily'])

        for exp_weight in self.model_params['exp_weight']:
            weighted_samples = apply_exp_weights(presented_samples, exp_weight)

            for k in self.model_params['k']:
                n_horizons = len(h)
                # Per-horizon accumulators
                model = [0] * n_horizons
                total_train_time = [0] * n_horizons
                total_test_time = [0] * n_horizons
                error_list = [[] for _ in range(n_horizons)]
                relative_error_list = [[] for _ in range(n_horizons)]
                hit_list = [[] for _ in range(n_horizons)]
                params = (sample_presentation, exp_weight, k)

                for h_index, horizon in enumerate(h):
                    horizon_labels = presented_labels[h_index]

                    for index in validation_range:
                        i = index - first_sample
                        # Training must stop `horizon` steps before the test point
                        train_end = i - horizon + 1
                        x_train = weighted_samples[:train_end, :]
                        x_validate = weighted_samples[i, :]
                        y_train = horizon_labels[:train_end]
                        y_validate = horizon_labels[i]

                        # Train: a separate model for each horizon
                        train_started = time.time()
                        regressor = KNeighborsRegressor(n_neighbors=k)
                        model[h_index] = regressor
                        regressor.fit(x_train, y_train)
                        train_finished = time.time()
                        train_time = train_finished - train_started

                        # Test
                        y_predict = regressor.predict(x_validate.reshape(1, -1))
                        test_time = time.time() - train_finished

                        # Record results in absolute (de-normalised) terms
                        actual = remove_presentation(
                            y_validate, normalizer[i], sample_presentation)
                        predicted = remove_presentation(
                            y_predict, normalizer[i], sample_presentation)
                        residual = actual - predicted
                        error_list[h_index].append(residual)
                        relative_error_list[h_index].append(residual / actual)

                        # A "hit": label and prediction lie on the same side
                        # of the last feature value
                        last_feature = x_validate[-1]
                        same_side = (y_validate - last_feature) * (y_predict - last_feature) > 0
                        hit_list[h_index].append(same_side)

                        total_train_time[h_index] += train_time
                        total_test_time[h_index] += test_time

                        if i == len(horizon_labels) - 1:
                            # Very last point: refit including it, then stop
                            regressor.fit(weighted_samples[:i + 1, :],
                                          horizon_labels[:i + 1])
                            break

                completed_steps += 1
                update_progress(100.0 * completed_steps/total_steps)

                # Keep the last trained models and extend the summary
                summary_rows = summarize(
                    self, model, error_list, relative_error_list, hit_list,
                    params, total_train_time, total_test_time)
                training_summary = training_summary.append(summary_rows)

    return training_summary, make_presentable(training_summary, self.summary_name)
RegressionUniformBlending.py 文件源码
项目:AirTicketPredicting
作者: junlulocky
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def __init__(self, isTrain):
    """Create the base regressors combined by the uniform blending model.

    :param isTrain: forwarded unchanged to the parent class constructor
    """
    super(RegressionUniformBlending, self).__init__(isTrain)
    # data preprocessing (currently disabled)
    #self.dataPreprocessing()

    # Network topology: input -> a single hidden dense layer -> dense output.
    # (Additional 'hidden2' / 'hidden3' dense layers with 8 and 4 units were
    # tried previously and are disabled.)
    network_layers = [
        ('input', layers.InputLayer),
        ('hidden', layers.DenseLayer),
        ('output', layers.DenseLayer),
    ]
    self.net1 = NeuralNet(
        layers=network_layers,
        # layer parameters
        input_shape=(None, 13),                 # 13 input features
        hidden_num_units=6,                     # units in the hidden layer
        output_nonlinearity=None,               # no nonlinearity on the output
        output_num_units=1,                     # single regression output
        # objective function
        objective_loss_function=lasagne.objectives.squared_error,
        # optimisation method
        update=lasagne.updates.nesterov_momentum,
        update_learning_rate=0.002,
        update_momentum=0.4,
        # hold out 20% of the training data for evaluation
        train_split=TrainSplit(eval_size=0.2),
        regression=True,                        # regression, not classification
        max_epochs=100,                         # number of training epochs
        verbose=0,
    )

    # Linear regression
    self.linRegr = linear_model.LinearRegression()

    # Distance-weighted KNN regression with 86 neighbours
    self.knn = neighbors.KNeighborsRegressor(86, weights='distance')

    # Decision tree regression
    self.decisionTree = DecisionTreeRegressor(max_depth=7, max_features=None)

    # AdaBoost over depth-10 decision trees, with a fixed random seed
    boosted_base = DecisionTreeRegressor(max_depth=10)
    self.adaReg = AdaBoostRegressor(boosted_base,
                                    n_estimators=400,
                                    random_state=np.random.RandomState(1))

    # Random forest regression
    self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
def __init__(self, S, A, maxlen=1000, mode=None, embedding_dim=1, **kwargs):
    """Create the Q-value storage for state space ``S`` and action space ``A``.

    :param S: state space
    :param A: action space
    :param maxlen: number of rows allocated in 'tables' mode (replaced by the
        state-space dimension in 'array' mode)
    :param mode: one of 'array', 'dictionary', 'tables', 'action_tables'.
        When ``None`` it is inferred: 'array' if both spaces are
        ``gym.spaces.Discrete``, 'dictionary' if only ``A`` is.
    :param embedding_dim: width of a stored state in 'tables' mode
    :param kwargs: forwarded to the parent constructor
    :raises NotImplementedError: if the mode is unknown or cannot be inferred
    """
    super(TableQ2, self).__init__(**kwargs)
    self.S = S
    self.A = A

    # Resolve the mode first and assign it exactly once. Previously an
    # explicit `self.mode = mode` could clobber the inferred value (or
    # self.mode could be left unset), so mode=None never worked reliably.
    if mode is None:
        if isinstance(S, gym.spaces.Discrete) and isinstance(A, gym.spaces.Discrete):
            mode = 'array'
        elif isinstance(A, gym.spaces.Discrete):
            mode = 'dictionary'
    self.mode = mode

    self.maxlen = maxlen
    self.embedding_dim = embedding_dim

    if self.mode == 'array':
        # Dense (state, action) table
        s_dim = get_space_dim(S)
        a_dim = get_space_dim(A)
        self.table = np.zeros((s_dim, a_dim))
        self.maxlen = s_dim
    elif self.mode == 'dictionary':
        # Maps a state key to a vector of action values
        self.table = {0: np.zeros(self.A.n)}
    elif self.mode == 'tables':
        # One shared KNN regressor over fixed-size state/value buffers
        self.k = 4
        self.neigh = KNeighborsRegressor(n_neighbors=self.k)
        self.states = np.zeros((self.maxlen, self.embedding_dim))
        self.values = np.zeros((self.maxlen, self.A.n))
        self.recency = np.zeros((self.maxlen,))
        self.i = 0
    elif self.mode == 'action_tables':
        # One [states, values, regressor, recency] record per action.
        # `range` replaces the Python-2-only `xrange`.
        self.k = 4
        self.action_tables = [[[], [], KNeighborsRegressor(n_neighbors=self.k), []]
                              for _ in range(self.A.n)]
        # A disabled earlier version pre-seeded every table with self.k dummy
        # samples (state of ones / 1, value 0, recency 0) and fitted the
        # regressor on them via self._list_to_sklearn.
    else:
        # Call form works on Python 2 and 3 (was: `raise X, 'msg'`)
        raise NotImplementedError('Sorry, TableQ only supports three modes.')
def plot(k=1, xyzFile='xyz_synth_surf.txt', write=False):
    """Plot scattered xyz points and a KNN-interpolated surface through them.

    The file is read as whitespace-separated ``x y z`` rows. A distance-
    weighted KNN regressor (Manhattan metric, ``p=1``) is fitted on (x, y) -> z
    and evaluated on a regular grid covering the data extent.

    :param k: number of neighbours used by the regressor
    :param xyzFile: path of the input file
    :param write: when true, write the gridded surface as ``x y z`` rows to
        ``'knn_' + xyzFile``
    """
    with open(xyzFile) as f:
        xyz = np.float64([row.split() for row in f])

    # Figure 1: raw points coloured by z, first 50 points highlighted
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xyz[:, 0], xyz[:, 1], xyz[:, 2], c=xyz[:, 2], marker='o', linewidths=0)
    ax.plot(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', ms=3)

    # Regular grid over the data extent; the step is 1/100 of the x range.
    x_min, x_max = np.min(xyz[:, 0]), np.max(xyz[:, 0])
    # The y axis previously reused the x column, so the grid only matched the
    # data when both coordinates happened to span the same range.
    y_min, y_max = np.min(xyz[:, 1]), np.max(xyz[:, 1])
    step = (x_max - x_min) / 100.
    x_ = np.arange(x_min, x_max, step)
    y_ = np.arange(y_min, y_max, step)
    xx, yy = np.meshgrid(x_, y_)
    xy = np.append(xx.ravel()[:, np.newaxis], yy.ravel()[:, np.newaxis], 1)

    knn = neighbors.KNeighborsRegressor(k, weights='distance', p=1)
    z_ = knn.fit(xyz[:, :2], xyz[:, 2]).predict(xy)

    # Figure 2: interpolated surface. `plt.cm.spectral` was removed from
    # matplotlib; `nipy_spectral` is the same colormap under its current name.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(xx, yy, z_.reshape(np.shape(xx)), rstride=1, cstride=1,
                    cmap=plt.cm.nipy_spectral, linewidth=0, antialiased=False)
    ax.plot(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', ms=3)

    if write:
        with open('knn_'+ xyzFile, 'w') as f:
            f.writelines('%f %f %f\n' % point
                         for point in zip(xx.ravel(), yy.ravel(), z_))
#################################################################
#calculate a distance matrix based on variation of information
def main():
    """Train one KNN regressor per (tollgate_id, direction) group and predict.

    ``train.csv`` and ``test2.csv`` are enriched with the date and time-series
    features, then split by tollgate and direction. For each group the train
    and test rows are stacked, passed through ``feature_transform_knn``, and
    split back apart using the number of training rows. All group predictions
    are concatenated and written to
    ``result/result_split_knn<mean prediction>.csv``.

    :raises KeyError: if a training group has no matching test group
    """
    df_test = pd.read_csv("test2.csv")
    df_train = pd.read_csv("train.csv")
    df_ts = pd.read_csv("ts_feature2_simple.csv")
    df_date = pd.read_csv("date.csv")

    ts_keys = ["tollgate_id", "hour", "miniute", "direction"]
    df_train = df_train.merge(df_date, on="date", how="left")
    df_train = df_train.merge(df_ts, on=ts_keys, how="left")
    df_test = df_test.merge(df_date, on="date", how="left")
    df_test = df_test.merge(df_ts, on=ts_keys, how="left")

    group_keys = ["tollgate_id", "direction"]
    df_test_grouped = df_test.groupby(group_keys)

    result = []
    for key, train_data in df_train.groupby(group_keys):
        # Copy so that assigning "volume" below does not write into a slice
        # of df_test (SettingWithCopyWarning)
        test_data = df_test_grouped.get_group(key).copy()
        len_train = len(train_data)

        # Stack test rows under the training rows, restricted to the training
        # columns. DataFrame.append was removed in pandas 2.0.
        stacked = pd.concat([train_data, test_data])[train_data.columns.tolist()]
        stacked = feature_transform_knn(key, stacked)
        stacked = stacked.reset_index(drop=True)

        # Equivalent of the removed ``.ix`` indexer: rows by (inclusive)
        # integer label, columns by position. Features start at position 8.
        train_rows = stacked.loc[:len_train - 1]
        y = train_rows["volume"]
        x = train_rows.iloc[:, 8:]
        print(x.head())
        x1 = stacked.loc[len_train:].iloc[:, 8:]

        regressor_cubic = KNeighborsRegressor(n_neighbors=8, algorithm="auto")
        regressor_cubic.fit(x, y)
        test_data["volume"] = regressor_cubic.predict(x1)
        result.append(test_data[['tollgate_id', 'time_window', 'direction', 'volume']])

    df_result = pd.concat(result, axis=0)
    mean_volume = np.mean(df_result["volume"])
    print(mean_volume)
    df_result.to_csv("result/result_split_knn"+str(mean_volume)+".csv", index=False)
    # The old trailing `print np.mean(oob)` was dropped: `oob` was never
    # filled, so it only printed nan with a RuntimeWarning.
def test_precomputed(random_state=42):
    """Check neighbors estimators agree between feature and precomputed input.

    Neighbour queries and supervised predictions made from a precomputed
    Euclidean distance matrix must match those made from the raw features.
    """
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    train = rng.random_sample((10, 4))
    query = rng.random_sample((3, 4))
    dist_train = metrics.pairwise_distances(train, metric='euclidean')
    dist_query = metrics.pairwise_distances(query, train, metric='euclidean')

    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # Reference: fitted on the feature matrix (n_samples by n_features)
        feature_nn = neighbors.NearestNeighbors(n_neighbors=3)
        feature_nn.fit(train)
        dist_X, ind_X = getattr(feature_nn, method)(query)

        # Same query on the dense distance matrix (n_samples by n_samples),
        # first with the brute-force algorithm, then with 'auto'
        for algorithm in ('brute', 'auto'):
            precomputed_nn = neighbors.NearestNeighbors(
                n_neighbors=3, algorithm=algorithm, metric='precomputed')
            precomputed_nn.fit(dist_train)
            dist_D, ind_D = getattr(precomputed_nn, method)(dist_query)
            assert_array_almost_equal(dist_X, dist_D)
            assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction (precomputed_nn is the 'auto' instance)
        dist_X, ind_X = getattr(feature_nn, method)(None)
        dist_D, ind_D = getattr(precomputed_nn, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(precomputed_nn, method), train)

    target = np.arange(train.shape[0])
    supervised_estimators = (
        neighbors.KNeighborsClassifier,
        neighbors.RadiusNeighborsClassifier,
        neighbors.KNeighborsRegressor,
        neighbors.RadiusNeighborsRegressor,
    )
    for Est in supervised_estimators:
        print(Est)
        est = Est(metric='euclidean')
        # Set both so the same code serves k-neighbors and radius estimators
        est.radius = 1
        est.n_neighbors = 1
        pred_X = est.fit(train, target).predict(query)
        est.metric = 'precomputed'
        pred_D = est.fit(dist_train, target).predict(dist_query)
        assert_array_almost_equal(pred_X, pred_D)
def test_neighbors_badargs():
    """Bad argument values must raise ``ValueError`` in every neighbors estimator.

    Uses the module-level ``rng`` to generate the sample data.
    """
    assert_raises(ValueError,
                  neighbors.NearestNeighbors,
                  algorithm='blah')

    X = rng.random_sample((10, 2))
    Xsparse = csr_matrix(X)
    y = np.ones(10)

    k_neighbors_classes = (neighbors.KNeighborsClassifier,
                           neighbors.KNeighborsRegressor)

    for cls in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        # Invalid constructor arguments
        assert_raises(ValueError, cls, weights='blah')
        assert_raises(ValueError, cls, p=-1)
        assert_raises(ValueError, cls, algorithm='blah')

        # Metric not usable with this data / sparse input
        nbrs = cls(algorithm='ball_tree', metric='haversine')
        assert_raises(ValueError, nbrs.predict, X)
        assert_raises(ValueError, ignore_warnings(nbrs.fit), Xsparse, y)

        # Empty and 3-dimensional training data
        nbrs = cls()
        assert_raises(ValueError, nbrs.fit, np.ones((0, 2)), np.ones(0))
        assert_raises(ValueError, nbrs.fit, X[:, :, None], y)

        # Empty query after a valid fit
        nbrs.fit(X, y)
        assert_raises(ValueError, nbrs.predict, [[]])

        # `cls` is a class, not an instance: the former isinstance() test was
        # always False, so this negative-n_neighbors check never ran.
        # NOTE(review): needs a scikit-learn release that validates
        # n_neighbors in fit - confirm against the pinned version.
        if issubclass(cls, k_neighbors_classes):
            nbrs = cls(n_neighbors=-1)
            assert_raises(ValueError, nbrs.fit, X, y)

    nbrs = neighbors.NearestNeighbors().fit(X)

    assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah')
    assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah')
def get_model_list(task_name):
    """Return the full set of candidate regressors and their display names.

    Args:
        task_name: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(model_list, name_list)`` - two parallel lists, in the order
        the models are declared below.
    """
    candidates = [
        # Linear models
        ('LR', linear_model.LinearRegression()),
        ('LR_SGD', linear_model.SGDRegressor()),
        ('Lasso', linear_model.Lasso(alpha=1.0)),
        ('Ridge', linear_model.Ridge(alpha=1.0)),
        ('LassoLars', linear_model.LassoLars(alpha=.1)),
        ('BayesianRidge', linear_model.BayesianRidge()),
        # Kernel / Gaussian-process models
        ('KernelRidge', KernelRidge(alpha=1.0)),
        ('GaussianProcess',
         gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)),
        # Nearest neighbours
        ('KNN_unif', KNeighborsRegressor(weights='uniform', n_neighbors=3)),
        ('KNN_dist', KNeighborsRegressor(weights='distance', n_neighbors=3)),
        # Support vector regression
        ('SVM_linear', SVR(kernel='linear', C=1, gamma='auto', coef0=0, degree=2)),
        ('SVM_poly', SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2)),
        ('SVM_rbf', SVR(kernel='rbf', C=1, gamma='auto', coef0=0, degree=2)),
        # Tree-based models
        ('DT', DecisionTreeRegressor()),
        ('RF', RandomForestRegressor(n_estimators=100, max_depth=None,
                                     min_samples_split=2, random_state=0)),
        ('ET', ExtraTreesRegressor(n_estimators=100, max_depth=None,
                                   max_features='auto', min_samples_split=2,
                                   random_state=0)),
    ]
    model_list = [estimator for _, estimator in candidates]
    name_list = [label for label, _ in candidates]
    return model_list, name_list