def rfr_feature_select():
    """Rank each Boston feature by the cross-validated R^2 of a random
    forest trained on that feature alone."""
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        # Score the forest on a single feature column at a time
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(n_splits=3, test_size=0.3))
        scores.append((round(np.mean(score), 3), names[i]))
    print(sorted(scores, reverse=True))
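The function above scores each feature in isolation. For a cross-check, here is a minimal sketch of the forest's built-in impurity-based ranking on the same data; feature_importances_ is a standard scikit-learn attribute, not part of the snippet above:

import numpy as np
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

boston = load_boston()
X, Y, names = boston["data"], boston["target"], boston["feature_names"]
rf = RandomForestRegressor(n_estimators=20, max_depth=4).fit(X, Y)
# feature_importances_ sums the impurity reduction contributed by each feature
print(sorted(zip(np.round(rf.feature_importances_, 3), names), reverse=True))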
def test_boston(self):
    from sklearn.datasets import load_boston
    from sklearn.preprocessing import StandardScaler

    scikit_data = load_boston()
    scikit_model = StandardScaler().fit(scikit_data.data)
    # `converter` and `evaluate_transformer` are supplied by the enclosing
    # coremltools test module.
    spec = converter.convert(scikit_model, scikit_data.feature_names, 'out').get_spec()
    input_data = [dict(zip(scikit_data.feature_names, row))
                  for row in scikit_data.data]
    output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]
    metrics = evaluate_transformer(spec, input_data, output_data)
    assert metrics["num_errors"] == 0
test_random_forest_classifier_numeric.py — project: coremltools, author: apple
def setUpClass(self):
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeClassifier
    import numpy as np

    # Load data and train model
    scikit_data = load_boston()
    self.X = scikit_data.data.astype('f').astype('d')  # scikit-learn downcasts data
    t = scikit_data.target
    num_classes = 3
    # Bin the continuous target into num_classes discrete labels
    target = np.digitize(t, np.histogram(t, bins=num_classes - 1)[1]) - 1

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.feature_names = scikit_data.feature_names
    self.output_name = 'target'
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    scikit_data = load_boston()
    scikit_model = RandomForestClassifier(random_state=1)
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.scikit_model = scikit_model
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return
    if not HAS_LIBSVM:
        return

    scikit_data = load_boston()
    prob = svmutil.svm_problem(scikit_data['target'], scikit_data['data'].tolist())
    param = svmutil.svm_parameter()
    param.svm_type = svmutil.NU_SVR
    param.kernel_type = svmutil.LINEAR
    param.eps = 1
    self.libsvm_model = svmutil.svm_train(prob, param)
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    import numpy as np

    scikit_data = load_boston()
    # Imputer here is the legacy sklearn.preprocessing.Imputer API
    scikit_model = Imputer(strategy='most_frequent', axis=0)
    scikit_data['data'][1, 8] = np.nan
    input_data = scikit_data['data'][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    feature_names = scikit_data.feature_names
    scikit_model = LinearRegression()
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def test_boston_OHE_plus_trees(self):
    data = load_boston()
    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1))])
    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'target')

    # Get predictions
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['prediction'] = pl.predict(data.data)

    # Evaluate it
    result = evaluate_regressor(spec, df, 'target', verbose=False)
    assert result["max_error"] < 0.0001
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_XGBOOST:
        return
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    dtrain = xgboost.DMatrix(scikit_data.data, label=scikit_data.target,
                             feature_names=scikit_data.feature_names)
    xgb_model = xgboost.train({}, dtrain, 1)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.xgb_model = xgb_model
    self.feature_names = self.scikit_data.feature_names
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return
    if not HAS_LIBSVM:
        return

    scikit_data = load_boston()
    prob = svmutil.svm_problem(scikit_data['target'], scikit_data['data'].tolist())
    param = svmutil.svm_parameter()
    param.svm_type = svmutil.EPSILON_SVR
    param.kernel_type = svmutil.LINEAR
    param.eps = 1
    self.libsvm_model = svmutil.svm_train(prob, param)
def test_boston_OHE(self):
    data = load_boston()
    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        model = OneHotEncoder(categorical_features=categorical_features, sparse=False)
        model.fit(data.data, data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()
        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data)]
        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
# This test still isn't working
def test_boston_OHE_pipeline(self):
    data = load_boston()
    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        model = Pipeline([("OHE", OneHotEncoder(categorical_features=categorical_features)),
                          ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()
        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]
        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    feature_names = scikit_data.feature_names
    scikit_model = LinearRegression()
    scikit_model.fit(scikit_data['data'], scikit_data['target'])
    scikit_spec = converter.convert(scikit_model, feature_names, 'target').get_spec()

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
    self.scikit_spec = scikit_spec
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return
    if not HAS_LIBSVM:
        return

    scikit_data = load_boston()
    # Binarize the continuous target around its mean to get class labels
    prob = svmutil.svm_problem(scikit_data['target'] > scikit_data['target'].mean(),
                               scikit_data['data'].tolist())
    param = svmutil.svm_parameter()
    param.svm_type = svmutil.C_SVC
    param.kernel_type = svmutil.LINEAR
    param.eps = 1
    libsvm_model = svmutil.svm_train(prob, param)
    libsvm_spec = libsvm_converter.convert(libsvm_model, scikit_data.feature_names, 'target').get_spec()

    # Save the data and the model
    self.scikit_data = scikit_data
    self.libsvm_spec = libsvm_spec
test_boosted_trees_classifier_numeric.py — project: coremltools, author: apple
def setUpClass(self):
    from sklearn.datasets import load_boston
    import numpy as np

    # Load data and train model
    scikit_data = load_boston()
    num_classes = 3
    self.X = scikit_data.data.astype('f').astype('d')  # scikit-learn downcasts data
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t, bins=num_classes - 1)[1]) - 1

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.feature_names = scikit_data.feature_names
    self.output_name = 'target'
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    import numpy as np

    scikit_data = load_boston()
    scikit_model = DecisionTreeClassifier(random_state=1)
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.scikit_model = scikit_model
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    import numpy as np

    scikit_data = load_boston()
    scikit_model = GradientBoostingClassifier(random_state=1)
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)
    self.target = target

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
test_decision_tree_classifier_numeric.py — project: coremltools, author: apple
def setUpClass(self):
    from sklearn.datasets import load_boston
    import numpy as np

    # Load data and train model
    scikit_data = load_boston()
    num_classes = 3
    self.X = scikit_data.data.astype('f').astype('d')  # scikit-learn downcasts data
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t, bins=num_classes - 1)[1]) - 1

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.feature_names = scikit_data.feature_names
    self.output_name = 'target'
def setUp(self):
    self.X, self.y = load_boston(return_X_y=True)
    self.regressor_settings = [
        'sklearn_random_forest_regressor',
        'sklearn_extra_trees_regressor',
        'sklearn_bagging_regressor',
        'sklearn_GP_regressor',
        'sklearn_ridge_regressor',
        'sklearn_lasso_regressor',
        'sklearn_kernel_ridge_regressor',
        'sklearn_knn_regressor',
        'sklearn_svr_regressor',
        'sklearn_decision_tree_regressor',
        'sklearn_linear_regression',
        'sklearn_adaboost_regressor',
        'xgboost_regressor',
    ]
def test_onehot():
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=333)
    train = pd.DataFrame(X_train)
    test = pd.DataFrame(X_test)

    t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True),
                                      [8, 1, 12], full=False, dummy_na=True)
    assert t_train.shape[1] == t_test.shape[1]
    assert t_train.shape[1] == 441

    t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True),
                                      [8, 1, 12], full=True, dummy_na=False)
    assert t_train.shape[1] == t_test.shape[1]
    assert t_train.shape[1] == 500
def test_few_fit_shapes():
    """test_few.py: fit and predict return correct shapes"""
    np.random.seed(202)

    # load example data
    boston = load_boston()
    d = pd.DataFrame(data=boston.data)
    print("feature shape:", boston.data.shape)

    learner = FEW(generations=1, population_size=5,
                  mutation_rate=0.2, crossover_rate=0.8,
                  ml=LassoLarsCV(), min_depth=1, max_depth=3,
                  sel='epsilon_lexicase', tourn_size=2,
                  random_state=0, verbosity=0,
                  disable_update_check=False, fit_choice='mse')
    score = learner.fit(boston.data[:300], boston.target[:300])
    print("learner:", learner._best_estimator)
    yhat_test = learner.predict(boston.data[300:])
    test_score = learner.score(boston.data[300:], boston.target[300:])
    print("train score:", score, "test score:", test_score,
          "test r2:", r2_score(boston.target[300:], yhat_test))
    assert yhat_test.shape == boston.target[300:].shape
def test_few_with_parents_weight():
    """test_few.py: FEW runs without error with parent pressure for selection"""
    np.random.seed(1006987)

    boston = load_boston()
    d = np.column_stack((boston.data, boston.target))
    np.random.shuffle(d)
    features = d[:, 0:-1]
    target = d[:, -1]
    print("feature shape:", boston.data.shape)

    learner = FEW(generations=1, population_size=5,
                  mutation_rate=1, crossover_rate=1,
                  ml=LassoLarsCV(), min_depth=1, max_depth=3,
                  sel='tournament', fit_choice='r2', tourn_size=2,
                  random_state=0, verbosity=0,
                  disable_update_check=False, weight_parents=True)
    learner.fit(features[:300], target[:300])
    few_score = learner.score(features[:300], target[:300])
    test_score = learner.score(features[300:], target[300:])
    print("few score:", few_score)
    print("few test score:", test_score)
plot_model_complexity_influence.py — project: Parallel-SGD, author: angadgill
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data
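A short usage sketch for generate_data, assuming the imports from the surrounding script (datasets, shuffle, csr_matrix, numpy) are in scope:

dense = generate_data('regression')                     # Boston housing, dense arrays
sparse = generate_data('classification', sparse=True)   # 20 newsgroups, CSR matrices
print(dense['X_train'].shape, sparse['X_train'].shape)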
def test_score_sample_weight():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import DecisionTreeRegressor
    from sklearn import datasets

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]
    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the score with and without sample weights are different
        assert_not_equal(est.score(ds.data, ds.target),
                         est.score(ds.data, ds.target,
                                   sample_weight=sample_weight),
                         msg="Unweighted and weighted scores "
                             "are unexpectedly equal")
def test_warm_start_convergence_with_regularizer_decrement():
    boston = load_boston()
    X, y = boston.data, boston.target

    # Train a model to converge on a lightly regularized problem
    final_alpha = 1e-5
    low_reg_model = ElasticNet(alpha=final_alpha).fit(X, y)

    # Fit a new model on a more regularized version of the same problem.
    # Fitting with high regularization is easier, so it should converge
    # faster in general.
    high_reg_model = ElasticNet(alpha=final_alpha * 10).fit(X, y)
    assert_greater(low_reg_model.n_iter_, high_reg_model.n_iter_)

    # Fit the solution to the original, less regularized version of the
    # problem, but starting from the solution of the highly regularized
    # variant as a better initial point. This should also converge faster
    # than the original model that starts from zero.
    warm_low_reg_model = deepcopy(high_reg_model)
    warm_low_reg_model.set_params(warm_start=True, alpha=final_alpha)
    warm_low_reg_model.fit(X, y)
    assert_greater(low_reg_model.n_iter_, warm_low_reg_model.n_iter_)
def make_rg_dataset_and_field_manager(self):
    boston = datasets.load_boston()
    dataset = DataSet(boston.data, boston.target, boston.feature_names, "price")
    feature_fields = []
    for i, name in enumerate(dataset.feature_names):
        f = Field(name, "NUMBER",
                  value_mean=np.mean(dataset.data[:, i]),
                  value_std=np.std(dataset.data[:, i]))
        feature_fields.append(f)
    target = Field("price", "NUMBER",
                   value_mean=np.mean(dataset.target),
                   value_std=np.std(dataset.target))
    field_manager = FieldManager(-1, feature_fields, target)
    return dataset, field_manager
def get_boston_regression_dataset():
    boston = load_boston()
    df_boston = pd.DataFrame(boston.data)
    df_boston.columns = boston.feature_names
    df_boston['MEDV'] = boston['target']
    df_boston_train, df_boston_test = train_test_split(df_boston, test_size=0.33,
                                                       random_state=42)
    return df_boston_train, df_boston_test
def get_boston_dataset():
    boston = load_boston()
    df_boston = pd.DataFrame(boston.data)
    df_boston.columns = boston.feature_names
    df_boston['MEDV'] = boston['target']
    df_boston_train, df_boston_test = train_test_split(df_boston, test_size=0.2,
                                                       random_state=42)
    return df_boston_train, df_boston_test
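A minimal usage sketch for the two loaders above; the 'MEDV' column name and split sizes come straight from the function bodies:

df_train, df_test = get_boston_dataset()
X_train = df_train.drop('MEDV', axis=1)
y_train = df_train['MEDV']
print(X_train.shape, y_train.shape)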
def load_boston_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Load the Boston housing dataset into a dataframe, with the
    target stored as the "target" column or under whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target column

    tgt_name : str, optional (default="target")
        The name of the target column

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------
    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The loaded dataset
    """
    bo = load_boston()
    X = pd.DataFrame.from_records(data=bo.data, columns=bo.feature_names)
    if include_tgt:
        X[tgt_name] = bo.target
    return X if not shuffle else shuffle_dataframe(X)
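A short usage sketch for load_boston_df; the 'MEDV' target name here is just an illustrative choice:

df = load_boston_df(include_tgt=True, tgt_name='MEDV', shuffle=False)
print(df.columns.tolist())          # 13 feature columns plus 'MEDV'
features_only = load_boston_df(include_tgt=False)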
def lession_5():
    # db = datasets.load_boston()
    # print(db.data.shape)
    # data_X = db.data
    # data_y = db.target
    # model = LinearRegression()
    # model.fit(data_X, data_y)
    # print(model.predict(data_X[:8]))
    # print(data_y[:8])
    X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
    plt.scatter(X, y)
    plt.show()