def main():
    """Compare a decision tree and a k-nearest-neighbours classifier on iris.

    Loads the iris dataset, holds out half of the samples for testing, fits
    each classifier on the training half and prints its accuracy on the
    held-out half as a percentage.
    """
    dataset = datasets.load_iris()
    features, labels = dataset.data, dataset.target
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=.5)

    # Fit both models on the same training half before scoring either one.
    tree_model = tree.DecisionTreeClassifier().fit(x_train, y_train)
    tree_predictions = tree_model.predict(x_test)

    knn_model = KNeighborsClassifier().fit(x_train, y_train)
    knn_predictions = knn_model.predict(x_test)

    # Prediction accuracy
    tree_pct = accuracy_score(y_test, tree_predictions) * 100
    print("Accuracy for Decision Tree Classifier: " + str(tree_pct) + "%")
    knn_pct = accuracy_score(y_test, knn_predictions) * 100
    print("Accuracy for KNeighbors Classifier: " + str(knn_pct) + "%")
python类load_iris()的实例源码
def main():
    """Fit ``NewClassifier`` on half of iris and print its test accuracy.

    The dataset is split 50/50 into training and test halves; the accuracy
    on the test half is printed as a percentage.
    """
    dataset = datasets.load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=.5)

    model = NewClassifier()
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)

    # Prediction accuracy
    percent = accuracy_score(y_test, predicted) * 100
    print("Accuracy: " + str(percent) + "%")
# Run main
def test_pipeline_transform():
    """Check a pipeline whose last step is a transformer.

    A single-step PCA pipeline must give the same output as the bare PCA
    for ``fit(...).transform``, ``fit_transform`` and ``inverse_transform``.
    """
    data = load_iris().data
    reducer = PCA(n_components=2, svd_solver='full')
    pipe = Pipeline([('pca', reducer)])

    # Forward direction: three routes to the projected data must agree.
    via_fit_then_transform = pipe.fit(data).transform(data)
    via_fit_transform = pipe.fit_transform(data)
    via_bare_pca = reducer.fit_transform(data)
    assert_array_almost_equal(via_fit_then_transform, via_fit_transform)
    assert_array_almost_equal(via_fit_then_transform, via_bare_pca)

    # Backward direction: pipeline and bare PCA must invert identically.
    restored_by_pipe = pipe.inverse_transform(via_fit_then_transform)
    restored_by_pca = reducer.inverse_transform(via_fit_then_transform)
    assert_array_almost_equal(restored_by_pipe, restored_by_pca)
def main():
    """Train a decision tree on iris minus three rows and predict those rows.

    Rows 0, 50 and 100 are removed from the training data, the tree is
    fitted on the remainder, and the predictions for the three held-out
    rows are printed.
    """
    iris = load_iris()
    held_out = [0, 50, 100]

    # Testing data: just the held-out rows.
    test_data = iris.data[held_out]
    test_target = iris.target[held_out]  # not used below; kept from the original

    # Training data: everything except the held-out rows.
    train_data = np.delete(iris.data, held_out, axis=0)
    train_target = np.delete(iris.target, held_out)

    # Train classifier
    model = tree.DecisionTreeClassifier().fit(train_data, train_target)
    print(model.predict(test_data))
# Run main
def test_bagged_imputer_classification():
    """Check that ``BaggedCategoricalImputer`` fills every missing label.

    Builds a DataFrame from iris with a ``species`` column, shuffles it,
    blanks out roughly 15% of the ``species`` values at random, and asserts
    that ``fit_transform`` leaves no nulls in that column, first with the
    default base estimator and then with a ``RandomForestClassifier``.
    """
    iris = load_iris()

    # make DF, add species col
    X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)
    X['species'] = iris.target

    # shuffle...
    X = shuffle_dataframe(X)

    # Cast to float up-front so the column can hold NaN without relying on
    # an implicit dtype upcast during assignment.
    X['species'] = X['species'].astype(float)

    # set random indices to be null.. 15% should be good
    mask = np.random.rand(X.shape[0]) > 0.85
    # Single .loc assignment on the frame itself: the chained form
    # X['species'].iloc[mask] = ... can write to a temporary copy instead.
    X.loc[mask, 'species'] = np.nan

    # define imputer, assert no missing
    imputer = BaggedCategoricalImputer(cols=['species'])
    y = imputer.fit_transform(X)
    assert y['species'].isnull().sum() == 0, 'expected no null...'

    # now test with a different estimator
    imputer = BaggedCategoricalImputer(cols=['species'],
                                       base_estimator=RandomForestClassifier())
    y = imputer.fit_transform(X)
    assert y['species'].isnull().sum() == 0, 'expected no null...'
def test_make_grid_search():
    """Check ``make_grid_search`` for one estimator and for a list of them.

    Three searches are built: a logistic regression with an empty grid,
    an SVC with a two-value ``degree`` grid, and both estimators together.
    Each is fitted on iris and checked for its ``cv`` setting and number
    of parameter combinations; the combined search is then checked for how
    its results are attributed to each estimator.
    """
    X, y = load_iris(return_X_y=True)
    lr = LogisticRegression()
    svc = set_grid(SVC(), kernel=['poly'], degree=[2, 3])

    searches = (
        make_grid_search(lr, cv=5),  # empty grid
        make_grid_search(svc, cv=5),
        make_grid_search([lr, svc], cv=5),
    )
    expected_counts = (1, 2, 3)
    for search, expected in zip(searches, expected_counts):
        search.fit(X, y)
        assert search.cv == 5
        assert len(search.cv_results_['params']) == expected

    # In the combined search, two of the three candidates belong to the SVC.
    combined = searches[2]
    is_svc = combined.cv_results_['param_root'] == svc
    assert is_svc.sum() == 2
    assert combined.cv_results_['param_root__degree'][is_svc].tolist() == [2, 3]
    assert combined.cv_results_['param_root'][~is_svc].tolist() == [lr]
decision_tree_classifier.py 文件源码
项目:ML-From-Scratch
作者: eriklindernoren
项目源码
文件源码
阅读 40
收藏 0
点赞 0
评论 0
def main():
    """Fit ``ClassificationTree`` on iris, print its accuracy and plot it.

    40% of the samples are held out for testing; the test predictions are
    scored with ``accuracy_score`` and handed to ``Plot().plot_in_2d``.
    """
    print("-- Classification Tree --")

    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4)

    model = ClassificationTree()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    plot_options = dict(title="Decision Tree",
                        accuracy=accuracy,
                        legend_labels=iris.target_names)
    Plot().plot_in_2d(X_test, predicted, **plot_options)
linear_discriminant_analysis.py 文件源码
项目:ML-From-Scratch
作者: eriklindernoren
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def main():
    """Fit ``LDA`` on a two-class subset of iris, print accuracy and plot.

    Samples with target 2 are dropped, a third of the remainder is held
    out for testing, and the test predictions are scored and plotted.
    """
    # Load the dataset
    iris = datasets.load_iris()

    # Three -> two classes: keep only the rows whose target is not 2.
    keep = iris.target != 2
    X = iris.data[keep]
    y = iris.target[keep]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Fit and predict using LDA
    model = LDA()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test, predicted, title="LDA", accuracy=accuracy)
def main():
    """Fit ``LogisticRegression`` on two iris classes, print accuracy, plot.

    Samples with target 0 are dropped, the features are normalized, and the
    remaining targets 1 and 2 are relabelled to 0 and 1. A third of the
    data is held out (``seed=1``) for scoring and plotting.
    """
    # Load dataset
    iris = datasets.load_iris()
    keep = iris.target != 0
    X = normalize(iris.data[keep])
    y = iris.target[keep]

    # Relabel the two remaining classes as 0/1. The order matters: 1 -> 0
    # must happen before 2 -> 1, otherwise every label would end up as 0.
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, seed=1)

    model = LogisticRegression(gradient_descent=True)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, predicted,
                      title="Logistic Regression", accuracy=accuracy)
gradient_boosting_classifier.py 文件源码
项目:ML-From-Scratch
作者: eriklindernoren
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def main():
    """Fit ``GradientBoostingClassifier`` on iris, print accuracy and plot.

    40% of the samples are held out for testing; the test predictions are
    scored with ``accuracy_score`` and handed to ``Plot().plot_in_2d``.
    """
    print("-- Gradient Boosting Classification --")

    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4)

    model = GradientBoostingClassifier()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    plot_options = dict(title="Gradient Boosting",
                        accuracy=accuracy,
                        legend_labels=iris.target_names)
    Plot().plot_in_2d(X_test, predicted, **plot_options)
def main():
    """Fit ``XGBoost`` on iris, print its accuracy and plot the predictions.

    40% of the samples are held out for testing (``seed=2``); the test
    predictions are scored with ``accuracy_score`` and handed to
    ``Plot().plot_in_2d``.
    """
    print("-- XGBoost --")

    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4, seed=2)

    model = XGBoost()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    plot_options = dict(title="XGBoost",
                        accuracy=accuracy,
                        legend_labels=iris.target_names)
    Plot().plot_in_2d(X_test, predicted, **plot_options)
def test():
    """Fit a GBDT regressor on iris and print train/test mean squared error.

    The first 120 rows are used for fitting, and the fitted model is saved
    to ``GBDT.model`` with ``joblib.dump``. Each prediction is printed next
    to its target, followed by the mean squared error, first for the 120
    training rows and then for the remaining rows.
    """
    def report(label, predictions, targets):
        # Print every (prediction, target) pair, then the mean squared error.
        squared_error = 0
        for predicted, actual in zip(predictions, targets):
            # '%s %s' gives the same text as the Python 2 statement
            # ``print a, b`` and also works as a Python 3 call.
            print('%s %s' % (predicted, actual))
            diff = predicted - actual
            squared_error += diff * diff
        print('%s Error: %f' % (label, squared_error / len(predictions)))

    iris = load_iris()
    gbdt = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
    gbdt.fit(iris.data[:120], iris.target[:120])

    # Save GBDT Model
    joblib.dump(gbdt, 'GBDT.model')

    report('Training', gbdt.predict(iris.data[:120]), iris.target[:120])
    report('Test', gbdt.predict(iris.data[120:]), iris.target[120:])
def get_iris(rng=42, tst_size=0.3):
    """Return normalised iris data as train/test ``Instance`` objects.

    Args:
        rng: Passed to ``train_test_split`` as ``random_state``.
        tst_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        A ``(trn, tst)`` pair. Each ``Instance`` is built from the
        transposed feature matrix and a ``(3, n_samples)`` ``uint8`` array
        with a single 1 per column, in the row given by that sample's label.
    """
    def one_hot(labels):
        # One column per sample; the row index of the 1 is the class label.
        encoded = np.zeros((3, len(labels)), dtype='uint8')
        for column, label in enumerate(labels):
            encoded[label, column] = 1
        return encoded

    iris = datasets.load_iris()
    features = iris_normalisation(iris.data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, iris.target, test_size=tst_size, random_state=rng)

    trn = Instance(X_train.T, one_hot(y_train))
    tst = Instance(X_test.T, one_hot(y_test))
    return trn, tst
def test_cross_val_predict():
    """Check ``MLPClassifier`` inside ``cross_val_predict`` for multiclass.

    Out-of-sample probabilities from a shuffled 4-fold split on
    standardised iris must give a per-class ROC AUC of at least 0.96.
    """
    features, labels = load_iris(return_X_y=True)
    targets = LabelBinarizer().fit_transform(labels)
    features = StandardScaler().fit_transform(features)

    unfitted = MLPClassifier(n_epochs=10,
                             solver_kwargs={'learning_rate': 0.05},
                             random_state=4567)
    mlp = unfitted.fit(features, targets)

    folds = KFold(n_splits=4, random_state=457, shuffle=True)
    out_of_sample = cross_val_predict(mlp, features, targets,
                                      cv=folds, method='predict_proba')

    # average=None yields one AUC per class; every one must clear the bar.
    per_class_auc = roc_auc_score(targets, out_of_sample, average=None)
    assert np.all(per_class_auc >= 0.96)
def test_RFECV():
    """Run ``RFECV`` with a ``LinearSVC`` on iris and print what it selected.

    Prints the number of selected features, the support mask, the feature
    ranking and the grid scores of the fitted selector. Returns ``None``.
    """
    iris = load_iris()
    selector = RFECV(estimator=LinearSVC(), cv=3)
    selector.fit(iris.data, iris.target)

    # (message template, fitted attribute) pairs, printed in this order.
    report = (
        ("N_features %s", "n_features_"),
        ("Support is %s", "support_"),
        ("Ranking %s", "ranking_"),
        ("Grid Scores %s", "grid_scores_"),
    )
    for template, attribute in report:
        print(template % getattr(selector, attribute))
def test_few_classification():
    """Smoke-test ``FEW`` in classification mode on iris.

    Fits once with ``population_size='1x'`` and once with boolean output
    (``otype='b'``, ``seed_with_ml=False``, ``population_size='2x'``),
    printing train and test scores each time and the final model at the end.
    """
    def print_scores(model):
        # Score the fitted model on both halves of the split.
        print('train score:', model.score(X[train], y[train]))
        print('test score:', model.score(X[test], y[test]))

    np.random.seed(42)
    X, y = load_iris(return_X_y=True)
    row_ids = np.arange(X.shape[0])
    train, test = train_test_split(row_ids, train_size=0.75, test_size=0.25)

    few = FEW(classification=True, population_size='1x', generations=10)
    few.fit(X[train], y[train])
    print_scores(few)

    # test boolean output
    few = FEW(classification=True, otype='b', population_size='2x',
              seed_with_ml=False, generations=10)
    # Reseed right before fitting, as the first run was seeded at the top.
    np.random.seed(42)
    few.fit(X[train], y[train])
    print_scores(few)
    few.print_model()
def test_base():
    """Check the ``BaseEnsemble`` helpers through a ``BaggingClassifier``.

    After fitting, the estimator list is emptied and rebuilt by hand with
    ``_make_estimator``; a call with ``append=False`` must not grow it.
    """
    ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=3)
    iris = load_iris()
    ensemble.fit(iris.data, iris.target)

    # Empty the list and create estimators manually.
    ensemble.estimators_ = []
    for _ in range(3):
        ensemble._make_estimator()
    # This fourth call is the one expected not to change the length.
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))
    assert_true(isinstance(ensemble[0], Perceptron))
def test_cross_val_score_mask():
    """Compare ``cross_val_score`` for a ``KFold`` object and explicit splits.

    Scores from passing ``KFold(5)`` directly must equal the scores from
    passing the list of splits collected from an identical ``KFold(5)``.
    """
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target

    kfold = KFold(5)
    scores_indices = cross_val_score(svm, X, y, cv=kfold)

    kfold = KFold(5)
    cv_masks = []
    for train, test in kfold.split(X, y):
        # Builtin ``bool`` replaces ``np.bool``, which newer NumPy releases
        # no longer provide.
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = 1
        mask_test[test] = 1
        # NOTE(review): the masks above are built but the index arrays are
        # what get appended, so boolean masks are never actually passed to
        # cross_val_score. Left unchanged; confirm whether that is intended.
        cv_masks.append((train, test))
    scores_masks = cross_val_score(svm, X, y, cv=cv_masks)

    assert_array_equal(scores_indices, scores_masks)
def test_cross_val_score_precomputed():
    """Check ``cross_val_score`` with a precomputed-kernel SVC.

    A precomputed linear Gram matrix must score the same as a linear-kernel
    SVC on the raw features, and invalid kernel inputs must raise
    ``ValueError``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    # test for svm with precomputed kernel
    score_precomputed = cross_val_score(SVC(kernel="precomputed"), gram, y)
    score_linear = cross_val_score(SVC(kernel="linear"), X, y)
    assert_array_equal(score_precomputed, score_linear)

    precomputed_svm = SVC(kernel="precomputed")

    # Error raised for non-square X
    assert_raises(ValueError, cross_val_score, precomputed_svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cross_val_score, precomputed_svm,
                  gram.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    """Check default, accuracy and weighted-F1 scoring of a linear SVC.

    With 5-fold cross-validation on iris, all three scorings are expected
    to give the same per-fold values to two decimal places.
    """
    iris = load_iris()
    clf = SVC(kernel='linear')
    expected = [0.97, 1., 0.97, 0.97, 1.]

    # Default score (should be the accuracy score)
    default_scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(default_scores, expected, 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    accuracy_scores = cross_val_score(clf, iris.data, iris.target,
                                      scoring="accuracy", cv=5)
    assert_array_almost_equal(accuracy_scores, expected, 2)

    # F1 score (classes are balanced so f1_score should be equal to the
    # zero/one score)
    weighted_f1_scores = cross_val_score(clf, iris.data, iris.target,
                                         scoring="f1_weighted", cv=5)
    assert_array_almost_equal(weighted_f1_scores, expected, 2)
def test_cross_val_score_mask():
    """Compare ``cval.cross_val_score`` for a ``KFold`` and explicit splits.

    Scores from passing ``cval.KFold(len(y), 5)`` directly must equal the
    scores from passing the list of splits collected from an identical
    ``KFold``.
    """
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target

    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)

    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        # Builtin ``bool`` replaces ``np.bool``, which newer NumPy releases
        # no longer provide.
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = 1
        mask_test[test] = 1
        # NOTE(review): the masks above are built but the index arrays are
        # what get appended, so boolean masks are never actually passed to
        # cross_val_score. Left unchanged; confirm whether that is intended.
        cv_masks.append((train, test))
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)

    assert_array_equal(scores_indices, scores_masks)
def test_cross_val_score_precomputed():
    """Check ``cval.cross_val_score`` with a precomputed-kernel SVC.

    A precomputed linear Gram matrix must score the same as a linear-kernel
    SVC on the raw features, and invalid kernel inputs must raise
    ``ValueError``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    # test for svm with precomputed kernel
    score_precomputed = cval.cross_val_score(
        SVC(kernel="precomputed"), gram, y)
    score_linear = cval.cross_val_score(SVC(kernel="linear"), X, y)
    assert_array_equal(score_precomputed, score_linear)

    precomputed_svm = SVC(kernel="precomputed")

    # Error raised for non-square X
    assert_raises(ValueError, cval.cross_val_score, precomputed_svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, precomputed_svm,
                  gram.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    """Check default, accuracy and weighted-F1 scoring via ``cval``.

    With 5-fold cross-validation of a linear SVC on iris, all three
    scorings are expected to give the same per-fold values to two decimal
    places.
    """
    iris = load_iris()
    clf = SVC(kernel='linear')
    expected = [0.97, 1., 0.97, 0.97, 1.]

    # Default score (should be the accuracy score)
    default_scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(default_scores, expected, 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    accuracy_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                           scoring="accuracy", cv=5)
    assert_array_almost_equal(accuracy_scores, expected, 2)

    # F1 score (classes are balanced so f1_score should be equal to the
    # zero/one score)
    weighted_f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                              scoring="f1_weighted", cv=5)
    assert_array_almost_equal(weighted_f1_scores, expected, 2)
def test_safe_split_with_precomputed_kernel():
    """Check ``cval._safe_split`` slices a precomputed kernel consistently.

    For the first ``ShuffleSplit`` split, the kernel slice returned for a
    precomputed-kernel SVC must equal the Gram matrix recomputed from the
    feature slices returned for a plain SVC, for both the train block and
    the test-versus-train block.
    """
    feature_clf = SVC()
    kernel_clf = SVC(kernel="precomputed")

    iris = load_iris()
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    splitter = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
    train_idx, test_idx = next(iter(splitter))

    # Train block: kernel rows and columns both restricted to the train set.
    X_train, _ = cval._safe_split(feature_clf, X, y, train_idx)
    K_train, _ = cval._safe_split(kernel_clf, gram, y, train_idx)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))

    # Test block: test rows against train columns.
    X_test, _ = cval._safe_split(feature_clf, X, y, test_idx, train_idx)
    K_test, _ = cval._safe_split(kernel_clf, gram, y, test_idx, train_idx)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
def test_score_sample_weight():
    """Check that ``score`` honours ``sample_weight``.

    A depth-2 decision tree classifier (on iris) and regressor (on the
    Boston data) are each fitted, then scored with and without random
    integer sample weights; the two scores must differ.
    """
    from sklearn import datasets
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]

    for estimator, dataset in zip(estimators, sets):
        estimator.fit(dataset.data, dataset.target)

        # generate random sample weights
        weights = rng.randint(1, 10, size=len(dataset.target))

        # check that the score with and without sample weights are different
        unweighted = estimator.score(dataset.data, dataset.target)
        weighted = estimator.score(dataset.data, dataset.target,
                                   sample_weight=weights)
        assert_not_equal(unweighted, weighted,
                         msg="Unweighted and weighted scores "
                             "are unexpectedly equal")
def test_fit_predict_on_pipeline():
    """Check that ``Pipeline.fit_predict`` matches the steps run by hand.

    Scaling iris and clustering it with ``KMeans`` as two separate calls
    must give the same labels as ``fit_predict`` on a pipeline made of the
    same two objects.
    """
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    labels_by_hand = km.fit_predict(scaler.fit_transform(iris.data))

    # use a pipeline to do the transform and clustering in one step
    steps = [('scaler', scaler), ('Kmeans', km)]
    labels_by_pipeline = Pipeline(steps).fit_predict(iris.data)

    assert_array_almost_equal(labels_by_pipeline, labels_by_hand)
def test_discretenb_provide_prior_with_partial_fit():
    """Check discrete NB classes use a provided prior under ``partial_fit``.

    For ``BernoulliNB`` and ``MultinomialNB``, with and without an explicit
    ``class_prior``, fitting on all of iris at once must give the same
    ``class_log_prior_`` as two ``partial_fit`` calls on a 60/40 split.
    """
    iris = load_iris()
    first_X, second_X, first_y, second_y = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=415)

    for nb_class in (BernoulliNB, MultinomialNB):
        for prior in (None, [0.3, 0.3, 0.4]):
            full = nb_class(class_prior=prior)
            full.fit(iris.data, iris.target)

            partial = nb_class(class_prior=prior)
            # Only the first partial_fit call is given the class list.
            partial.partial_fit(first_X, first_y, classes=[0, 1, 2])
            partial.partial_fit(second_X, second_y)

            assert_array_almost_equal(full.class_log_prior_,
                                      partial.class_log_prior_)
def test_randomized_logistic():
    """Check randomized sparse logistic regression on two iris features.

    Using features 0 and 2 and only the samples whose target is not 2, the
    feature ranking from ``RandomizedLogisticRegression`` scores must match
    the ranking from ``f_classif``, for a scalar ``C`` and for a list of
    ``C`` values. Fitting must also leave ``X`` unmodified.
    """
    iris = load_iris()
    keep = iris.target != 2
    X = iris.data[:, [0, 2]][keep]
    y = iris.target[keep]

    F, _ = f_classif(X, y)
    expected_order = np.argsort(F)

    # Settings shared by both runs; only C differs between them.
    shared = dict(verbose=False, random_state=42, scaling=0.3,
                  n_resampling=50, tol=1e-3)

    clf = RandomizedLogisticRegression(C=1., **shared)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)  # fit does not modify X
    assert_array_equal(expected_order, np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(C=[1., 0.5], **shared)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(expected_order, np.argsort(feature_scores))
def test_correct_labelsize():
    """Assert ``silhouette_score`` requires 1 < n_labels < n_samples.

    Both a labelling with one label per sample and a labelling with a
    single label must raise ``ValueError`` with a message reporting the
    number of labels found.
    """
    dataset = datasets.load_iris()
    X = dataset.data

    invalid_labelings = (
        np.arange(X.shape[0]),  # n_labels = n_samples
        np.zeros(X.shape[0]),   # n_labels = 1
    )
    for y in invalid_labelings:
        # Raw strings keep the regex escapes (\. and \() without relying on
        # invalid string escape sequences; the pattern text is unchanged.
        assert_raises_regexp(ValueError,
                             r'Number of labels is %d\. Valid values are 2 '
                             r'to n_samples - 1 \(inclusive\)'
                             % len(np.unique(y)),
                             silhouette_score, X, y)
def check_non_transformer_estimators_n_iter(name, estimator,
                                            multi_output=False):
    """Check that a fitted iterative estimator reports a positive ``n_iter_``.

    Args:
        name: Estimator name; ``'AffinityPropagation'`` is fitted without
            targets and ``'HuberRegressor'`` gets the ``None`` exemption
            described below.
        estimator: The estimator instance to fit on iris.
        multi_output: When true, the targets are reshaped to a single
            column before fitting.
    """
    iris = load_iris()
    X, targets = iris.data, iris.target
    if multi_output:
        targets = np.reshape(targets, (-1, 1))

    set_random_state(estimator, 0)
    if name != 'AffinityPropagation':
        estimator.fit(X, targets)
    else:
        estimator.fit(X)

    # HuberRegressor depends on scipy.optimize.fmin_l_bfgs_b
    # which doesn't return a n_iter for old versions of SciPy.
    if name != 'HuberRegressor' or estimator.n_iter_ is not None:
        assert_greater(estimator.n_iter_, 0)