# Imports assumed by this standalone snippet (not shown in the original):
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC

def make_classification_example(axis, random_state):
    X, y = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=2.7, random_state=random_state)
    axis.scatter(X[y == 0, 0], X[y == 0, 1], color="red", s=10, label="Disease")
    axis.scatter(X[y == 1, 0], X[y == 1, 1], color="blue", s=10, label="Healthy")
    clf = LinearSVC().fit(X, y)
    # Get the separating hyperplane: w[0]*x1 + w[1]*x2 + b = 0, rewritten as x2 = a*x1 - b/w[1]
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-5, 7)
    yy = a * xx - clf.intercept_[0] / w[1]
    # Plot the decision boundary on the same axes as the points
    axis.plot(xx, yy, "k-", label="Model")
    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Gene 2")
    axis.legend()
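A minimal usage sketch for the function above, assuming matplotlib is installed; the figure size and random_state value are illustrative choices, not part of the original source.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(4, 4))
make_classification_example(ax, random_state=42)
plt.show()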
Source file: figure.classification.vs.regression.py (project: microbiome-summer-school-2017, author: aldro61)
def generate_data(N, seed=10):
""" This generates some test data that we can use to test our pairwise-
distance functions.
Required arguments:
N -- The number of datapoints in the test data.
Optional arguments:
seed -- The seed for NumPy's random module.
"""
# Generate some data:
np.random.seed(seed)
n_samples1 = N * 3 // 4 # same as floor(3/4 * N)
n_samples2 = N - n_samples1
# Blob set 1
centers1 = [[0., 0.],
[1., 0.],
[0.5, np.sqrt(0.75)]]
cluster_std1 = [0.3] * len(centers1)
data, _ = make_blobs(n_samples=n_samples1,
centers=centers1,
cluster_std=cluster_std1)
# Make sure Blob 1 checks out
# Blob set 2
centers2 = [[0.5, np.sqrt(0.75)]]
cluster_std2 = [0.3] * len(centers2)
extra, _ = make_blobs(n_samples=n_samples2,
centers=centers2,
cluster_std=cluster_std2)
return np.concatenate((data, extra), axis=0)
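A hedged usage sketch: it assumes the generate_data function above is in scope (with numpy and make_blobs imported) and uses scipy.spatial.distance.cdist as a reference pairwise-distance implementation.

from scipy.spatial.distance import cdist

data = generate_data(200, seed=3)      # 150 points from blob set 1, 50 from blob set 2
ref_distances = cdist(data, data)      # reference all-pairs Euclidean distances
print(data.shape)                      # (200, 2)
print(ref_distances.shape)             # (200, 200)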
Source file: gaussian_mixture_model.py (project: ML-From-Scratch, author: eriklindernoren)
def main():
# Load the dataset
X, y = datasets.make_blobs()
# Cluster the data
clf = GaussianMixtureModel(k=3)
y_pred = clf.predict(X)
p = Plot()
p.plot_in_2d(X, y_pred, title="GMM Clustering")
p.plot_in_2d(X, y, title="Actual Clustering")
Source file: partitioning_around_medoids.py (project: ML-From-Scratch, author: eriklindernoren)
def main():
# Load the dataset
X, y = datasets.make_blobs()
# Cluster the data using K-Medoids
clf = PAM(k=3)
y_pred = clf.predict(X)
# Project the data onto the 2 primary principal components
p = Plot()
p.plot_in_2d(X, y_pred, title="PAM Clustering")
p.plot_in_2d(X, y, title="Actual Clustering")
def main():
# Load the dataset
X, y = datasets.make_blobs()
# Cluster the data using K-Means
clf = KMeans(k=3)
y_pred = clf.predict(X)
# Project the data onto the 2 primary principal components
p = Plot()
p.plot_in_2d(X, y_pred, title="K-Means Clustering")
p.plot_in_2d(X, y, title="Actual Clustering")
def data_labels():
return make_blobs(random_state=2)
def test_n_samples_leaves_roots():
# Sanity check for the number of samples in leaves and roots
X, y = make_blobs(n_samples=10)
brc = Birch()
brc.fit(X)
n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
for sc in leaf.subclusters_])
assert_equal(n_samples_leaves, X.shape[0])
assert_equal(n_samples_root, X.shape[0])
def test_n_clusters():
# Test that n_clusters param works properly
X, y = make_blobs(n_samples=100, centers=10)
brc1 = Birch(n_clusters=10)
brc1.fit(X)
assert_greater(len(brc1.subcluster_centers_), 10)
assert_equal(len(np.unique(brc1.labels_)), 10)
    # Test that passing an AgglomerativeClustering instance as n_clusters
    # gives the same results.
gc = AgglomerativeClustering(n_clusters=10)
brc2 = Birch(n_clusters=gc)
brc2.fit(X)
assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
assert_array_equal(brc1.labels_, brc2.labels_)
# Test that the wrong global clustering step raises an Error.
clf = ElasticNet()
brc3 = Birch(n_clusters=clf)
assert_raises(ValueError, brc3.fit, X)
# Test that a small number of clusters raises a warning.
brc4 = Birch(threshold=10000.)
assert_warns(UserWarning, brc4.fit, X)
def test_sparse_X():
# Test that sparse and dense data give same results
X, y = make_blobs(n_samples=100, centers=10)
brc = Birch(n_clusters=10)
brc.fit(X)
csr = sparse.csr_matrix(X)
brc_sparse = Birch(n_clusters=10)
brc_sparse.fit(csr)
assert_array_equal(brc.labels_, brc_sparse.labels_)
assert_array_almost_equal(brc.subcluster_centers_,
brc_sparse.subcluster_centers_)
def test_branching_factor():
    # Test that nodes have at most branching_factor subclusters
X, y = make_blobs()
branching_factor = 9
# Purposefully set a low threshold to maximize the subclusters.
brc = Birch(n_clusters=None, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
brc = Birch(n_clusters=3, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
# Raises error when branching_factor is set to one.
brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
assert_raises(ValueError, brc.fit, X)
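check_branching_factor is defined elsewhere in the test module; the sketch below is a hypothetical reconstruction of what it verifies, based on the assertions above (the child_ attribute and the recursion over scikit-learn's private CF-tree are assumptions).

def check_branching_factor(node, branching_factor):
    # Assumed reconstruction: no CF-tree node may hold more than
    # branching_factor subclusters, recursively down the tree.
    subclusters = node.subclusters_
    assert len(subclusters) <= branching_factor
    for subcluster in subclusters:
        if subcluster.child_ is not None:
            check_branching_factor(subcluster.child_, branching_factor)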
def test_threshold():
    # Test that each leaf subcluster's radius stays within the threshold
X, y = make_blobs(n_samples=80, centers=4)
brc = Birch(threshold=0.5, n_clusters=None)
brc.fit(X)
check_threshold(brc, 0.5)
brc = Birch(threshold=5.0, n_clusters=None)
brc.fit(X)
check_threshold(brc, 5.)
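Similarly, check_threshold lives elsewhere in the test module; a hypothetical reconstruction using the _get_leaves() traversal seen earlier (the radius attribute of the private subclusters is an assumption).

def check_threshold(brc, threshold):
    # Assumed reconstruction: every leaf subcluster should have a
    # radius no larger than the configured threshold.
    for leaf in brc._get_leaves():
        for subcluster in leaf.subclusters_:
            assert subcluster.radius <= threshold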
def test_birch_example_reproducibility(example_id):
# check reproducibility of the Birch example
rng = np.random.RandomState(42)
X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)
cluster_model = Birch(threshold=0.9, branching_factor=20,
compute_sample_indices=True)
cluster_model.fit(X)
#assert len(cluster_model.root_.subclusters_[1].child_.subclusters_) == 3
htree, n_subclusters = birch_hierarchy_wrapper(cluster_model)
assert htree.tree_size == n_subclusters
# same random seed as in the birch hierarchy example
assert htree.tree_size == 78
sc = htree.flatten()[example_id]
if example_id == 34:
# this is true in both cases, but example_id fails on circle ci
assert sc.current_depth == 1
assert len(sc.children) == 3
assert_array_equal([sc['cluster_id'] for sc in htree.flatten()],
np.arange(htree.tree_size))
def load_dataset(dataset, n_samples, random_state=1, n_features=3):
# wrapper function to load one of the 3d datasets
if dataset == 's_curve':
return make_s_curve(n_samples, random_state=random_state)
elif dataset == 'swiss_roll':
return make_swiss_roll(n_samples, random_state=random_state)
elif dataset == 'broken_swiss_roll':
return make_broken_swiss_roll(n_samples, random_state=random_state)
elif dataset == 'sphere':
return make_sphere(n_samples, random_state=random_state)
elif dataset == '3_circles':
return make_3_circles(n_samples, random_state=random_state)
elif dataset == 'peaks':
return make_peaks(n_samples, random_state=random_state)
elif dataset == 'blobs':
return make_blobs(n_samples, n_features=n_features, centers=3, random_state=random_state)
else:
print("unknown dataset")
def dataset_generator():
"""
generate multi-class dataset
:return: data X and its labels
"""
plt.title("Three blobs", fontsize='small')
X, y = make_blobs(n_features=2, centers=3)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
#np.save('X_multi.npy', X)
#np.save('y_multi.npy', y)
return X, y
def gaussian_blobs(n_samples=200, return_centers=False):
random_state = 0
centers = [(-10, -10), (-10, 0), (0, -10)]
centers.extend([(10, 10), (10, 0), (0, 10)])
centers = np.array(centers)
X, gt = sk_datasets.make_blobs(n_samples=n_samples, centers=centers,
n_features=2, shuffle=False,
random_state=random_state)
if return_centers:
return X, gt, centers
else:
return X, gt
def make_clusters(skew=True, *arg, **kwargs):
    X, y = datasets.make_blobs(*arg, **kwargs)
    if skew:
        n_features = X.shape[1]
        for i in np.unique(y):
            # Skew each cluster with its own random linear transform
            X[y == i] = X[y == i].dot(np.random.random((n_features, n_features)) - 0.5)
    return X, y
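A short usage sketch for make_clusters, assuming numpy and sklearn.datasets are imported as in the snippet above; the parameter values are illustrative.

# Three skewed (anisotropic) clusters; the keyword arguments are
# forwarded straight to make_blobs.
X, y = make_clusters(skew=True, n_samples=300, centers=3, random_state=0)
print(X.shape, np.unique(y))   # (300, 2) [0 1 2]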
def kmeans_example(plot=False):
X, y = make_blobs(centers=4, n_samples=500, n_features=2,
shuffle=True, random_state=42)
clusters = len(np.unique(y))
k = KMeans(K=clusters, max_iters=150, init='++')
k.fit(X)
k.predict()
if plot:
k.plot()
def test_integrated_kmeans_elbow(self):
"""
Test no exceptions for kmeans k-elbow visualizer on blobs dataset
See #182: cannot use occupancy dataset because of memory usage
"""
# Generate a blobs data set
X,y = make_blobs(
n_samples=1000, n_features=12, centers=6, shuffle=True
)
try:
visualizer = KElbowVisualizer(KMeans(), k=4)
visualizer.fit(X)
visualizer.poof()
except Exception as e:
self.fail("error during k-elbow: {}".format(e))
def test_integrated_mini_batch_kmeans_elbow(self):
"""
Test no exceptions for mini-batch kmeans k-elbow visualizer
See #182: cannot use occupancy dataset because of memory usage
"""
# Generate a blobs data set
X,y = make_blobs(
n_samples=1000, n_features=12, centers=6, shuffle=True
)
try:
visualizer = KElbowVisualizer(MiniBatchKMeans(), k=4)
visualizer.fit(X)
visualizer.poof()
except Exception as e:
self.fail("error during k-elbow: {}".format(e))
def test_integrated_kmeans_silhouette(self):
"""
Test no exceptions for kmeans silhouette visualizer on blobs dataset
See #182: cannot use occupancy dataset because of memory usage
"""
# Generate a blobs data set
X, y = make_blobs(
n_samples=1000, n_features=12, centers=8, shuffle=True,
)
try:
visualizer = SilhouetteVisualizer(KMeans())
visualizer.fit(X)
visualizer.poof()
except Exception as e:
self.fail("error during silhouette: {}".format(e))
def test_integrated_mini_batch_kmeans_silhouette(self):
"""
Test no exceptions for mini-batch kmeans silhouette visualizer
See #182: cannot use occupancy dataset because of memory usage
"""
# Generate a blobs data set
X, y = make_blobs(
n_samples=1000, n_features=12, centers=8, shuffle=True,
)
try:
visualizer = SilhouetteVisualizer(MiniBatchKMeans())
visualizer.fit(X)
visualizer.poof()
except Exception as e:
self.fail("error during silhouette: {}".format(e))
def generate_data(n_samples, n_features):
"""Generate random blob-ish data with noisy features.
This returns an array of input data with shape `(n_samples, n_features)`
and an array of `n_samples` target labels.
Only one feature contains discriminative information, the other features
contain only noise.
"""
X, y = make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])
# add non-discriminative features
if n_features > 1:
X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
return X, y
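A hedged illustration of the docstring's claim that only the first feature carries signal, using scikit-learn's univariate F-test; the sample sizes are arbitrary, and the generate_data function above (with its numpy/make_blobs imports) is assumed to be in scope.

from sklearn.feature_selection import f_classif

X, y = generate_data(n_samples=500, n_features=5)
F, pval = f_classif(X, y)
# The F-score of feature 0 should dwarf those of the pure-noise features.
print(F.round(1))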
Source file: bench_plot_approximate_neighbors.py (project: Parallel-SGD, author: angadgill)
def make_data(n_samples, n_features, n_queries, random_state=0):
"""Create index and query data."""
print('Generating random blob-ish data')
X, _ = make_blobs(n_samples=n_samples + n_queries,
n_features=n_features, centers=100,
shuffle=True, random_state=random_state)
# Keep the last samples as held out query vectors: note since we used
# shuffle=True we have ensured that index and query vectors are
# samples from the same distribution (a mixture of 100 gaussians in this
# case)
return X[:n_samples], X[n_samples:]
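A usage sketch pairing make_data's index/query split with scikit-learn's exact NearestNeighbors as a baseline; the sizes and neighbor count are illustrative assumptions.

from sklearn.neighbors import NearestNeighbors

X_index, X_query = make_data(n_samples=10000, n_features=50, n_queries=100)
nn = NearestNeighbors(n_neighbors=10).fit(X_index)
distances, indices = nn.kneighbors(X_query)
print(indices.shape)   # (100, 10)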
def test_kde_pipeline_gridsearch():
# test that kde plays nice in pipelines and grid-searches
X, _ = make_blobs(cluster_std=.1, random_state=1,
centers=[[0, 1], [1, 0], [0, 0]])
pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
KernelDensity(kernel="gaussian"))
params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
search = GridSearchCV(pipe1, param_grid=params, cv=5)
search.fit(X)
assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
def test_grid_search_no_score():
# Test grid-search on classifier that has no score function.
clf = LinearSVC(random_state=0)
X, y = make_blobs(random_state=0, centers=2)
Cs = [.1, 1, 10]
clf_no_score = LinearSVCNoScore(random_state=0)
grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
grid_search.fit(X, y)
grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
scoring='accuracy')
# smoketest grid search
grid_search_no_score.fit(X, y)
# check that best params are equal
assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
# check that we can call score and that it gives the correct result
assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))
# giving no scoring function raises an error
grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
[[1]])
def test_grid_search_iid():
# test the iid parameter
# noise-free simple 2d-data
X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
cluster_std=0.1, shuffle=False, n_samples=80)
# split dataset into two folds that are not iid
# first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=bool)  # np.bool is removed in recent NumPy
mask[np.where(y == 1)[0][::2]] = 0
mask[np.where(y == 2)[0][::2]] = 0
# this leads to perfect classification on one fold and a score of 1/3 on
# the other
svm = SVC(kernel='linear')
# create "cv" for splits
cv = [[mask, ~mask], [~mask, mask]]
# once with iid=True (default)
grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
grid_search.fit(X, y)
first = grid_search.grid_scores_[0]
assert_equal(first.parameters['C'], 1)
assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
# for first split, 1/4 of dataset is in test, for second 3/4.
# take weighted average
assert_almost_equal(first.mean_validation_score,
1 * 1. / 4. + 1. / 3. * 3. / 4.)
# once with iid=False
grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
iid=False)
grid_search.fit(X, y)
first = grid_search.grid_scores_[0]
assert_equal(first.parameters['C'], 1)
# scores are the same as above
assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
# averaged score is just mean of scores
assert_almost_equal(first.mean_validation_score,
np.mean(first.cv_validation_scores))
def test_gridsearch_no_predict():
# test grid-search with an estimator without predict.
# slight duplication of a test from KDE
def custom_scoring(estimator, X):
return 42 if estimator.bandwidth == .1 else 0
X, _ = make_blobs(cluster_std=.1, random_state=1,
centers=[[0, 1], [1, 0], [0, 0]])
search = GridSearchCV(KernelDensity(),
param_grid=dict(bandwidth=[.01, .1, 1]),
scoring=custom_scoring)
search.fit(X)
assert_equal(search.best_params_['bandwidth'], .1)
assert_equal(search.best_score_, 42)
def test_grid_search_score_consistency():
# test that correct scores are used
clf = LinearSVC(random_state=0)
X, y = make_blobs(random_state=0, centers=2)
Cs = [.1, 1, 10]
for score in ['f1', 'roc_auc']:
grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
grid_search.fit(X, y)
        cv = StratifiedKFold(n_splits=3)
for C, scores in zip(Cs, grid_search.grid_scores_):
clf.set_params(C=C)
scores = scores[2] # get the separate runs from grid scores
i = 0
for train, test in cv.split(X, y):
clf.fit(X[train], y[train])
if score == "f1":
correct_score = f1_score(y[test], clf.predict(X[test]))
elif score == "roc_auc":
dec = clf.decision_function(X[test])
correct_score = roc_auc_score(y[test], dec)
assert_almost_equal(correct_score, scores[i])
i += 1