def _build_graph(self):
    """Compute the graph Laplacian."""
    # Graph sparsification
    if self.sparsify == 'epsilonNN':
        self.A_ = radius_neighbors_graph(self.X_, self.radius, include_self=False)
    else:
        Q = kneighbors_graph(
            self.X_,
            self.n_neighbors,
            include_self=False
        ).astype(bool)  # note: the np.bool alias was removed in NumPy >= 1.24

        if self.sparsify == 'kNN':
            # symmetric kNN graph: keep an edge if either point selects the other
            self.A_ = (Q + Q.T).astype(np.float64)
        elif self.sparsify == 'MkNN':
            # mutual kNN graph: keep an edge only if both points select each other
            self.A_ = (Q.multiply(Q.T)).astype(np.float64)

    # Edge re-weighting
    if self.reweight == 'rbf':
        W = rbf_kernel(self.X_, gamma=self.t)
        self.A_ = self.A_.multiply(W)

    return sp.csgraph.laplacian(self.A_, normed=self.normed)
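The three sparsification modes above can be reproduced outside the class. A minimal standalone sketch, with assumed toy data and local variables standing in for the instance attributes (self.X_, self.radius, self.n_neighbors, self.t):

import numpy as np
from scipy.sparse import csgraph
from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.RandomState(0).rand(30, 3)  # assumed toy data

# epsilon-neighborhood graph
A_eps = radius_neighbors_graph(X, radius=0.5, include_self=False)

# kNN (union) and mutual kNN (intersection) graphs
Q = kneighbors_graph(X, n_neighbors=5, include_self=False).astype(bool)
A_knn = (Q + Q.T).astype(np.float64)
A_mknn = Q.multiply(Q.T).astype(np.float64)

# RBF re-weighting of the retained edges, then the normalized Laplacian
W = rbf_kernel(X, gamma=1.0)
L = csgraph.laplacian(A_knn.multiply(W), normed=True)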
def cluster_agglomerative(X_train, model_args=None, gridsearch=True,
                          connectivity_graph=True, connectivity_graph_neighbors=10):
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    print('AgglomerativeClustering')

    if model_args is None:
        model_args = {}

    if connectivity_graph:
        print('Creating k-neighbors graph for connectivity constraint')
        connectivity = kneighbors_graph(X_train, n_neighbors=connectivity_graph_neighbors)
        model_args['connectivity'] = connectivity

    if gridsearch is True:
        # TODO: add hyperparameter searching. No scoring method is available
        # for this model, so we can't easily use grid searching.
        raise NotImplementedError('No hyperparameter optimization available yet '
                                  'for this model. Set gridsearch to False')
        # prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for AgglomerativeClustering')
        param_grid = None

    return ModelWrapper(AgglomerativeClustering, X=X_train, model_args=model_args,
                        param_grid=param_grid, unsupervised=True)
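For reference, a minimal sketch of the same connectivity-constrained clustering without the ModelWrapper helper used above (the toy X_train below is an assumption, not data from the original project):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

X_train = np.random.RandomState(0).rand(100, 4)  # assumed toy data

# merges are restricted to edges of the k-nearest-neighbor graph
connectivity = kneighbors_graph(X_train, n_neighbors=10, include_self=False)

model = AgglomerativeClustering(n_clusters=3, connectivity=connectivity)
labels = model.fit_predict(X_train)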
def test_kneighbors_graph_sparse(seed=36):
    # Test kneighbors_graph to build the k-Nearest Neighbor graph
    # for sparse input.
    rng = np.random.RandomState(seed)
    X = rng.randn(10, 10)
    Xcsr = csr_matrix(X)

    for n_neighbors in [1, 2, 3]:
        for mode in ["connectivity", "distance"]:
            assert_array_almost_equal(
                neighbors.kneighbors_graph(X,
                                           n_neighbors,
                                           mode=mode).toarray(),
                neighbors.kneighbors_graph(Xcsr,
                                           n_neighbors,
                                           mode=mode).toarray())
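The equivalence this test asserts can also be checked directly; a small sketch with assumed data:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(36).randn(10, 10)  # assumed toy data

G_dense = kneighbors_graph(X, n_neighbors=3, mode='distance').toarray()
G_sparse = kneighbors_graph(csr_matrix(X), n_neighbors=3, mode='distance').toarray()

assert np.allclose(G_dense, G_sparse)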
def test_isomap_simple_grid():
    # Isomap should preserve distances when all neighbors are used
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # distances from each point to all others
    G = neighbors.kneighbors_graph(X, n_neighbors,
                                   mode='distance').toarray()

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            G_iso = neighbors.kneighbors_graph(clf.embedding_,
                                               n_neighbors,
                                               mode='distance').toarray()
            assert_array_almost_equal(G, G_iso)
def ward_hc(self, n_clusters, n_neighbors=10):
    """
    Perform Ward hierarchical clustering

    Parameters
    ----------
    n_clusters : int
        number of clusters
    lsi_components : int
        apply LSA before the clustering algorithm
    n_neighbors : int
        number of nearest neighbors used to compute the connectivity matrix
    """
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    pars = {'n_neighbors': n_neighbors, 'is_hierarchical': True,
            "metric": self.metric}
    if 'lsi' not in self.pipeline:
        raise ValueError("you must use lsi with ward clustering "
                         "for scaling reasons.")

    # This is not efficient, as the transformation is computed
    # a second time in _cluster_func
    X = self.pipeline.data
    connectivity = kneighbors_graph(X, n_neighbors=n_neighbors,
                                    include_self=False)

    km = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                 connectivity=connectivity)

    return self._cluster_func(n_clusters, km, pars)
def test_kneighbors_graph():
    # Test kneighbors_graph to build the k-Nearest Neighbor graph.
    X = np.array([[0, 1], [1.01, 1.], [2, 0]])

    # n_neighbors = 1
    A = neighbors.kneighbors_graph(X, 1, mode='connectivity',
                                   include_self=True)
    assert_array_equal(A.toarray(), np.eye(A.shape[0]))

    A = neighbors.kneighbors_graph(X, 1, mode='distance')
    assert_array_almost_equal(
        A.toarray(),
        [[0.00, 1.01, 0.],
         [1.01, 0., 0.],
         [0.00, 1.40716026, 0.]])

    # n_neighbors = 2
    A = neighbors.kneighbors_graph(X, 2, mode='connectivity',
                                   include_self=True)
    assert_array_equal(
        A.toarray(),
        [[1., 1., 0.],
         [1., 1., 0.],
         [0., 1., 1.]])

    A = neighbors.kneighbors_graph(X, 2, mode='distance')
    assert_array_almost_equal(
        A.toarray(),
        [[0., 1.01, 2.23606798],
         [1.01, 0., 1.40716026],
         [2.23606798, 1.40716026, 0.]])

    # n_neighbors = 3
    A = neighbors.kneighbors_graph(X, 3, mode='connectivity',
                                   include_self=True)
    assert_array_almost_equal(
        A.toarray(),
        [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array = np.sort(dist_array)  # np.sort returns a copy, so keep it
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A)

    # Raise an error when the wrong parameters are supplied.
    X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
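The ValueError asserted at the end comes from passing a pre-fitted estimator whose metric conflicts with the one requested from kneighbors_graph; a small sketch (assumed data) that surfaces the message:

import numpy as np
from sklearn.neighbors import NearestNeighbors, kneighbors_graph

X = np.random.RandomState(0).rand(5, 5)  # assumed toy data

nn = NearestNeighbors(n_neighbors=3, metric='manhattan').fit(X)
try:
    kneighbors_graph(nn, 3, metric='euclidean')  # metric conflicts with the fitted estimator
except ValueError as exc:
    print(exc)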
def test_k_and_radius_neighbors_train_is_not_query():
    # Test kneighbors and related methods when the query data is not the
    # training data.
    for algorithm in ALGORITHMS:
        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)

        X = [[0], [1]]
        nn.fit(X)
        test_data = [[2], [1]]

        # Test neighbors.
        dist, ind = nn.kneighbors(test_data)
        assert_array_equal(dist, [[1], [0]])
        assert_array_equal(ind, [[1], [1]])

        dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5)
        check_object_arrays(dist, [[1], [1, 0]])
        check_object_arrays(ind, [[1], [0, 1]])

        # Test the graph variants.
        assert_array_equal(
            nn.kneighbors_graph(test_data).A, [[0., 1.], [0., 1.]])
        assert_array_equal(
            nn.kneighbors_graph([[2], [1]], mode='distance').A,
            np.array([[0., 1.], [0., 0.]]))

        rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5)
        assert_array_equal(rng.A, [[0, 1], [1, 1]])
def test_include_self_neighbors_graph():
    # Test include_self parameter in neighbors_graph
    X = [[2, 3], [4, 5]]
    kng = neighbors.kneighbors_graph(X, 1, include_self=True).A
    kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A
    assert_array_equal(kng, [[1., 0.], [0., 1.]])
    assert_array_equal(kng_not_self, [[0., 1.], [1., 0.]])

    rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A
    rng_not_self = neighbors.radius_neighbors_graph(
        X, 5.0, include_self=False).A
    assert_array_equal(rng, [[1., 1.], [1., 1.]])
    assert_array_equal(rng_not_self, [[0., 1.], [1., 0.]])
def test_same_knn_parallel():
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    def check_same_knn_parallel(algorithm):
        clf = neighbors.KNeighborsClassifier(n_neighbors=3,
                                             algorithm=algorithm)
        clf.fit(X_train, y_train)
        y = clf.predict(X_test)
        dist, ind = clf.kneighbors(X_test)
        graph = clf.kneighbors_graph(X_test, mode='distance').toarray()

        clf.set_params(n_jobs=3)
        clf.fit(X_train, y_train)
        y_parallel = clf.predict(X_test)
        dist_parallel, ind_parallel = clf.kneighbors(X_test)
        graph_parallel = \
            clf.kneighbors_graph(X_test, mode='distance').toarray()

        assert_array_equal(y, y_parallel)
        assert_array_almost_equal(dist, dist_parallel)
        assert_array_equal(ind, ind_parallel)
        assert_array_almost_equal(graph, graph_parallel)

    for algorithm in ALGORITHMS:
        yield check_same_knn_parallel, algorithm
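The same check can be run outside the generator-style test harness; a minimal sketch with assumed data showing that n_jobs only affects speed, not the returned graph:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
graph = clf.kneighbors_graph(X_test, mode='distance').toarray()

clf.set_params(n_jobs=2)
clf.fit(X_train, y_train)
graph_parallel = clf.kneighbors_graph(X_test, mode='distance').toarray()

assert np.allclose(graph, graph_parallel)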
def test_isomap_reconstruction_error():
    # Same setup as in test_isomap_simple_grid, with an added dimension
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # add noise in a third dimension
    rng = np.random.RandomState(0)
    noise = 0.1 * rng.randn(Npts, 1)
    X = np.concatenate((X, noise), 1)

    # compute input kernel
    G = neighbors.kneighbors_graph(X, n_neighbors,
                                   mode='distance').toarray()

    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G ** 2)

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            # compute output kernel
            G_iso = neighbors.kneighbors_graph(clf.embedding_,
                                               n_neighbors,
                                               mode='distance').toarray()
            K_iso = centerer.fit_transform(-0.5 * G_iso ** 2)

            # make sure error agrees
            reconstruction_error = np.linalg.norm(K - K_iso) / Npts
            assert_almost_equal(reconstruction_error,
                                clf.reconstruction_error())
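The `-0.5 * G ** 2` fed to KernelCenterer above is the classical-MDS double centering of squared distances, K = -1/2 * H * D^2 * H with H = I - (1/n) * 1 * 1^T. A quick check of that identity on assumed toy data:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import KernelCenterer

X = np.random.RandomState(0).rand(6, 3)  # assumed toy data
D = pairwise_distances(X)

n = D.shape[0]
H = np.eye(n) - np.ones((n, n)) / n            # centering matrix
K_explicit = -0.5 * H @ (D ** 2) @ H           # classical MDS / Isomap kernel
K_centered = KernelCenterer().fit_transform(-0.5 * D ** 2)

assert np.allclose(K_explicit, K_centered)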
def makeWard(X, k=2):
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(n_clusters=k,
                                           linkage='ward',
                                           connectivity=connectivity)
def makeAvgLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="average",
                                           affinity="cityblock", n_clusters=k,
                                           connectivity=connectivity)
def makeMaxLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="complete",
                                           affinity="cityblock", n_clusters=k,
                                           connectivity=connectivity)
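A self-contained usage sketch mirroring makeAvgLinkage above (the toy data is an assumption; note that recent scikit-learn versions rename the `affinity` parameter of AgglomerativeClustering to `metric`):

import numpy as np
from sklearn import cluster
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(60, 2)  # assumed toy data

connectivity = kneighbors_graph(X, n_neighbors=10)
connectivity = 0.5 * (connectivity + connectivity.T)  # make connectivity symmetric

model = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",
                                        n_clusters=2, connectivity=connectivity)
labels = model.fit_predict(X)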
def fit(self, X, y=None):
    """Creates an affinity matrix for X using the selected affinity,
    then applies spectral clustering to this affinity matrix.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        OR, if affinity==`precomputed`, a precomputed affinity
        matrix of shape (n_samples, n_samples)
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                    dtype=np.float64)
    if X.shape[0] == X.shape[1] and self.affinity != "precomputed":
        warnings.warn("The spectral clustering API has changed. ``fit`` "
                      "now constructs an affinity matrix from data. To use "
                      "a custom affinity matrix, "
                      "set ``affinity=precomputed``.")

    if self.affinity == 'nearest_neighbors':
        connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors,
                                        include_self=True)
        self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
    elif self.affinity == 'precomputed':
        self.affinity_matrix_ = X
    else:
        params = self.kernel_params
        if params is None:
            params = {}
        if not callable(self.affinity):
            params['gamma'] = self.gamma
            params['degree'] = self.degree
            params['coef0'] = self.coef0
        self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity,
                                                 filter_params=True,
                                                 **params)

    random_state = check_random_state(self.random_state)
    self.labels_ = spectral_clustering(self.affinity_matrix_,
                                       n_clusters=self.n_clusters,
                                       eigen_solver=self.eigen_solver,
                                       random_state=random_state,
                                       n_init=self.n_init,
                                       eigen_tol=self.eigen_tol,
                                       assign_labels=self.assign_labels,
                                       norm_laplacian=self.norm_laplacian)
    return self
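scikit-learn's own SpectralClustering follows the same 'nearest_neighbors' affinity logic as this fit method; a minimal sketch on assumed two-moons data:

from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)  # assumed toy data

model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           n_neighbors=10, random_state=0)
labels = model.fit_predict(X)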
def test_neighbors_badargs():
    # Test bad argument values: these should all raise ValueErrors
    assert_raises(ValueError,
                  neighbors.NearestNeighbors,
                  algorithm='blah')

    X = rng.random_sample((10, 2))
    Xsparse = csr_matrix(X)
    y = np.ones(10)

    for cls in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        assert_raises(ValueError,
                      cls,
                      weights='blah')
        assert_raises(ValueError,
                      cls, p=-1)
        assert_raises(ValueError,
                      cls, algorithm='blah')

        nbrs = cls(algorithm='ball_tree', metric='haversine')
        assert_raises(ValueError,
                      nbrs.predict,
                      X)
        assert_raises(ValueError,
                      ignore_warnings(nbrs.fit),
                      Xsparse, y)

        nbrs = cls()
        assert_raises(ValueError,
                      nbrs.fit,
                      np.ones((0, 2)), np.ones(0))
        assert_raises(ValueError,
                      nbrs.fit,
                      X[:, :, None], y)
        nbrs.fit(X, y)
        assert_raises(ValueError,
                      nbrs.predict,
                      [[]])

        # cls is a class, so check the subclass relationship rather than isinstance
        if issubclass(cls, (neighbors.KNeighborsClassifier,
                            neighbors.KNeighborsRegressor)):
            nbrs = cls(n_neighbors=-1)
            assert_raises(ValueError, nbrs.fit, X, y)

    nbrs = neighbors.NearestNeighbors().fit(X)

    assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah')
    assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah')
def test_k_and_radius_neighbors_duplicates():
    # Test behavior of kneighbors when duplicates are present in query
    for algorithm in ALGORITHMS:
        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
        nn.fit([[0], [1]])

        # Do not do anything special to duplicates.
        kng = nn.kneighbors_graph([[0], [1]], mode='distance')
        assert_array_equal(
            kng.A,
            np.array([[0., 0.], [0., 0.]]))
        assert_array_equal(kng.data, [0., 0.])
        assert_array_equal(kng.indices, [0, 1])

        dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
        check_object_arrays(dist, [[0, 1], [1, 0]])
        check_object_arrays(ind, [[0, 1], [0, 1]])

        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5)
        assert_array_equal(rng.A, np.ones((2, 2)))

        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5,
                                        mode='distance')
        assert_array_equal(rng.A, [[0, 1], [1, 0]])
        assert_array_equal(rng.indices, [0, 1, 0, 1])
        assert_array_equal(rng.data, [0, 1, 1, 0])

        # Mask the first duplicates when n_duplicates > n_neighbors.
        X = np.ones((3, 1))
        nn = neighbors.NearestNeighbors(n_neighbors=1)
        nn.fit(X)
        dist, ind = nn.kneighbors()
        assert_array_equal(dist, np.zeros((3, 1)))
        assert_array_equal(ind, [[1], [0], [1]])

        # Test that zeros are explicitly marked in kneighbors_graph.
        kng = nn.kneighbors_graph(mode='distance')
        assert_array_equal(
            kng.A, np.zeros((3, 3)))
        assert_array_equal(kng.data, np.zeros(3))
        assert_array_equal(kng.indices, [1., 0., 1.])
        assert_array_equal(
            nn.kneighbors_graph().A,
            np.array([[0., 1., 0.], [1., 0., 0.], [0., 1., 0.]]))
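The explicit-zero behaviour checked at the end can be seen directly: with duplicate points, the distance-mode graph still stores one entry per neighbor, all equal to zero (assumed toy data):

import numpy as np
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=1).fit(np.ones((3, 1)))  # three identical points

kng = nn.kneighbors_graph(mode='distance')
print(kng.nnz)        # 3 stored entries ...
print(kng.toarray())  # ... and every stored distance is exactly 0.0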