def test_euclidean_distances():
    X = da.random.uniform(size=(100, 4), chunks=50)
    Y = da.random.uniform(size=(100, 4), chunks=50)
    a = dm.euclidean_distances(X, Y)
    b = sm.euclidean_distances(X, Y)
    assert_eq(a, b)

    x_norm_squared = (X ** 2).sum(axis=1).compute()[:, np.newaxis]
    a = dm.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    b = sm.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    assert_eq(a, b)

    y_norm_squared = (Y ** 2).sum(axis=1).compute()[np.newaxis, :]
    a = dm.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    b = sm.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    assert_eq(a, b)
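# A minimal, standalone sketch of the setup the test above assumes. The
# module aliases (dm, sm, assert_eq) are assumptions about the surrounding
# test file, not part of the original snippet.
import numpy as np
import dask.array as da
import sklearn.metrics.pairwise as sm      # reference NumPy implementation
import dask_ml.metrics.pairwise as dm      # dask-backed implementation
from dask.array.utils import assert_eq     # compares dask and numpy results

X = da.random.uniform(size=(10, 3), chunks=5)
Y = da.random.uniform(size=(10, 3), chunks=5)
# Both return an (n_samples_X, n_samples_Y) matrix of pairwise distances.
assert_eq(dm.euclidean_distances(X, Y), sm.euclidean_distances(X, Y))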
def predict(self, X):
    """A reference implementation of prediction for a classifier.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    y : array of int of shape = [n_samples]
        The label for each sample is the label of the closest sample
        seen during fit.
    """
    # Check that fit has been called
    check_is_fitted(self, ['X_', 'y_'])

    # Input validation
    X = check_array(X)

    # Assign each test sample the label of its nearest training sample
    closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
    return self.y_[closest]
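# Standalone illustration of the nearest-sample lookup used in predict()
# above; the tiny arrays here are made up for the example.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X_train = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]])
y_train = np.array([0, 0, 1])
X_test = np.array([[0.9, 1.2], [4.0, 4.5]])

# Row i of the distance matrix holds the distances from test sample i to
# every training sample; argmin along axis=1 picks the closest one.
closest = np.argmin(euclidean_distances(X_test, X_train), axis=1)
print(y_train[closest])   # -> [0 1]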
def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    # Shared vocabulary: words from either document that are in the embedding
    # model and are not English stop words.
    vocabulary = [w for w in set(d1.lower().split() + d2.lower().split())
                  if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS]
    if len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    W_ = np.array([model[w] for w in vect.get_feature_names() if w in model])
    D_ = euclidean_distances(W_)
    D_ = D_.astype(np.double)
    D_ /= D_.max()  # just for comparison purposes

    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()

    # pyemd needs double precision input
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    if verbose:
        print(vocabulary)
        print(v_1, v_2)
    return emd(v_1, v_2, D_)

# d1 = "Government speaks to the media in Illinois"
# d2 = "The state addresses the press in Chicago"
# print(get_wmd_distance(d1, d2))
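# The function above relies on module-level names that are not shown in the
# snippet. A plausible setup, stated as an assumption rather than the original
# author's code, would look roughly like this (older gensim/sklearn APIs,
# matching the `model.vocab` and `stop_words` usage above):
#
#   import numpy as np
#   from gensim.models import KeyedVectors
#   from pyemd import emd
#   from sklearn.feature_extraction import stop_words
#   from sklearn.feature_extraction.text import CountVectorizer
#
#   # "vectors.bin" is a placeholder path, not taken from the original source.
#   model = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)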
def test_euclidean_distances_same():
    X = da.random.uniform(size=(100, 4), chunks=50)
    a = dm.euclidean_distances(X, X)
    b = sm.euclidean_distances(X, X)
    assert_eq(a, b, atol=1e-4)

    x_norm_squared = (X ** 2).sum(axis=1).compute()[:, np.newaxis]
    a = dm.euclidean_distances(X, X, X_norm_squared=x_norm_squared)
    b = sm.euclidean_distances(X, X, X_norm_squared=x_norm_squared)
    assert_eq(a, b, atol=1e-4)
def fit(self, X, **kwargs):
    """Apply affinity propagation clustering.

    Create the affinity matrix from negative euclidean distances if required.

    Parameters
    ----------
    X : array-like or sparse matrix,
        shape (n_samples, n_features) or (n_samples, n_samples)
        Data matrix or, if affinity is ``precomputed``, matrix of
        similarities / affinities.
    """
    if not issparse(X):
        return super(AffinityPropagation, self).fit(X, **kwargs)

    # Since X is sparse, convert it to a coo_matrix if required
    X = check_array(X, accept_sparse='coo')
    if self.affinity == "precomputed":
        self.affinity_matrix_ = X
    elif self.affinity == "euclidean":
        self.affinity_matrix_ = coo_matrix(
            -euclidean_distances(X, squared=True))
    else:
        raise ValueError("Affinity must be 'precomputed' or "
                         "'euclidean'. Got %s instead"
                         % str(self.affinity))

    self.cluster_centers_indices_, self.labels_, self.n_iter_ = sparse_ap(
        self.affinity_matrix_, self.preference, max_iter=self.max_iter,
        convergence_iter=self.convergence_iter, damping=self.damping,
        copy=self.copy, verbose=self.verbose, return_n_iter=True,
        convergence_percentage=self.convergence_percentage)

    if self.affinity != "precomputed":
        self.cluster_centers_ = X.data[self.cluster_centers_indices_].copy()
    return self
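# Minimal sketch of the affinity construction used above when
# affinity="euclidean": pairwise negative squared euclidean distances wrapped
# in a sparse COO matrix. The toy data is made up for illustration.
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import euclidean_distances

X = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
affinity = coo_matrix(-euclidean_distances(X, squared=True))
# Diagonal is 0; off-diagonal entries are -1.0, -41.0 and -50.0, i.e. more
# similar pairs get affinities closer to zero.
print(affinity.toarray())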
def _wmd(self, i, row, X_train):
    """Compute the WMD between training sample i and the given test row.

    Assumes that `row` and the training samples are sparse BOW vectors
    summing to 1.
    """
    union_idx = np.union1d(X_train[i].indices, row.indices) - 1
    W_minimal = self.W_embed[union_idx]
    W_dist = euclidean_distances(W_minimal)
    bow_i = X_train[i, union_idx].A.ravel()
    bow_j = row[:, union_idx].A.ravel()
    return emd(bow_i, bow_j, W_dist)
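# What the final emd() call above computes, shown on a tiny made-up example:
# two normalized bag-of-words histograms over a shared vocabulary plus a
# word-to-word euclidean ground-distance matrix. pyemd expects float64 input.
import numpy as np
from pyemd import emd
from sklearn.metrics.pairwise import euclidean_distances

W = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])   # toy word embeddings
D = euclidean_distances(W).astype(np.double)

bow_i = np.array([0.5, 0.5, 0.0])   # document 1: words 0 and 1
bow_j = np.array([0.5, 0.0, 0.5])   # document 2: words 0 and 2
# Half of the mass must move from word 1 to word 2, so the cost is
# 0.5 * dist(word 1, word 2) = 0.5 * sqrt(5) ~= 1.118.
print(emd(bow_i, bow_j, D))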
def get_twodim_reps(reps, seed, distance=euclidean_distances):
    reps = reps.astype(np.float64)
    similarities = distance(reps)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=seed)
    return mds.fit(similarities).embedding_
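# Usage sketch for get_twodim_reps() above. The function assumes numpy, MDS
# and euclidean_distances are already in scope, e.g.:
#   import numpy as np
#   from sklearn.manifold import MDS
#   from sklearn.metrics.pairwise import euclidean_distances
reps = np.random.RandomState(0).rand(20, 10)   # 20 items, 10-dim representations
coords = get_twodim_reps(reps, seed=42)        # metric MDS on the distance matrix
print(coords.shape)                            # (20, 2)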
def test_random_projection_embedding_quality():
    data, _ = make_sparse_random_data(8, 5000, 15000)
    eps = 0.2

    original_distances = euclidean_distances(data, squared=True)
    original_distances = original_distances.ravel()
    non_identical = original_distances != 0.0

    # remove 0 distances to avoid division by 0
    original_distances = original_distances[non_identical]

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components='auto', eps=eps, random_state=0)
        projected = rp.fit_transform(data)

        projected_distances = euclidean_distances(projected, squared=True)
        projected_distances = projected_distances.ravel()

        # remove 0 distances to avoid division by 0
        projected_distances = projected_distances[non_identical]

        distances_ratio = projected_distances / original_distances

        # check that the automatically tuned values for the density respect the
        # contract for eps: pairwise distances are preserved according to the
        # Johnson-Lindenstrauss lemma
        assert_less(distances_ratio.max(), 1 + eps)
        assert_less(1 - eps, distances_ratio.min())
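# A smaller, self-contained version of the same check, using only public
# scikit-learn pieces instead of the private test helpers
# (make_sparse_random_data, all_RandomProjection, assert_less).
import numpy as np
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
data = rng.rand(20, 3000)
eps = 0.5

rp = GaussianRandomProjection(n_components='auto', eps=eps, random_state=0)
projected = rp.fit_transform(data)

orig = euclidean_distances(data, squared=True).ravel()
proj = euclidean_distances(projected, squared=True).ravel()
nonzero = orig != 0.0
ratio = proj[nonzero] / orig[nonzero]

# With high probability the ratios fall inside [1 - eps, 1 + eps],
# per the Johnson-Lindenstrauss lemma.
print(ratio.min(), ratio.max())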
def test_affinity_propagation():
    # Affinity Propagation algorithm
    # Compute similarities
    S = -euclidean_distances(X, squared=True)
    preference = np.median(S) * 10

    # Compute Affinity Propagation
    cluster_centers_indices, labels = affinity_propagation(
        S, preference=preference)
    n_clusters_ = len(cluster_centers_indices)
    assert_equal(n_clusters, n_clusters_)

    af = AffinityPropagation(preference=preference, affinity="precomputed")
    labels_precomputed = af.fit(S).labels_

    af = AffinityPropagation(preference=preference, verbose=True)
    labels = af.fit(X).labels_

    assert_array_equal(labels, labels_precomputed)

    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    assert_equal(np.unique(labels).size, n_clusters_)
    assert_equal(n_clusters, n_clusters_)

    # Test also with no copy
    _, labels_no_copy = affinity_propagation(S, preference=preference,
                                             copy=False)
    assert_array_equal(labels, labels_no_copy)

    # Test input validation
    assert_raises(ValueError, affinity_propagation, S[:, :-1])
    assert_raises(ValueError, affinity_propagation, S, damping=0)
    af = AffinityPropagation(affinity="unknown")
    assert_raises(ValueError, af.fit, X)
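# The test above relies on module-level fixtures (X, n_clusters) from the
# scikit-learn test suite. A self-contained miniature of the same idea,
# with toy blob data chosen purely for illustration:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.4, random_state=0)

# Precomputed similarities: negative squared euclidean distances.
S = -euclidean_distances(X, squared=True)
preference = np.median(S)

af = AffinityPropagation(affinity="precomputed", preference=preference).fit(S)
print(len(af.cluster_centers_indices_))   # number of clusters found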