def predict(self, X):
"""Predict ranking values for new data.
Parameters
----------
X : array, shape (n_test, n_features)
Test data
Returns
-------
y : array, shape (n_test,)
Ranking values
"""
n_features = X.shape[1]
if self.n_features != n_features:
raise ValueError("Expected %d dimensions, got %d"
% (self.n_features, n_features))
    K = euclidean_distances(self.X, X, squared=True)
    K /= self.denom
    np.exp(K, K)  # in-place exp: K becomes the RBF kernel matrix
    return np.sum(self.alpha[:, np.newaxis] * (K[:-1] - K[1:]), axis=0)
def transform(self, X):
"""
Transform X into subcluster centroids dimension.
Each dimension represents the distance from the sample point to each
cluster centroid.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data.
Returns
-------
X_trans : {array-like, sparse matrix}, shape (n_samples, n_clusters)
Transformed data.
"""
check_is_fitted(self, 'subcluster_centers_')
return euclidean_distances(X, self.subcluster_centers_)
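A minimal usage sketch (assuming scikit-learn's Birch, whose fitted estimators expose subcluster_centers_): transform maps each sample to its distances from all subcluster centroids.

import numpy as np
from sklearn.cluster import Birch

rng = np.random.RandomState(0)
X = rng.random_sample((20, 3))
brc = Birch(n_clusters=None).fit(X)
X_trans = brc.transform(X)  # shape (20, n_subclusters)
print(X_trans.shape)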
def _select_target_neighbors(self):
"""Find the target neighbors of each sample, that stay fixed during training.
Returns
-------
array_like
An array of neighbors indices for each sample with shape (n_samples, n_neighbors).
"""
self.logger.info('Finding target neighbors...')
target_neighbors = np.empty((self.X_.shape[0], self.n_neighbors_), dtype=int)
for class_ in self.classes_:
class_ind, = np.where(np.equal(self.y_, class_))
dist = euclidean_distances(self.X_[class_ind], squared=True)
np.fill_diagonal(dist, np.inf)
neigh_ind = np.argpartition(dist, self.n_neighbors_ - 1, axis=1)
neigh_ind = neigh_ind[:, :self.n_neighbors_]
# argpartition doesn't guarantee sorted order, so we sort again but only the k neighbors
row_ind = np.arange(len(class_ind))[:, None]
neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
target_neighbors[class_ind] = class_ind[neigh_ind]
return target_neighbors
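A self-contained sketch of the neighbor-selection trick used above: argpartition finds the k smallest distances per row in linear time, and a local argsort then orders only those k entries.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.random_sample((6, 2))
k = 2
dist = euclidean_distances(X, squared=True)
np.fill_diagonal(dist, np.inf)  # exclude self-distances
neigh_ind = np.argpartition(dist, k - 1, axis=1)[:, :k]
row_ind = np.arange(len(X))[:, None]
neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
print(neigh_ind)  # k nearest neighbors per row, closest first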
def eta_L2(self):
    # Note that V should be positive.
    return self.V * np.sum(euclidean_distances(self.eta, squared=True))
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        # Negate so that larger values mean closer, matching cosine similarity
        sim = -euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # Get the indices of the `top` most similar rows, best first
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
def compare_distances(self, train_img, cluster):
    # The SIFT algorithm sometimes matches random points on screen, so we
    # check the Euclidean distances between the matched points and the
    # first k-means cluster center.
    distances = euclidean_distances([self.kmeans.cluster_centers_[0]], cluster)
    height, width = train_img.shape
    new_cluster = []
    # Points farther than np.sqrt((width / 2) ** 2 + (height / 2) ** 2) are
    # assumed to be incorrect matches. This only works when the query image
    # has the same dimensions as the training image.
    for index, distance in enumerate(distances[0]):
        if distance <= np.sqrt((width / 2) ** 2 + (height / 2) ** 2):
            new_cluster.append(cluster[index])
    return new_cluster
def fit(self, X):
"""Fit ranking SVM.
Parameters
----------
X : array, shape (n_samples, n_features)
Training data, sorted, highest rank first
"""
self.n_samples, self.n_features = X.shape
self.n_alpha = self.n_samples - 1
self.X = X
if self.n_samples < 2:
raise ValueError("Expected at least 2 training samples, got %d"
% self.n_samples)
random_state = check_random_state(self.random_state)
n_iter = self.n_iter
if n_iter < 0:
n_iter = int(50000 * np.sqrt(self.n_features))
    K = euclidean_distances(self.X, squared=True)
    # Mean pairwise distance between training samples
    sigma = np.sqrt(K).sum() / ((self.n_samples - 1) * self.n_samples)
    sigma *= self.c_sigma
    self.denom = -np.maximum(2.0 * sigma ** 2, MACHINE_EPSILON)
    K /= self.denom
    np.exp(K, K)  # in-place exp: K becomes the RBF kernel matrix
# Constraint violation cost
Ci = np.linspace(self.n_alpha, 1, self.n_alpha) ** self.c_pow
Ci *= 10 ** self.c_base
# Optimize alpha parameters
self.alpha = optimize(Ci, K, 1.0, n_iter, random_state)
return self
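A quick numeric check of the sigma computation above: for a squared-distance matrix K, sqrt(K).sum() / (n * (n - 1)) equals the mean distance over all ordered pairs, since the zero diagonal contributes nothing.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.random_sample((8, 3))
n = len(X)
K = euclidean_distances(X, squared=True)
sigma = np.sqrt(K).sum() / ((n - 1) * n)
d = euclidean_distances(X)
assert np.isclose(sigma, d[~np.eye(n, dtype=bool)].mean())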
def generateClustering(self, assignment_proba, centroids, drop_annotated_instances = False,
cluster_labels = None):
self.clusters = [Cluster() for x in range(self.num_clusters)]
if cluster_labels is not None:
for x in range(self.num_clusters):
self.clusters[x].label = cluster_labels[x]
ids = self.instances.getIds()
for i in range(len(ids)):
instance_id = ids[i]
annotated = self.instances.isAnnotated(instance_id)
c = self.assigned_clusters[i]
proba = None
if assignment_proba is not None:
proba = assignment_proba[i, :]
label = self.instances.getLabel(instance_id)
family = self.instances.getFamily(instance_id)
        if centroids is not None:
            # Reshape to 2-D arrays because euclidean_distances does not
            # accept 1-D input
            centroid = centroids[c].reshape(1, -1)
            features = self.instances.getInstance(instance_id).reshape(1, -1)
            distance = euclidean_distances(centroid, features)[0][0]
else:
distance = None
self.clusters[c].addInstance(instance_id, distance, label, family, annotated)
unknown_cluster_id = 0
for c in range(self.num_clusters):
unknown_cluster_id = self.clusters[c].finalComputation(unknown_cluster_id)
def substract_picks(self, path):
oldpicks = self._picks.copy()
    with open(path, 'r') as f:
        regions = yaml.safe_load(f)  # safe_load: the file only holds plain data
self._picks = regions['Centers']
diameter = regions['Diameter']
x_cord = np.array([_[0] for _ in self._picks])
y_cord = np.array([_[1] for _ in self._picks])
x_cord_old = np.array([_[0] for _ in oldpicks])
y_cord_old = np.array([_[1] for _ in oldpicks])
    # True for every old pick that lies within diameter/2 of some new pick
    close_to_new = (euclidean_distances(oldpicks, self._picks) < diameter / 2).any(axis=1)
    filtered_list = [i for (i, v) in zip(oldpicks, close_to_new) if not v]
x_cord_new = np.array([_[0] for _ in filtered_list])
y_cord_new = np.array([_[1] for _ in filtered_list])
output = False
if output:
fig1 = plt.figure()
plt.title('Old picks and new picks')
plt.scatter(x_cord,-y_cord, c='r', label='Newpicks')
plt.scatter(x_cord_old,-y_cord_old, c='b', label='Oldpicks')
plt.scatter(x_cord_new,-y_cord_new, c='g', label='Picks to keep')
fig1.show()
self._picks = filtered_list
self.update_pick_info_short()
self.window.tools_settings_dialog.pick_diameter.setValue(regions['Diameter'])
self.update_scene(picks_only=True)
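The filtering step in isolation, as a sketch with toy coordinates: an old pick is dropped when it lies within diameter/2 of any new pick.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

old_picks = np.array([[0.0, 0.0], [5.0, 5.0], [9.0, 1.0]])
new_picks = np.array([[0.2, 0.1], [8.8, 1.2]])
diameter = 1.0
close = (euclidean_distances(old_picks, new_picks) < diameter / 2).any(axis=1)
print(old_picks[~close])  # picks to keep: [[5., 5.]]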
def compute(self):
"""
Compute distance matrix.
Returns
-------
D: array, shape = [m, n]
Distance matrix.
"""
return euclidean_distances(self.X, self.Y, squared=True)
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    """Compute the first periodic kernel between *X* and *Y*:

        k(x, y) = exp(-a * ||x - y||^2) * (1 + cos(b * ||x - y||)) / c,

    where a = -log(gamma) / period, b = 2 * pi / period and
    c = sqrt(pi / a) * (exp(-b ** 2 / (4 * a)) + 1).

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float, default None
        If None, defaults to 0.8.
    period : float, default None
        If None, defaults to 2 * pi. This parameter should not be left at
        its default, since a wrong period estimate leads to poor learning
        scores.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
X, Y = check_pairwise_arrays(X, Y)
if gamma is None:
gamma = 0.8
if period is None:
period = 2. * pi
a = -log(gamma) / period
b = 2 * pi / period
c = sqrt(pi / a) * (exp(- b ** 2 / (4 * a)) + 1)
K = euclidean_distances(X, Y, squared=True)
# TODO: Optimize to avoid temporary?
return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
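A usage sketch on 1-D inputs (assuming the module-level imports the snippet relies on: check_pairwise_arrays plus exp, cos, log, sqrt and pi); the period is passed explicitly, per the warning in the docstring.

import numpy as np

t = np.linspace(0, 4 * np.pi, 50).reshape(-1, 1)
K = first_periodic_kernel(t, t, gamma=0.8, period=2 * np.pi)
print(K.shape)  # (50, 50)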
def fit(self, X, y):
    eucl = euclidean_distances(X)
    k = self.k
    # Grow k until the k-nearest-neighbor similarity graph is connected
    while True:
        simi_m = 1 / (1 + eucl)
        # Zero out all but the k+1 largest similarities per row (self + k neighbors)
        to_remove = simi_m.shape[0] - (k + 1)
        for vec in simi_m:
            vec[vec.argsort()[:to_remove]] = 0
        g = Graph.Weighted_Adjacency(simi_m.tolist(), mode=ADJ_UNDIRECTED, loops=False)
        if g.is_connected():
            break
        k += 1
    self.k = k
comm = g.community_multilevel()
self.y_comm = np.array(comm.membership)
self.y = y
self.X = X
    self.mapping = {}
    # Map each community to the most common class label among its members
    for c in set(comm.membership):
        com_clas = self.y[self.y_comm == c]
        self.mapping[c] = Counter(com_clas).most_common(1)[0][0]
def predict(self, X):
y_pred = []
for x in X:
dists = euclidean_distances([x], self.X)[0]
simi_m = 1 / (1 + dists)
nearest_com = self.y_comm[simi_m.argsort()[-self.k:]]
y_pred.append(self.mapping[Counter(nearest_com).most_common(1)[0][0]])
return np.array(y_pred)
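A hypothetical usage sketch, assuming the fit/predict methods above belong to one classifier class (named CommunityKNN here purely for illustration) and that python-igraph is installed:

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(5, 1, (20, 2))])
y = np.array([0] * 20 + [1] * 20)
clf = CommunityKNN(k=5)  # hypothetical wrapper class for the methods above
clf.fit(X, y)
print(clf.predict(X[:5]))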
def dist_matrices(X1, X2, criterion='euclidean'):
X1loc = np.array(X1)
X2loc = np.array(X2)
if len(X1loc.shape) == 1:
if len(X2loc.shape) == 1:
if X1loc.shape[0] == X2loc.shape[0]:
# As row vectors
X1loc = X1loc.reshape(1, -1)
X2loc = X2loc.reshape(1, -1)
else:
# As column vectors
X1loc = X1loc.reshape(-1, 1)
X2loc = X2loc.reshape(-1, 1)
else:
if X1loc.shape[0] == X2loc.shape[1]:
# Row vector VS. Many rows
X1loc = X1loc.reshape(1, -1)
elif X2loc.shape[1] == 1:
# Column vector VS. Column vector
X1loc = X1loc.reshape(-1, 1)
elif X1loc.shape[0] == X2loc.shape[0]:
# Row vector VS. transposed columns
X1loc = X1loc.reshape(1, -1)
X2loc = X2loc.transpose()
else:
raise ValueError('Invalid dimensions of X1 and X2')
elif len(X2loc.shape) == 1:
if X2loc.shape[0] == X1loc.shape[1]:
# Many rows VS. row vector
X2loc = X2loc.reshape(1, -1)
else:
raise ValueError('Invalid dimensions of X1 and X2')
if criterion == 'euclidean':
return skdists.euclidean_distances(X1loc, X2loc)
elif criterion == 'hamming':
raise NotImplementedError('Hamming distance between rows of matrices has not been implemented yet.')
else:
raise ValueError('Invalid distance criterion')
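A quick usage sketch (assuming the module imports sklearn.metrics.pairwise as skdists, which the function requires): 1-D inputs are promoted to whichever orientation makes the shapes compatible.

print(dist_matrices([0, 0], [[0, 3], [4, 0]]))  # [[3. 4.]]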
def process(self, rows_slice):
    tmp = self.array[rows_slice, ...]
    result = -euclidean_distances(tmp, self.array, squared=True)
    # Serialize HDF5 writes across workers
    with Worker.hdf5_lock:
        with tables.open_file(self.hdf5_file, 'r+') as fileh:
            hdf5_array = fileh.get_node(self.path)
            hdf5_array[rows_slice, ...] = result
    del tmp
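A sketch of the surrounding pattern, inferred from context (file name, node path and shapes are illustrative): a pre-allocated HDF5 array receives one slice of the negative squared-distance matrix per call.

import numpy as np
import tables
from sklearn.metrics.pairwise import euclidean_distances

data = np.random.random_sample((100, 5))
with tables.open_file('distances.h5', 'w') as fileh:
    out = fileh.create_carray('/', 'S', tables.Float64Atom(), shape=(100, 100))
    for start in range(0, 100, 25):
        sl = slice(start, start + 25)
        out[sl, :] = -euclidean_distances(data[sl], data, squared=True)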
def test_euclidean_distances():
# Check the pairwise Euclidean distances computation
X = [[0]]
Y = [[1], [2]]
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])
X = csr_matrix(X)
Y = csr_matrix(Y)
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])
rng = np.random.RandomState(0)
X = rng.random_sample((10, 4))
Y = rng.random_sample((20, 4))
X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)
# check that we still get the right answers with {X,Y}_norm_squared
D1 = euclidean_distances(X, Y)
D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
Y_norm_squared=Y_norm_sq)
assert_array_almost_equal(D2, D1)
assert_array_almost_equal(D3, D1)
assert_array_almost_equal(D4, D1)
    # check that we get the wrong answer with zeroed {X,Y}_norm_squared
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
assert_greater(np.max(np.abs(wrong_D - D1)), .01)
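A sketch of when the {X,Y}_norm_squared shortcut pays off: precompute the norms of a fixed Y once and reuse them across repeated queries.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
Y = rng.random_sample((1000, 16))
Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)  # computed once
for _ in range(3):
    X = rng.random_sample((10, 16))
    D = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)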
def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None, weight_initilization='k-means++', seed=42, n_init=40,
max_iter=300):
    # weight_initilization = {'kmeans-pca', 'k-means++', 'random', None}
if weight_initilization == 'kmeans-pca':
start_time = timeit.default_timer()
pca = PCA(n_components=nClusters).fit(encoder_val_clean)
kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters, n_init=1, max_iter=300, random_state=seed)
y_pred = kmeans_model.fit_predict(encoder_val_clean)
centroids = kmeans_model.cluster_centers_.T
centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))
end_time = timeit.default_timer()
elif weight_initilization == 'k-means++':
start_time = timeit.default_timer()
kmeans_model = KMeans(init='k-means++', n_clusters=nClusters, n_init=n_init, max_iter=max_iter, n_jobs=15,
random_state=seed)
y_pred = kmeans_model.fit_predict(encoder_val_clean)
        # Fuzzy-c-means-style soft assignment weights; computed but not used below
        D = 1.0 / euclidean_distances(encoder_val_clean, kmeans_model.cluster_centers_, squared=True)
        D **= 2.0 / (2 - 1)
        D /= np.sum(D, axis=1)[:, np.newaxis]
centroids = kmeans_model.cluster_centers_.T
centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))
end_time = timeit.default_timer()
    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred), '\t ari =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_), '\t runtime =', end_time - start_time)
    if y_pred_prev is not None:
        print('Different Assignments: ', np.sum(y_pred != y_pred_prev), '\tbestMap: ', bestMap(y_pred, y_pred_prev),
              '\tdatapoints-bestMap*datapoints: ',
              encoder_val_clean.shape[0] - bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0])
return centroids, kmeans_model.inertia_, y_pred
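The 'kmeans-pca' branch in isolation, as a minimal sketch: scikit-learn's KMeans accepts an explicit array of initial centers, in which case n_init=1 is appropriate.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.RandomState(42)
X = rng.random_sample((200, 10))
pca = PCA(n_components=3).fit(X)
km = KMeans(init=pca.components_, n_clusters=3, n_init=1).fit(X)
print(km.inertia_)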
def _split_node(node, threshold, branching_factor):
"""The node has to be split if there is no place for a new subcluster
in the node.
1. Two empty nodes and two empty subclusters are initialized.
2. The pair of distant subclusters are found.
3. The properties of the empty subclusters and nodes are updated
according to the nearest distance between the subclusters to the
pair of distant subclusters.
4. The two nodes are set as children to the two subclusters.
"""
new_subcluster1 = _CFSubcluster()
new_subcluster2 = _CFSubcluster()
new_node1 = _CFNode(
threshold, branching_factor, is_leaf=node.is_leaf,
n_features=node.n_features)
new_node2 = _CFNode(
threshold, branching_factor, is_leaf=node.is_leaf,
n_features=node.n_features)
new_subcluster1.child_ = new_node1
new_subcluster2.child_ = new_node2
if node.is_leaf:
if node.prev_leaf_ is not None:
node.prev_leaf_.next_leaf_ = new_node1
new_node1.prev_leaf_ = node.prev_leaf_
new_node1.next_leaf_ = new_node2
new_node2.prev_leaf_ = new_node1
new_node2.next_leaf_ = node.next_leaf_
if node.next_leaf_ is not None:
node.next_leaf_.prev_leaf_ = new_node2
    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
    n_clusters = dist.shape[0]
    farthest_idx = np.unravel_index(
        dist.argmax(), (n_clusters, n_clusters))
    # Distances from every subcluster to each member of the farthest pair
    node1_dist, node2_dist = dist[[farthest_idx[0], farthest_idx[1]]]
    node1_closer = node1_dist < node2_dist
for idx, subcluster in enumerate(node.subclusters_):
if node1_closer[idx]:
new_node1.append_subcluster(subcluster)
new_subcluster1.update(subcluster)
else:
new_node2.append_subcluster(subcluster)
new_subcluster2.update(subcluster)
return new_subcluster1, new_subcluster2
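The farthest-pair step in isolation: argmax on the flattened distance matrix plus unravel_index recovers the (row, column) of its largest entry.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
C = rng.random_sample((5, 2))
dist = euclidean_distances(C, squared=True)
i, j = np.unravel_index(dist.argmax(), dist.shape)
print(i, j, dist[i, j])  # indices of the two most distant centroids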
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False, batch_size=500):
"""Find impostor pairs in chunks to avoid large memory usage
Parameters
----------
x1 : array_like
An array of transformed data samples with shape (n_samples, n_features).
x2 : array_like
An array of transformed data samples with shape (m_samples, n_features) where m_samples < n_samples.
t1 : array_like
An array of distances to the margins with shape (n_samples,).
t2 : array_like
An array of distances to the margins with shape (m_samples,).
batch_size : int (Default value = 500)
The size of each chunk of x1 to compute distances to.
return_dist : bool (Default value = False)
Whether to return the distances to the impostors.
Returns
-------
tuple: (array_like, array_like, [array_like])
imp1 : array_like
An array of sample indices with shape (n_impostors,).
imp2 : array_like
An array of sample indices that violate a margin with shape (n_impostors,).
dist : array_like, optional
An array of pairwise distances of (imp1, imp2) with shape (n_impostors,).
"""
n, m = len(t1), len(t2)
imp1, imp2, dist = [], [], []
for chunk in gen_batches(n, batch_size):
dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
i1, j1 = np.where(dist_out_in < t1[chunk, None])
i2, j2 = np.where(dist_out_in < t2[None, :])
if len(i1):
imp1.extend(i1 + chunk.start)
imp2.extend(j1)
if return_dist:
dist.extend(dist_out_in[i1, j1])
if len(i2):
imp1.extend(i2 + chunk.start)
imp2.extend(j2)
if return_dist:
dist.extend(dist_out_in[i2, j2])
if return_dist:
return imp1, imp2, dist
else:
return imp1, imp2
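The chunking pattern in isolation: sklearn.utils.gen_batches yields slices so that only one (batch_size x m) distance block is in memory at a time.

import numpy as np
from sklearn.utils import gen_batches
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
x1 = rng.random_sample((1000, 8))
x2 = rng.random_sample((300, 8))
for chunk in gen_batches(len(x1), 500):
    block = euclidean_distances(x1[chunk], x2, squared=True)  # at most 500 x 300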
def reorderClusters(B, X, GDM, returnOrderIndices=False):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters
    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets
    if K == 0:
        return (Bloc, np.zeros(0, dtype=int)) if returnOrderIndices else Bloc
# Find Cmeans and distances between clusters
Cmeans = np.array([None] * L, dtype=object)
D = np.zeros([K, K, L]) # KxKxL
for l in range(L):
Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float) # (K) x (X[l] samples)
for k in range(K):
Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
D[:, :, l] = skdists.euclidean_distances(Cmeans[l]) # KxK
D = np.median(D, axis=2) # KxK
    # Keep the first cluster first, then repeatedly append the closest
    # remaining cluster
    B_ordered[:, 0] = Bloc[:, 0]
I = np.zeros(K, dtype=int)
I[0] = 0
clustersDone = np.zeros(K, dtype=bool)
clustersDone[0] = True
for k in range(1,K):
relevantD = D[I[k-1], ~clustersDone]
clustersLeft = np.nonzero(~clustersDone)[0]
nextCluster = np.argmin(relevantD)
nextCluster = clustersLeft[nextCluster]
B_ordered[:, k] = Bloc[:, nextCluster]
I[k] = nextCluster
clustersDone[nextCluster] = True
if returnOrderIndices:
return (B_ordered, I)
else:
return B_ordered
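A toy sketch of the greedy ordering above: start from cluster 0 and repeatedly hop to the nearest not-yet-visited cluster in the (median) distance matrix.

import numpy as np

D = np.array([[0.0, 2.0, 9.0],
              [2.0, 0.0, 4.0],
              [9.0, 4.0, 0.0]])
order = [0]
done = np.zeros(3, dtype=bool)
done[0] = True
for _ in range(2):
    remaining = np.nonzero(~done)[0]
    nxt = remaining[np.argmin(D[order[-1], ~done])]
    order.append(nxt)
    done[nxt] = True
print(order)  # visits 0, then 1, then 2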
def set_preference(data, chunk_size):
"""Return the median of the distribution of pairwise L2 Euclidean distances
between samples (the rows of 'data') as the default preference parameter
for Affinity Propagation clustering.
Parameters
----------
data : array of shape (N_samples, N_features)
The data-set submitted for Affinity Propagation clustering.
chunk_size : int
The size of random subsamples from the data-set whose similarity
matrix is computed. The resulting median of the distribution of
pairwise distances between the data-points selected as part of a
given subsample is stored into a list of medians.
Returns
-------
preference : float
The preference parameter for Affinity Propagation clustering is computed
as the median of the list of median pairwise distances between the data-points
selected as part of each of 15 rounds of random subsampling.
"""
    N_samples, N_features = data.shape
    rng = np.arange(0, N_samples, dtype=int)
    medians = []
    for i in range(15):
        selected_samples = np.random.choice(N_samples, size=chunk_size, replace=False)
        samples = data[selected_samples, :]
        S = -euclidean_distances(samples, data, squared=True)
        # Indices of the distinct pairs (selected sample, other data-point),
        # excluding self-pairs and pairs counted twice among selected samples
        rows = np.zeros(0, dtype=int)
        for j in range(chunk_size):
            rows = np.append(rows, np.full(N_samples - j - 1, j, dtype=int))
        cols = np.zeros(0, dtype=int)
        for j in range(chunk_size):
            cols = np.append(cols, np.delete(rng, selected_samples[:j + 1]))
        triu_indices = tuple((rows, cols))
        medians.append(np.median(S[triu_indices]))
        del S
        if i % 4 == 3:
            gc.collect()
    preference = np.median(medians)
    return preference
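A usage sketch: because S holds negative squared distances, the returned median is directly usable as the preference parameter of scikit-learn's AffinityPropagation.

import numpy as np
from sklearn.cluster import AffinityPropagation

rng = np.random.RandomState(0)
data = rng.random_sample((300, 4))
pref = set_preference(data, chunk_size=50)
labels = AffinityPropagation(preference=pref).fit_predict(data)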