def _spatial_sort(glyph):
    """Reorder the points of a glyph in-place along a greedy
    nearest-neighbour path, starting from the leftmost point."""
    from scipy.spatial.distance import cdist
    from numpy import argsort
    from numpy import argmin

    curr = argmin(glyph[:, 0])
    visited = set([curr])
    order = [curr]

    dd = cdist(glyph, glyph)

    while len(visited) < len(glyph):
        row = dd[curr, :]
        for i in argsort(row):
            if row[i] <= 0.0 or i == curr or i in visited:
                continue
            order.append(i)
            visited.add(i)
            break
        # walk on from the point just added (without this the loop would
        # keep measuring distances from the starting point)
        curr = i

    glyph[:, :] = glyph[order, :]
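A minimal usage sketch (my own illustration, not from the original project): sorting a small random point set so that consecutive rows end up spatially close.

import numpy as np

rng = np.random.default_rng(0)
pts = rng.random((10, 2))  # 10 random 2-D points
_spatial_sort(pts)         # reorders rows in-place
# consecutive rows are now near each other:
steps = np.linalg.norm(np.diff(pts, axis=0), axis=1)
print(steps)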
Python cdist() usage examples
def _center_mahalanobis(self, data):
    """
    Find the point closest to the center of the data under the
    Mahalanobis distance.

    Parameters
    ----------
    data : numpy array
        Input data.

    Returns
    -------
    numpy array
        The data point with the smallest summed Mahalanobis distance
        to all other points.
    """
    distances = cdist(data, data, metric='mahalanobis', VI=self._inv_covar_matrices)
    sum_distances = np.sum(distances, axis=0)
    center_idx = np.argmin(sum_distances)
    return data[center_idx]
def log_likelihood(self, data):
    nks = np.bincount(self.labels_, minlength=self.n_clusters)  # number of points in each cluster
    n, d = data.shape
    log_likelihood = 0
    covar_matrices = self.covariances(self.labels_, cluster_centers=self.cluster_centers_, data=data)
    covar_matrix_det_v = np.linalg.det(covar_matrices)
    self._inv_covar_matrices = self._matrix_inverses(covar_matrices)
    for k, nk in enumerate(nks):
        if self.verbose == 1:
            print('log_likelihood: covar_matrix_det = {}'.format(covar_matrix_det_v[k]))
        term_1 = nk * (np.log(float(nk) / n) - 0.5 * d * np.log(2 * np.pi)
                       - 0.5 * np.log(abs(covar_matrix_det_v[k])))
        cdist_result = cdist(data[self.labels_ == k], np.array([self.cluster_centers_[k]]),
                             metric='mahalanobis', VI=self._inv_covar_matrices[k])
        cdist_no_nan = cdist_result[~np.isnan(cdist_result)]  # drop NaNs returned by cdist
        term_2 = -0.5 * np.sum(cdist_no_nan)
        log_likelihood += term_1 + term_2
    if np.isnan(log_likelihood) or np.isinf(log_likelihood):
        raise ValueError('log-likelihood is NaN or infinite')
    return log_likelihood
def _assign_posterior(self):
    """Assign posterior to prior based on the Hungarian algorithm.

    Returns
    -------
    TFA
        Returns the instance itself.
    """
    prior_centers = self.get_centers(self.local_prior)
    posterior_centers = self.get_centers(self.local_posterior_)
    posterior_widths = self.get_widths(self.local_posterior_)
    # linear assignment on centers
    cost = distance.cdist(prior_centers, posterior_centers, 'euclidean')
    _, col_ind = linear_sum_assignment(cost)
    # reorder centers/widths based on the assignment
    self.set_centers(self.local_posterior_, posterior_centers[col_ind])
    self.set_widths(self.local_posterior_, posterior_widths[col_ind])
    return self
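To see what the Hungarian step above does in isolation, here is a self-contained sketch (the data is made up for illustration) matching two sets of cluster centers with scipy.optimize.linear_sum_assignment:

import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

prior = np.array([[0.0, 0.0], [5.0, 5.0], [0.0, 5.0]])
# the same centers, slightly perturbed and in a different order
posterior = np.array([[5.1, 4.9], [0.2, 5.1], [-0.1, 0.1]])

cost = cdist(prior, posterior, 'euclidean')     # pairwise matching cost
row_ind, col_ind = linear_sum_assignment(cost)  # optimal one-to-one matching
print(posterior[col_ind])  # posterior centers reordered to line up with prior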
leaflet-dask-delayed.py (project: MDAnalysis-with-Dask, author: Becksteinlab)
def Leaflet_finder(traj, i, j, ii, jj, len_chunks, cutoff):
    traj_path = os.path.abspath(os.path.normpath(os.path.join(os.getcwd(), traj)))
    g1 = np.load(traj_path, mmap_mode='r')[i:i + len_chunks]
    g2 = np.load(traj_path, mmap_mode='r')[j:j + len_chunks]

    # adjacency block: 1 where two atoms are within the cutoff
    block = np.zeros((len(g1), len(g2)), dtype=float)
    block[:, :] = cdist(g1, g2) <= cutoff
    adj_list = np.vstack(np.where(block))
    # shift local block indices to global atom indices
    adj_list[0] = adj_list[0] + ii * len_chunks
    adj_list[1] = adj_list[1] + jj * len_chunks
    if adj_list.shape[1] == 0:
        adj_list = np.zeros((2, 1))

    graph = nx.Graph()
    edges = [(adj_list[0, k], adj_list[1, k]) for k in range(adj_list.shape[1])]
    graph.add_edges_from(edges)
    # connected components, largest first
    leaflet = sorted(nx.connected_components(graph), key=len, reverse=True)
    l_connected = list(leaflet)  # Keep only connected components
    return l_connected
def Leaflet_finder(block, traj, cutoff, len_atom, len_chunks, block_id=None):
    id_0 = block_id[0]
    id_1 = block_id[1]
    # adjacency block: 1 where two atoms are within the cutoff
    block[:, :] = cdist(np.load(traj, mmap_mode='r')[id_0 * len_chunks:(id_0 + 1) * len_chunks],
                        np.load(traj, mmap_mode='r')[id_1 * len_chunks:(id_1 + 1) * len_chunks]) <= cutoff
    adj_list = np.vstack(np.where(block))
    # shift local block indices to global atom indices
    adj_list[0] = adj_list[0] + id_0 * len_chunks
    adj_list[1] = adj_list[1] + id_1 * len_chunks
    if adj_list.shape[1] == 0:
        adj_list = np.zeros((2, 1))

    graph = nx.Graph()
    edges = [(adj_list[0, k], adj_list[1, k]) for k in range(adj_list.shape[1])]
    graph.add_edges_from(edges)
    # wrap the components dict in an object array so dask can tile the blocks
    l = np.array({i: item for i, item in enumerate(sorted(nx.connected_components(graph)))},
                 dtype=object).reshape(1, 1)
    return l
def compute_document_similarity(X):
    '''
    From a matrix of unit-length document vectors, computes the cosine
    similarity, then maps it to an angular similarity in [0, 1]
    (whose complement is a proper metric).
    '''
    S = cdist(X, X, metric='cosine')
    S -= 1
    S *= -1            # cosine distance -> cosine similarity
    S[S > 1] = 1.0     # clip numerical noise outside [0, 1]
    S[S < 0] = 0.0
    # Set nan values to zero
    S[np.isnan(S)] = 0
    # Convert to angular similarity (its complement is a proper metric)
    S = 1 - (np.arccos(S) / np.pi)
    assert not np.isnan(S).any()
    assert not np.isinf(S).any()
    return S
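A quick sanity check (my own illustration): every document should be maximally similar to itself, and the matrix should be symmetric.

import numpy as np

rng = np.random.default_rng(1)
docs = rng.normal(size=(4, 16))
docs /= np.linalg.norm(docs, axis=1, keepdims=True)  # unit-normalize rows

S = compute_document_similarity(docs)
assert np.allclose(np.diag(S), 1.0)   # each document is identical to itself
assert np.allclose(S, S.T)            # similarity is symmetric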
def _compute_dispersion_matrix(X, labels):
    n = len(np.unique(labels))
    dist = np.zeros((n, n))
    ITR = list(itertools.combinations_with_replacement(range(n), 2))
    for i, j in tqdm(ITR):
        if i == j:
            # within-cluster: pdist already returns each unique pair once
            d = pdist(X[labels == i], metric='cosine')
        else:
            d = cdist(X[labels == i], X[labels == j], metric='cosine')
            # Only take upper diagonal (+diagonal elements)
            d = d[np.triu_indices(n=d.shape[0], m=d.shape[1], k=0)]
        dist[i, j] = dist[j, i] = d.mean()
    return dist
def coherence(vecs):
    """Mean pairwise cosine similarity of a list of vectors."""
    coh = 0.0
    counter = 0
    if len(vecs) > 1:
        matrix = np.array(vecs)
        dist_m = distance.cdist(matrix, matrix, 'cosine')
        for i in range(0, len(vecs) - 1):
            for j in range(i + 1, len(vecs)):
                cosine = 1 - dist_m[i][j]  # cosine distance -> similarity
                coh += cosine
                counter += 1
        coh = float(coh) / float(counter)
    else:
        coh = 1
    return coh
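A small illustration (my own toy data): near-parallel vectors score close to 1, orthogonal directions score 0.

import numpy as np
from scipy.spatial import distance

similar = [np.array([1.0, 0.0]), np.array([0.9, 0.1])]
mixed = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
print(coherence(similar))  # close to 1.0
print(coherence(mixed))    # 0.0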
###########################################
# Compute profile
###########################################
def update_atoms(S, b, H, a, Z, W, labels, weight=[1., 1.], max_iter=50):
    M = len(Z)
    M_h = [cdist(H[labels[m]].T, S.T, metric='euclidean') for m in range(M)]
    M_z = [cdist(Z[m].T, S.T, metric='euclidean') for m in range(M)]
    T_h = [algo3(a[labels[m]], b[m], M_h[m], max_iter=max_iter)[0] for m in range(M)]
    T_z = [algo3(W[m], b[m], M_z[m], max_iter=max_iter)[0] for m in range(M)]
    k = S.shape[1]
    for l in range(k):
        z_part = weight[0] * np.sum([(z * t[:, l]).sum(axis=1) for (z, t) in zip(Z, T_z)], axis=0)
        z_weight = weight[0] * np.sum([t[:, l].sum() for t in T_z])
        h_part = weight[1] * np.sum([(h * th[:, l]).sum(axis=1)
                                     for (h, th) in zip([H[labels[m]] for m in range(M)], T_h)], axis=0)
        h_weight = weight[1] * np.sum([th[:, l].sum() for th in T_h])
        # each atom is the transport-weighted average of the matched columns
        S[:, l] = (z_part + h_part) / (z_weight + h_weight)
#################### Learning functions for algorithms
## No constraint algorithm
## Initialization based on k-means
def compute_clients_dist(self, client_data):
    client_categorical_feats = [client_data.get(specified_key) for specified_key in CATEGORICAL_FEATURES]
    client_continuous_feats = [client_data.get(specified_key) for specified_key in CONTINUOUS_FEATURES]

    # Compute the distances between the user and the cached continuous
    # and categorical features.
    cont_features = distance.cdist(self.continuous_features,
                                   np.array([client_continuous_feats]),
                                   'canberra')
    # The lambda trick is needed to prevent |cdist| from force-casting the
    # string features to double.
    cat_features = distance.cdist(self.categorical_features,
                                  np.array([client_categorical_feats]),
                                  lambda x, y: distance.hamming(x, y))
    # Take the product of similarities to attain a univariate similarity score.
    # Addition of 0.001 to the continuous features avoids a zero value from the
    # categorical variables, allowing categorical features precedence.
    return (cont_features + 0.001) * cat_features
def pairdist(ann1, ann2, which):
    """
    Compute the pairwise Euclidean distances between the contour
    boundary points of two annotations, and return the minimum,
    maximum, or average value.

    which: str
        One of 'min', 'max', or 'avg'.
    """
    dists = cdist(ann1.contours_to_matrix(0),
                  ann2.contours_to_matrix(0))
    if which == 'min':
        return dists.min()
    elif which == 'max':
        return dists.max()
    elif which == 'avg':
        return dists.mean()
    else:
        raise ValueError('invalid `which` value.')
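The annotation objects above come from the surrounding project; the distance logic itself only needs two point matrices. A standalone sketch with raw boundary-point arrays as stand-ins (my own example data):

import numpy as np
from scipy.spatial.distance import cdist

boundary_a = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0]])
boundary_b = np.array([[3.0, 0.0], [3.0, 1.0]])

dists = cdist(boundary_a, boundary_b)
print(dists.min(), dists.max(), dists.mean())  # min/max/avg boundary separation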
def __init__(self, classData, k=1, distMetric='euclidean', *args, **kwargs):
    Classifier.__init__(self, util.colmat(classData[0]).shape[1],
                        len(classData))
    self.k = k

    minObs = min([len(cls) for cls in classData])
    if self.k > minObs:
        raise ValueError('k=%d exceeds the number of examples in the '
                         'smallest training class (%d).' % (k, minObs))

    if callable(distMetric):
        self.distFunc = lambda x1, x2: distMetric(x1, x2, *args, **kwargs)
    else:
        self.distFunc = lambda x1, x2: spdist.cdist(x1, x2, metric=distMetric)

    self.trainData = classData
def modified_hausdorf_distance(itemA, itemB):
    # Modified Hausdorff Distance
    #
    # Input
    #  itemA : [n x 2] coordinates of black pixels
    #  itemB : [m x 2] coordinates of black pixels
    #
    # M.-P. Dubuisson, A. K. Jain (1994). A modified Hausdorff distance for object matching.
    # International Conference on Pattern Recognition, pp. 566-568.
    #
    D = cdist(itemA, itemB)
    mindist_A = D.min(axis=1)  # for each point in A, distance to nearest point in B
    mindist_B = D.min(axis=0)  # for each point in B, distance to nearest point in A
    mean_A = np.mean(mindist_A)
    mean_B = np.mean(mindist_B)
    return max(mean_A, mean_B)
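A quick check of the metric's behaviour (my own toy data): shifting one point set increases the distance by roughly the size of the shift.

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(2)
A = rng.random((50, 2))
B = A + np.array([3.0, 0.0])   # same shape, shifted 3 units right

print(modified_hausdorf_distance(A, A))  # 0.0 for identical sets
print(modified_hausdorf_distance(A, B))  # approximately 3.0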
def l2norm_(X, Xstar):
    """
    Wrapper function to compute the L2 norm.

    Parameters
    ----------
    X: np.ndarray, shape=((n, nfeatures))
        Instances.
    Xstar: np.ndarray, shape=((m, nfeatures))
        Instances.

    Returns
    -------
    np.ndarray
        Pairwise Euclidean distance between row pairs of `X` and `Xstar`.
    """
    return cdist(X, Xstar)
def kronDelta(X, Xstar):
    """
    Computes the Kronecker delta for rows in X and Xstar.

    Parameters
    ----------
    X: np.ndarray, shape=((n, nfeatures))
        Instances.
    Xstar: np.ndarray, shape=((m, nfeatures))
        Instances.

    Returns
    -------
    np.ndarray
        Kronecker delta between row pairs of `X` and `Xstar`.
    """
    return cdist(X, Xstar) < np.finfo(np.float32).eps
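Illustration (my own): rows that coincide exactly get a 1, everything else a 0.

import numpy as np
from scipy.spatial.distance import cdist

X = np.array([[0.0, 0.0], [1.0, 2.0]])
Xstar = np.array([[1.0, 2.0], [5.0, 5.0]])

print(kronDelta(X, Xstar).astype(int))
# [[0 0]
#  [1 0]]  -- only X[1] matches Xstar[0]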
def get_knn_score_for(tree, k=5):
    tree = tree_copy_with_start(tree)
    tree_encoding = encoder.get_encoding([None, tree])  # This makes sure that token-based things fail
    tree_str_rep = str(tree)
    distances = cdist(np.atleast_2d(tree_encoding), encodings, 'cosine')
    knns = np.argsort(distances)[0]
    num_non_identical_nns = 0
    sum_equiv_nns = 0
    current_i = 0
    # walk the neighbour list until k non-identical neighbours have been seen
    # (or the equivalence class runs out of other members)
    while (num_non_identical_nns < k and current_i < len(knns)
           and eq_class_counts[tree.symbol] - 1 > num_non_identical_nns):
        expression_idx = knns[current_i]
        current_i += 1
        if (eq_class_idx_to_names[expression_data[expression_idx]['eq_class']] == tree.symbol
                and str(expression_data[expression_idx]['tree']) == tree_str_rep):
            continue  # This is an identical expression, move on
        num_non_identical_nns += 1
        if eq_class_idx_to_names[expression_data[expression_idx]['eq_class']] == tree.symbol:
            sum_equiv_nns += 1
    return "(%s-nn-stat: %s)" % (k, sum_equiv_nns / k)
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X=Y, use the more efficient pdist call, which exploits symmetry
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert shape(X)[1] == shape(Y)[1]
        dists = cdist(X, Y, 'euclidean')
    if self.nu == 0.5:
        # for nu=1/2, the Matern class corresponds to the Ornstein-Uhlenbeck process
        K = (self.sigma**2.) * exp(-dists / self.width)
    elif self.nu == 1.5:
        K = (self.sigma**2.) * (1 + sqrt(3.) * dists / self.width) * exp(-sqrt(3.) * dists / self.width)
    elif self.nu == 2.5:
        K = (self.sigma**2.) * (1 + sqrt(5.) * dists / self.width
                                + 5.0 * (dists**2.) / (3.0 * self.width**2.)) * exp(-sqrt(5.) * dists / self.width)
    else:
        raise NotImplementedError()
    return K
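The same Matern-3/2 formula as a standalone function, for trying it outside the class (a sketch under the assumption that sigma and width are plain floats):

import numpy as np
from scipy.spatial.distance import cdist

def matern32(X, Y, sigma=1.0, width=1.0):
    """Matern kernel, nu=3/2: k(r) = sigma^2 (1 + sqrt(3) r / w) exp(-sqrt(3) r / w)."""
    r = cdist(X, Y, 'euclidean')
    scaled = np.sqrt(3.0) * r / width
    return sigma**2 * (1.0 + scaled) * np.exp(-scaled)

X = np.linspace(0, 1, 5).reshape(-1, 1)
K = matern32(X, X)
print(np.diag(K))  # ones on the diagonal: k(0) = sigma^2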
def calc_arom_facing_norms(arom_a_coords, arom_b_coords):
    """Given two aromatic rings get the normal vectors that face the other ring"""
    centroids = [calc_centroid(arom_coords) for arom_coords in [arom_a_coords, arom_b_coords]]
    arom_norms = calc_arom_norms(arom_a_coords, arom_b_coords)

    face_norms = []
    for i, arom_norm in enumerate(arom_norms):
        # get the index of the other arom
        j = 1 if i == 0 else 0
        norm = calc_facing_vector(arom_norm + centroids[i], centroids[j])
        # norm_up = arom_norm
        # norm_down = -1 * arom_norm
        # # get the norm so that it points to the other ring
        # d_up = euclidean(norm_up + centroids[i], centroids[j])
        # d_down = cdist(norm_down + centroids[i], centroids[j])
        # norm = norm_up if d_up < d_down else norm_down
        face_norms.append(norm)
    return face_norms
def calc_arom_facing_norms(arom_a_coords, arom_b_coords):
    """Given two aromatic rings get the normal vectors that face the other ring"""
    centroids = calc_centroids(arom_a_coords, arom_b_coords)
    arom_norms = calc_arom_norms(arom_a_coords, arom_b_coords)

    face_norms = []
    for i, arom_norm in enumerate(arom_norms):
        # get the index of the other arom
        j = 1 if i == 0 else 0
        norm_up = arom_norm
        norm_down = -1 * arom_norm
        # choose the norm that points towards the other ring
        d_up = cdist([norm_up + centroids[i]], [centroids[j]])
        d_down = cdist([norm_down + centroids[i]], [centroids[j]])
        norm = norm_up if d_up < d_down else norm_down
        face_norms.append(norm)
    return face_norms
def min_l1_dist(m1, m2):
    assert len(m1) == len(m2)
    # pairwise L1 distances, for both sign conventions of m2
    dist1 = cdist(m1, m2, 'minkowski', p=1)
    dist2 = cdist(m1, -m2, 'minkowski', p=1)
    neg = list(zip(*np.where(dist1 > dist2)))  # pairs where the negated row is closer
    dist = np.minimum(dist1, dist2)
    # minimum-cost perfect matching between the rows of m1 and m2
    m = Munkres()
    matching = m.compute(dist.copy())
    total = 0.0
    for row, column in matching:
        total += dist[row][column]
    return total, matching, neg
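Toy usage (my own data, assuming the munkres package is installed): m2 is m1 with rows permuted and one row sign-flipped; the matching recovers the permutation at zero cost.

import numpy as np
from munkres import Munkres  # dependency of min_l1_dist above
from scipy.spatial.distance import cdist

m1 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
m2 = np.array([[5.0, 6.0], [-1.0, -2.0], [3.0, 4.0]])  # permuted, row 0 of m1 negated

total, matching, neg = min_l1_dist(m1, m2)
print(total)     # 0.0: every row has an exact (possibly sign-flipped) partner
print(matching)  # expected: [(0, 1), (1, 2), (2, 0)]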
def _nneighbor(files):
    import glob
    import os
    import h5py as _h5py
    import numpy as np
    from scipy.spatial import distance
    paths = glob.glob(files)
    if paths:
        from . import io, postprocess
        for path in paths:
            print('Loading {} ...'.format(path))
            with _h5py.File(path, 'r') as locs_file:
                locs = locs_file['clusters'][...]
            clusters = np.rec.array(locs, dtype=locs.dtype)
            points = np.array(clusters[['com_x', 'com_y']].tolist())
            # nearest-neighbor distance for each cluster center
            alldist = distance.cdist(points, points)
            alldist[alldist == 0] = float('inf')  # ignore self-distances
            minvals = np.amin(alldist, axis=0)
            base, ext = os.path.splitext(path)
            out_path = base + '_minval.txt'
            np.savetxt(out_path, minvals, newline='\r\n')
            print('Saved file to: {}'.format(out_path))
def _assign(self, X, update_class_attributes=True):
    if self.metric == "euclidean":
        dists = cdist(X.reshape((X.shape[0], -1)),
                      self.cluster_centers_.reshape((self.n_clusters, -1)),
                      metric="euclidean")
    elif self.metric == "dtw":
        dists = cdist_dtw(X, self.cluster_centers_)
    elif self.metric == "softdtw":
        dists = cdist_soft_dtw(X, self.cluster_centers_, gamma=self.gamma_sdtw)
    else:
        raise ValueError("Incorrect metric: %s (should be one of 'dtw', 'softdtw', 'euclidean')" % self.metric)
    matched_labels = dists.argmin(axis=1)
    if update_class_attributes:
        self.labels_ = matched_labels
        _check_no_empty_cluster(self.labels_, self.n_clusters)
        if self.dtw_inertia and self.metric != "dtw":
            inertia_dists = cdist_dtw(X, self.cluster_centers_)
        else:
            inertia_dists = dists
        self.inertia_ = _compute_inertia(inertia_dists, self.labels_, self._squared_inertia)
    return matched_labels
def _distance_matrix(self, X1, X2):
    """
    This private method returns the following matrix:
    :math:`\\boldsymbol{D_{ij}} = \\varphi(\\| \\boldsymbol{x_i} -
    \\boldsymbol{y_j} \\|)`

    :param numpy.ndarray X1: the vector x in the formula above.
    :param numpy.ndarray X2: the vector y in the formula above.
    :return: matrix: the matrix D.
    :rtype: numpy.ndarray
    """
    matrix = cdist(
        X1, X2, lambda x, y: self.basis(x - y, self.parameters.radius)
    )
    return matrix
def estimate_ls(self, X):
    Ntrain = X.shape[0]
    if Ntrain < 10000:
        X1 = np.copy(X)
    else:
        # subsample large training sets before the O(N^2) distance computation
        randind = np.random.permutation(Ntrain)
        X1 = X[randind[0:(5 * self.no_pseudos[0])], :]
    D = X1.shape[1]
    N = X1.shape[0]
    triu_ind = np.triu_indices(N)
    ls = np.zeros((D,))
    for i in range(D):
        # per-dimension lengthscale: log of the median pairwise distance
        X1i = np.reshape(X1[:, i], (N, 1))
        d2i = cdist(X1i, X1i, 'euclidean')
        d2imed = np.median(d2i[triu_ind])
        ls[i] = np.log(d2imed + 1e-16)
    return ls
def estimate_ls_temp(self, X):
    Ntrain = X.shape[0]
    if Ntrain < 10000:
        X1 = np.copy(X)
    else:
        # subsample large training sets before the O(N^2) distance computation
        randind = np.random.permutation(Ntrain)
        X1 = X[randind[0:(5 * self.no_pseudos[0])], :]
    dist = cdist(X1, X1, 'euclidean')
    D = X1.shape[1]
    N = X1.shape[0]
    triu_ind = np.triu_indices(N)
    ls = np.zeros((D,))
    # one shared lengthscale: log of the median pairwise distance over all dimensions
    d2imed = np.median(dist[triu_ind])
    for i in range(D):
        ls[i] = np.log(d2imed + 1e-16)
    return ls
def avg_within_ss(X, k):
    """
    Compute the average within-cluster sum of squares. The code here can be
    found "almost" anywhere online.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: average within-cluster sum of squares
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    centroids = model.cluster_centers_
    # distance of every observation to its nearest centroid
    dist_c = cdist(X, centroids, 'euclidean')
    dist = np.min(dist_c, axis=1)
    avgwithinss = sum(dist**2) / X.shape[0]
    return avgwithinss
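This function is typically used in an elbow plot. A minimal sketch (my own, assuming scikit-learn is available):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from scipy.spatial.distance import cdist

rng = np.random.default_rng(3)
# two well-separated blobs
X = np.vstack([rng.normal(0, 0.5, (100, 2)), rng.normal(5, 0.5, (100, 2))])

for k in range(1, 6):
    print(k, avg_within_ss(X, k))  # the drop flattens sharply after k=2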
def EvalSHD(data, model):
    dataNum, dataDim = data.shape
    distToAnchors = cdist(model["anchorData"], data, 'euclidean')
    sigmaRBF = model["sigmaRBF"]
    phi = npy.exp(-npy.square(distToAnchors) / sigmaRBF / sigmaRBF / 2)
    matP = model["matProj"]
    projData = npy.dot(matP.T, phi)
    # binarize the projections into a {-1, +1} code
    code = npy.ones(projData.shape, dtype=npy.int32)
    code[projData < 0] = -1
    # integer division so the byte count stays an int under Python 3
    if 0 == code.shape[0] % 8:
        codeByteNum = code.shape[0] // 8
    else:
        codeByteNum = 1 + code.shape[0] // 8
    # pack 8 bits per byte into a compact binary code
    compactCode = npy.zeros((code.shape[1], codeByteNum), dtype=npy.uint8)
    for kk in range(code.shape[0]):
        idxByte = kk // 8
        idxBit = kk % 8
        compactCode[code[kk, :] == 1, idxByte] += (1 << idxBit)
    return compactCode
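The packing loop above can be cross-checked against NumPy's built-in bit packer; a small comparison of my own (assuming NumPy >= 1.17 for the bitorder argument):

import numpy as np

rng = np.random.default_rng(4)
code = np.where(rng.random((12, 3)) < 0.5, -1, 1)  # 12-bit codes for 3 samples

# manual packing as in EvalSHD: bit kk of a sample lands in byte kk // 8
n_bytes = (code.shape[0] + 7) // 8
manual = np.zeros((code.shape[1], n_bytes), dtype=np.uint8)
for kk in range(code.shape[0]):
    manual[code[kk, :] == 1, kk // 8] += (1 << (kk % 8))

packed = np.packbits((code == 1).astype(np.uint8).T, axis=1, bitorder='little')
assert np.array_equal(manual, packed)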
def recommendNext(self, user_id, current_loc):
    # indices should start from zero
    indices = user_id - 1
    # U x Z
    Theta = self.Theta[indices]
    # Z x I
    Phi = self.Phi
    beta = self.beta
    # U x I
    dists = np.exp(-0.5 * beta * np.square(cdist(current_loc, self.loc_info)))
    # Equation (8)
    # Z x U x I
    P_izu = np.expand_dims(np.exp(Phi), axis=1) * np.expand_dims(dists, axis=0)
    P_izu = P_izu / np.expand_dims(np.sum(P_izu, axis=2), axis=2)
    # Equation (9)
    # U x I
    P_iu = np.expand_dims(Theta.T, axis=2) * P_izu
    P_iu = np.sum(P_iu, axis=0)
    return P_iu