def gradient(self, x):
    d = self._distances
    if d is not None and np.ndim(d) == 1:
        d = squareform(d)
    return np.sum([self.params[k] * self.features[k].gradient(x, d)
                   for k in range(self.K)], axis=0)
Example source code for Python's squareform()
def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    # Pairwise distance matrix; entries >= epsilon are zeroed, which drops the
    # corresponding edges when the matrix is read as a weighted adjacency matrix.
    D = squareform(pdist(data, metric=metric, p=p))
    D[D >= epsilon] = 0.
    G = nx.Graph(D)
    edges = list(map(set, G.edges()))
    weights = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]
    return G.nodes(), edges, weights
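A quick usage sketch on random 2-D points (the numpy, networkx and scipy imports are assumptions about the snippet's surrounding context):

import numpy as np
import networkx as nx
from scipy.spatial.distance import pdist, squareform

pts = np.random.rand(30, 2)
nodes, edges, weights = buildGraph(pts, epsilon=0.3)
print(len(nodes), 'nodes,', len(edges), 'edges')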
def cluster(target_sequence_ids, fasta_filename, method='average'):
""" Form distance-based hierachical clustering of sequences.
Looks up each entry in target_sequence_ids in the file
specified by fasta_filename to obtain an associated DNA
sequence.
In principle, we could just work with the Hamming distance, but
    the sequences may be of different lengths (mostly small
    differences). So we need a more sophisticated approach: we use
pairwise global alignment, scoring 0 for a match, -1 for mismatch,
and -1.5 for opening or extending a gap. We then take the distance
to be -1.0*(score).
UPGMA clustering is used when method='average', the default.
Returns the distance matrix and the linkage matrix returned
by the clustering routine.
"""
# globalms arguments: seq1, seq2, match, mismatch, open, extend
distance = lambda seq1, seq2: -1.0*(
pairwise2.align.globalms(seq1,seq2,0,-1,-1.5,-1.5, score_only=True)
)
sequences = fasta_to_dict(fasta_filename)
N = len(target_sequence_ids)
distances = np.zeros((N,N))
# fill in the upper triangle
for i,seqid1 in enumerate(target_sequence_ids):
seq1 = sequences[seqid1]
for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
j = j_offset + i + 1
seq2 = sequences[seqid2]
            distances[i, j] = distance(seq1, seq2)
    # convert to the condensed form expected by the scipy clustering routines;
    # checks=False because only the upper triangle was filled in
    y = squareform(distances, checks=False)
    return distances, hierarchy.linkage(y, method)
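A hedged usage sketch; Biopython's pairwise2 and the fasta_to_dict helper are assumed to be imported or defined elsewhere in the module, and the file name and IDs below are hypothetical:

from scipy.cluster import hierarchy

ids = ['seqA', 'seqB', 'seqC']           # hypothetical sequence IDs in the FASTA file
dist, Z = cluster(ids, 'reads.fasta')    # hypothetical file name
print(hierarchy.dendrogram(Z, no_plot=True)['ivl'])  # leaf order of the UPGMA tree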
Source: transformation_tests_func.py, project 3D_Dense_Transformer_Networks, author JohnYC1995.
def makeT(self, cp):
    # cp: [(k*k*k) x 3] control points
    # T: [((k*k*k)+4) x ((k*k*k)+4)]
    K = cp.shape[0]
    T = np.zeros((K+4, K+4))
    T[:K, 0] = 1
    T[:K, 1:4] = cp
    T[K, 4:] = 1
    T[K+1:, 4:] = cp.T
    R = squareform(pdist(cp, metric='euclidean'))
    R = R * R
    R[R == 0] = 1  # avoid log(0): log(1) = 0, so these entries of R*log(R) vanish
    R = R * np.log(R)
    np.fill_diagonal(R, 0)
    T[:K, 4:] = R
    return T
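makeT assembles the standard thin-plate-spline system matrix: an affine block [1 | cp] plus the radial block U(r) = r^2 log r. A minimal sketch of how such a matrix is typically solved for spline coefficients (standalone illustration, not from the project; note the body above never touches self, so passing None works):

import numpy as np
from scipy.spatial.distance import pdist, squareform

cp = np.random.rand(8, 3)                # control points
dst = cp + 0.05 * np.random.randn(8, 3)  # slightly displaced targets
T = makeT(None, cp)                      # self is unused in the body above
b = np.zeros((cp.shape[0] + 4, 3))
b[:cp.shape[0]] = dst                    # last 4 rows stay 0 (TPS side conditions)
coef = np.linalg.solve(T, b)             # coef[:4] affine part, coef[4:] RBF weights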
def n1_fraction_borderline(data):
def get_n1_for_round(sparse_matrix, y):
Tcsr = minimum_spanning_tree(sparse_matrix)
borders = set()
a = Tcsr.nonzero()[0]
b = Tcsr.nonzero()[1]
for i in range(len(a)):
if (y[a[i]] != y[b[i]]):
borders.add(a[i])
borders.add(b[i])
n1 = len(borders)
return n1
    features = data.columns[:-1]
dist = pdist(data[features], 'euclidean')
df_dist = pd.DataFrame(squareform(dist))
sparse_matrix = csr_matrix(df_dist.values)
labels = data.columns[-1]
y = data[labels]
n1 = 0
rounds = 10
    for _ in range(rounds):
        n1 = n1 + get_n1_for_round(sparse_matrix, y)
n = len(data)
n1 = (1.0 * n1) / (rounds * n)
return n1
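A toy usage sketch (pdist, squareform, csr_matrix, minimum_spanning_tree and pandas are assumptions about the snippet's context; the label column must come last, as the function expects):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(np.vstack([rng.normal(0, 1, (20, 2)),
                             rng.normal(4, 1, (20, 2))]), columns=['f1', 'f2'])
df['label'] = [0] * 20 + [1] * 20
print(n1_fraction_borderline(df))   # close to 0 for two well-separated blobs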
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):
    features = data.columns[:-1]
labels = data.columns[-1]
dist = pdist(data[features], 'euclidean')
df_dist = pd.DataFrame(squareform(dist))
    max_size = df_dist.copy()
    max_size.iloc[:, :] = False
    classes = data.iloc[:, -1].unique()
n = data.shape[0]
n2 = 0
cl = 'bla'
intra_min = 0
inter_min = 0
for i in range(data.shape[0]):
ci = data.iloc[i, -1]
if ci != cl:
cl = ci
intra_idx = data[data[labels] == ci].index.values.tolist()
inter_idx = data[data[labels] != ci].index.values
intra_idx.remove(i)
intra_min = intra_min + df_dist.iloc[intra_idx, i].min()
inter_min = inter_min + df_dist.iloc[inter_idx, i].min()
intra_idx.append(i)
    # handle the case inter_min == 0
    if inter_min == 0:
        inter_min = 1
n2 = (1.0 * intra_min) / (1.0 * inter_min)
return n2
def start_clustering(self):
functions.log('Calculate {0} distances...'.format(int(len(self.orfs) * (len(self.orfs) + 1) / 2)))
self.distances = self.create_distance_matrix()
functions.log('Start clustering...')
self.linkage_matrix = scipy.cluster.hierarchy.linkage(ssd.squareform(self.distances), method='complete')
functions.log('Clustering done.')
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)
    # Determine nodes within distance, replace by 1 (=adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix <= distance, 1, 0)
    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n, 3))
    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels == cluster_i)]
        center = np.mean(points, axis=0)
        new_candidates[cluster_i, :] = center
    x = new_candidates[:, 0]
    y = new_candidates[:, 1]
    z = new_candidates[:, 2]
    labels = [seriesuid] * len(x)
    class_name = [0] * len(x)
    data = list(zip(labels, x, y, z, class_name))
    new_candidates = pd.DataFrame(data, columns=CANDIDATES_COLUMNS)
    return new_candidates
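A hedged usage sketch with synthetic detections; CANDIDATES_COLUMNS is defined elsewhere in the project, so the column names below are only a guess for illustration:

import numpy as np

CANDIDATES_COLUMNS = ['seriesuid', 'coordX', 'coordY', 'coordZ', 'class']  # assumed
cands = np.array([[0., 0., 0.], [1., 1., 1.], [50., 50., 50.]])
merged = merge_candidates_scan(cands, seriesuid='1.2.3', distance=5.)
print(merged)   # the two nearby points collapse into one averaged candidate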
def precompute_kernels(self, q) :
"""
Returns a tuple of kernel, kernel', kernel'' matrices at position q.
"""
x = q.reshape((self.npoints, self.dimension))
dists = squareform(pdist(x, 'sqeuclidean'))
K = exp(- dists / (2* self.kernel_scale ** 2))
return ( K,
- K / (2* self.kernel_scale ** 2),
K / (4* self.kernel_scale ** 4))
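The two derivative matrices follow from differentiating k(r) = exp(-r / (2 sigma^2)) with respect to the squared distance r. A standalone finite-difference check of those factors (sketch with hypothetical values):

import numpy as np

s, r, h = 1.3, 0.7, 1e-4
k = lambda r: np.exp(-r / (2 * s ** 2))
dk = -k(r) / (2 * s ** 2)    # matches the second matrix returned above
d2k = k(r) / (4 * s ** 4)    # matches the third
assert abs((k(r + h) - k(r - h)) / (2 * h) - dk) < 1e-6
assert abs((k(r + h) - 2 * k(r) + k(r - h)) / h ** 2 - d2k) < 1e-6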
def dq_Kqp_a(self,q,p,a, kernels) :
"""
Useful for the adjoint integration scheme.
d_q (K_q p) . a = ...
"""
h = 1e-8
Q0phA = q + h*a
Q0mhA = q - h*a
update_emp = ( Landmarks.K(self, Q0phA, p, Landmarks.precompute_kernels(self, Q0phA))
- Landmarks.K(self, Q0mhA, p, Landmarks.precompute_kernels(self, Q0mhA))) / (2*h)
return update_emp
"""x = q.reshape((self.npoints, self.dimension))
p = p.reshape((self.npoints, self.dimension))
a = a.reshape((self.npoints, self.dimension))
dists = squareform(pdist(x, 'sqeuclidean')) # dists_ij = |x_i-x_j|^2
# We have :
# [K_q p]_nd = sum_j { k(|x_n - x_j|^2) * p_j^d }
#
# So that :
# grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
grad = zeros((self.npoints, self.dimension))
for d in range(self.dimension) :
diffs = atleast_2d(x[:,d]).T - x[:,d] # diffs_ij = x_i^d - x_j^d
# K_ij = 2 * (x_i^d - x_j^d) * k'(|x_i - x_j|^2) * p_j^d
        K = 2 * diffs * kernels[1] * p[:,d]
# grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
grad[:,d] = a[:,d] * sum( K , 1 )
return grad.reshape((self.npoints * self.dimension,))"""
Source: ppdb_utils.py, project Learning-sentence-representation-with-guidance-of-human-attention, author wangshaonan.
def getPairsFast(d, type):
X = []
T = []
pairs = []
for i in range(len(d)):
(p1,p2) = d[i]
X.append(p1.representation)
X.append(p2.representation)
T.append(p1)
T.append(p2)
arr = pdist(X,'cosine')
    arr = squareform(arr)
    # mask each example's self-distance and its gold pairing so that argmin
    # returns the nearest *other* example (an approximate hardest negative)
    for i in range(len(arr)):
        arr[i, i] = 1
        if i % 2 == 0:
            arr[i, i + 1] = 1
        else:
            arr[i, i - 1] = 1
    arr = np.argmin(arr, axis=1)
for i in range(len(d)):
(t1,t2) = d[i]
p1 = None
p2 = None
if type == "MAX":
p1 = T[arr[2*i]]
p2 = T[arr[2*i+1]]
if type == "RAND":
p1 = getPairRand(d,i)
p2 = getPairRand(d,i)
if type == "MIX":
p1 = getPairMixScore(d,i,T[arr[2*i]])
p2 = getPairMixScore(d,i,T[arr[2*i+1]])
pairs.append((p1,p2))
return pairs
def cao_juan_2009(topic_term_dists, num_topics):
cos_pdists = squareform(pdist(topic_term_dists, metric='cosine'))
return np.sum(cos_pdists) / (num_topics*(num_topics - 1)/2)
def deveaud_2014(topic_term_dists, num_topics):
jsd_pdists = squareform(pdist(topic_term_dists, metric=jensen_shannon))
return np.sum(jsd_pdists) / (num_topics*(num_topics - 1))
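The jensen_shannon metric is assumed to be defined elsewhere in the module; pdist calls it on pairs of topic-term rows. A minimal sketch of one common definition (Jensen-Shannon divergence between two discrete distributions):

import numpy as np

def jensen_shannon(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda a, b: np.sum(a * np.log(a / b))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)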
def compute_distmat(self, dataframe):
"""
Computes the pairwise euclidean distances between every atom.
Design choice: passed in a DataFrame to enable easier testing on
dummy data.
"""
self.eucl_dists = pdist(dataframe[['x', 'y', 'z']],
metric='euclidean')
self.eucl_dists = pd.DataFrame(squareform(self.eucl_dists))
self.eucl_dists.index = dataframe.index
self.eucl_dists.columns = dataframe.index
return self.eucl_dists
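Sketch of a call on dummy coordinates; the surrounding class is not shown, so a minimal stand-in binds the method here:

import numpy as np
import pandas as pd

class _Dummy:
    compute_distmat = compute_distmat   # reuse the method defined above

df = pd.DataFrame(np.random.rand(5, 3), columns=['x', 'y', 'z'])
print(_Dummy().compute_distmat(df).round(2))   # symmetric 5x5 distance matrix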
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
"""Compute the ratio of the avg distance of points within an equivalence class vs the avg distance between all points"""
data = import_data(data_filename)
encodings = []
equivalence_sets = []
for name, code in data.items():
idx = len(encodings)
enc = encoder.get_encoding(code['original'])
assert not np.isnan(np.sum(enc))
encodings.append(enc)
for noisy_sample in code['noise']:
enc = encoder.get_encoding(noisy_sample)
assert not np.isnan(np.sum(enc))
encodings.append(enc)
equivalence_sets.append(set(range(idx, len(encodings))))
encodings = np.array(encodings)
all_distances = squareform(pdist(encodings, 'cosine')) # TODO: avoid square form somehow
assert not np.any(np.isnan(all_distances))
# Average the lower triangle of all_distances
avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / (len(encodings) * (len(encodings) - 1) / 2)
sum_distance_within_eq_class = 0.
num_pairs = 0
for equiv_class_idxs in equivalence_sets:
num_elements_in_class = len(equiv_class_idxs)
if num_elements_in_class < 2:
continue
elems_in_eq_class = np.fromiter(equiv_class_idxs, dtype=np.int32)
sum_distance_within_eq_class += np.sum(np.tril(all_distances[elems_in_eq_class][:, elems_in_eq_class], k=-1))
num_pairs += num_elements_in_class * (num_elements_in_class - 1) / 2
avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
if print_stats:
print(
"Within Avg Dist: %s All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
return avg_distance_between_all_points / avg_distance_within_eq_class
def sort_points_to_line(vertices, start = 0):
"""Sorts points to a line by sequentiall connectoing nearest points
Arguments:
vertices (nx2 array): vertices of the line
start (int): start index
Returns:
nx2 array: sorted points
"""
    d = squareform(pdist(vertices))
    i = start
    n = vertices.shape[0]
    uidx = np.ones(n, dtype=bool)
    uidx[i] = False
    sidx = [i]
    while np.sum(uidx) > 0:
        i = np.argmin(d[i][uidx])
        i = np.where(uidx)[0][i]
        sidx.append(i)
        uidx[i] = False
    return vertices[sidx]
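Usage sketch: recover the order of shuffled samples along a curve (illustrative only; nearest-neighbour chaining assumes neighbouring samples lie closer together than distinct folds of the curve):

import numpy as np

t = np.linspace(0, 2 * np.pi, 25)
curve = np.stack([t, np.sin(t)], axis=1)
shuffled = curve[np.random.permutation(len(curve))]
start = int(np.argmin(np.linalg.norm(shuffled - curve[0], axis=1)))
ordered = sort_points_to_line(shuffled, start=start)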
Source: corrneighbours.py, project Building-Machine-Learning-Systems-With-Python-Second-Edition, author PacktPublishing.
def predict(otrain):
binary = (otrain > 0)
norm = NormalizePositive(axis=1)
train = norm.fit_transform(otrain)
dists = distance.pdist(binary, 'correlation')
dists = distance.squareform(dists)
neighbors = dists.argsort(axis=1)
filled = train.copy()
for u in range(filled.shape[0]):
# n_u are the neighbors of user
n_u = neighbors[u, 1:]
for m in range(filled.shape[1]):
            # This code could be faster using numpy indexing trickery, at the
            # cost of readability (left as an exercise to the reader):
revs = [train[neigh, m]
for neigh in n_u
if binary[neigh, m]]
if len(revs):
n = len(revs)
n //= 2
n += 1
revs = revs[:n]
filled[u,m] = np.mean(revs)
return norm.inverse_transform(filled)
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X=Y, use more efficient pdist call which exploits symmetry
    normX = reshape(np.linalg.norm(X, axis=1), (len(X), 1))
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
        normY = normX.T
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert(shape(X)[1] == shape(Y)[1])
        normY = reshape(np.linalg.norm(Y, axis=1), (1, len(Y)))
        dists = cdist(X, Y, 'euclidean')
    # K(x,y) = 0.5 * (|x|^alpha + |y|^alpha - |x-y|^alpha), the covariance
    # kernel of fractional Brownian motion
    K = 0.5 * (normX ** self.alpha + normY ** self.alpha - dists ** self.alpha)
    return K
def kernel(self, X, Y=None):
"""
    Computes the hypercube kernel k(x,y) = tanh(gamma)^d(x,y), where d is the
    Hamming distance between x and y
    X - 2d numpy.bool8 array, samples on right hand side
    Y - 2d numpy.bool8 array, samples on left hand side.
        Can be None in which case it is replaced by X
"""
if not type(X) is numpy.ndarray:
raise TypeError("X must be numpy array")
if not len(X.shape) == 2:
raise ValueError("X must be 2D numpy array")
if not X.dtype == numpy.bool8:
raise ValueError("X must be boolean numpy array")
    if Y is not None:
if not type(Y) is numpy.ndarray:
raise TypeError("Y must be None or numpy array")
if not len(Y.shape) == 2:
raise ValueError("Y must be None or 2D numpy array")
if not Y.dtype == numpy.bool8:
raise ValueError("Y must be boolean numpy array")
if not X.shape[1] == Y.shape[1]:
raise ValueError("X and Y must have same dimension if Y is not None")
# un-normalise normalised hamming distance in both cases
if Y is None:
K = tanh(self.gamma) ** squareform(pdist(X, 'hamming') * X.shape[1])
else:
K = tanh(self.gamma) ** (cdist(X, Y, 'hamming') * X.shape[1])
return K
def kernel(self, X, Y=None):
"""
Computes the standard Gaussian kernel k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2)
X - 2d numpy.ndarray, first set of samples:
number of rows: number of samples
number of columns: dimensionality
Y - 2d numpy.ndarray, second set of samples, can be None in which case its replaced by X
"""
    if self.is_sparse:
        X = X.todense()
        if Y is not None:
            Y = Y.todense()
GenericTests.check_type(X, 'X',np.ndarray)
assert(len(shape(X))==2)
# if X=Y, use more efficient pdist call which exploits symmetry
if Y is None:
sq_dists = squareform(pdist(X, 'sqeuclidean'))
else:
GenericTests.check_type(Y, 'Y',np.ndarray)
assert(len(shape(Y))==2)
assert(shape(X)[1]==shape(Y)[1])
sq_dists = cdist(X, Y, 'sqeuclidean')
K = exp(-0.5 * (sq_dists) / self.width ** 2)
return K
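A short sanity check (sketch; k stands for a hypothetical instance of the class above with width set and is_sparse False):

import numpy as np

X = np.random.rand(5, 3)
K = k.kernel(X)
assert np.allclose(K, K.T)            # symmetric
assert np.allclose(np.diag(K), 1.0)   # unit diagonal: exp(0) = 1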