import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool

def plot_bokeh(df, sublist, filename):
    # concatenate the rows for each requested cuisine and record the
    # cumulative row counts so each cuisine's points can be coloured later
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)
    dist = squareform(pdist(df_X, metric='cosine'))
    # init='random' is required with a precomputed metric in newer scikit-learn
    tsne = TSNE(metric='precomputed', init='random').fit_transform(dist)
    # cannot use a seaborn palette for bokeh
    palette = ['red', 'green', 'blue', 'yellow']
    colors = []
    for i in range(len(sublist)):
        for j in range(lenlist[i + 1] - lenlist[i]):
            colors.append(palette[i])
    # plot with bokeh (bokeh >= 3 renames plot_width/plot_height to width/height)
    output_file(filename)
    source = ColumnDataSource(
        data=dict(x=tsne[:, 0], y=tsne[:, 1],
                  cuisine=df_sub['cuisine'],
                  recipe=df_sub['recipeName']))
    hover = HoverTool(tooltips=[
        ("cuisine", "@cuisine"),
        ("recipe", "@recipe")])
    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")
    p.circle('x', 'y', size=10, source=source, fill_color=colors)
    show(p)
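A minimal usage sketch, assuming the bokeh 2.x API and the imports above; the flavor columns, cuisine names, and output file name are made up for illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.random((60, 5)),
                   columns=['flavor_%d' % i for i in range(5)])
toy['cuisine'] = ['italian'] * 20 + ['mexican'] * 20 + ['indian'] * 20
toy['recipeName'] = ['recipe_%d' % i for i in range(60)]

plot_bokeh(toy, ['italian', 'mexican', 'indian'], 'flavor_clusters.html')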
Python pdist() example source code
import networkx as nx
from scipy.spatial.distance import pdist, squareform

def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    # p only applies to the 'minkowski' metric, so pass it through only then
    kwargs = {'p': p} if metric == 'minkowski' else {}
    D = squareform(pdist(data, metric=metric, **kwargs))
    # zero out distances at or above epsilon; zeros become non-edges
    D[D >= epsilon] = 0.
    # nx.Graph(D) accepted an adjacency array in networkx 1.x;
    # from_numpy_array is the networkx 2.x+ equivalent
    G = nx.from_numpy_array(D)
    edges = list(map(set, G.edges()))
    weights = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]
    return G.nodes(), edges, weights
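A usage sketch with made-up points: an epsilon-neighbourhood graph over 20 random 2-D points, with an arbitrarily chosen epsilon.

import numpy as np

pts = np.random.rand(20, 2)
nodes, edges, weights = buildGraph(pts, epsilon=0.3)
print(len(nodes), len(edges))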
def is_satisfied(self, gcell):
    # diagonal scale matrix: a 2x2x2 supercell
    scale = np.array([[2, 0, 0],
                      [0, 2, 0],
                      [0, 0, 2]])
    super_gcell = gcell.supercell(scale)
    # target_cart is a np array of the target element's cartesian coordinates
    target_cart = super_gcell.get_cartesian(ele=self.target_ele)
    mindist = np.min(pdist(target_cart))
    is_ok = mindist > self.target_dist
    return is_ok
import scipy.cluster.hierarchy as hier
import scipy.spatial.distance as dist

def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat
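A usage sketch: reorder a random matrix by hierarchical-clustering leaf order on both axes (the data here is made up).

import numpy as np
import pandas as pd

mat = pd.DataFrame(np.random.rand(8, 6))
ordered, row_lm, col_lm = cluster(mat)
print(ordered.index.tolist(), ordered.columns.tolist())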
From transformation_tests_func.py (project: 3D_Dense_Transformer_Networks, author: JohnYC1995)
def makeT(self, cp):
    # cp: [(k*k*k) x 3] control points
    # T: [((k*k*k)+4) x ((k*k*k)+4)] thin-plate-spline system matrix
    K = cp.shape[0]
    T = np.zeros((K + 4, K + 4))
    T[:K, 0] = 1
    T[:K, 1:4] = cp
    T[K, 4:] = 1
    T[K + 1:, 4:] = cp.T
    R = squareform(pdist(cp, metric='euclidean'))
    R = R * R
    R[R == 0] = 1  # a trick to make R * ln(R) zero at zero distance
    R = R * np.log(R)
    np.fill_diagonal(R, 0)
    T[:K, 4:] = R
    return T
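A quick sanity check, using the corners of a unit cube as the k*k*k control points with k = 2; since self is unused in the body, None suffices for a standalone call.

import numpy as np
from itertools import product

cp = np.array(list(product([0., 1.], repeat=3)))  # the 8 cube corners
T = makeT(None, cp)
print(T.shape)  # (12, 12), i.e. (k*k*k + 4) on each side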
from scipy.spatial import distance

def coherence(U, m):
    # random_phi is assumed to be defined elsewhere in the project
    Phi = random_phi(m, U.shape[0])
    PU = Phi.dot(U)
    d = distance.pdist(PU.T, 'cosine')
    # cosine distance is 1 - cosine similarity, so 1 - d recovers the similarity
    return abs(1 - d)
import numpy as np
from scipy.spatial import distance
from scipy.stats import pearsonr, spearmanr

def compare_distances(A, B, random_samples=[], s=200, pvalues=False):
    if len(random_samples) == 0:
        # np.bool was removed in numpy >= 1.24; plain bool is equivalent here
        random_samples = np.zeros(A.shape[1], dtype=bool)
        random_samples[:min(s, A.shape[1])] = True
        np.random.shuffle(random_samples)
    dist_x = distance.pdist(A[:, random_samples].T, 'euclidean')
    dist_y = distance.pdist(B[:, random_samples].T, 'euclidean')
    pear = pearsonr(dist_x, dist_y)
    spear = spearmanr(dist_x, dist_y)
    if pvalues:
        return pear, spear
    else:
        return pear[0], spear[0]
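A usage sketch with toy data: a matrix and a slightly perturbed copy should give distance correlations close to 1 (columns are samples, hence the .T inside).

import numpy as np

A = np.random.rand(10, 40)
B = A + 0.01 * np.random.rand(10, 40)
print(compare_distances(A, B))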
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import pdist, squareform

def n1_fraction_borderline(data):
    def get_n1_for_round(sparse_matrix, y):
        # count the vertices of MST edges whose endpoints have different labels
        Tcsr = minimum_spanning_tree(sparse_matrix)
        borders = set()
        a = Tcsr.nonzero()[0]
        b = Tcsr.nonzero()[1]
        for i in range(len(a)):
            if y[a[i]] != y[b[i]]:
                borders.add(a[i])
                borders.add(b[i])
        n1 = len(borders)
        return n1

    features = data.columns[:-1]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    sparse_matrix = csr_matrix(df_dist.values)
    labels = data.columns[-1]
    y = data[labels]
    n1 = 0
    rounds = 10
    for _ in range(rounds):
        n1 = n1 + get_n1_for_round(sparse_matrix, y)
    n = len(data)
    n1 = (1.0 * n1) / (rounds * n)
    return n1
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):
    features = data.columns[:-1]
    labels = data.columns[-1]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    cl = None
    intra_min = 0
    inter_min = 0
    for i in range(data.shape[0]):
        ci = data.iloc[i, -1]
        if ci != cl:
            # recompute the index lists only when the class changes
            cl = ci
            intra_idx = data[data[labels] == ci].index.values.tolist()
            inter_idx = data[data[labels] != ci].index.values
        # exclude the point itself from its intra-class neighbours
        intra_idx.remove(i)
        intra_min = intra_min + df_dist.iloc[intra_idx, i].min()
        inter_min = inter_min + df_dist.iloc[inter_idx, i].min()
        intra_idx.append(i)
    # handle the case inter_min == 0
    if inter_min == 0:
        inter_min = 1
    n2 = (1.0 * intra_min) / (1.0 * inter_min)
    return n2
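A toy check for both complexity measures, with made-up data: the last column must be the class label, and rows are grouped by class, which the index caching in the n2 measure exploits.

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = np.vstack([rng.normal(0, 1, (15, 2)), rng.normal(3, 1, (15, 2))])
toy = pd.DataFrame(X, columns=['f1', 'f2'])
toy['label'] = [0] * 15 + [1] * 15
print(n1_fraction_borderline(toy))
print(n2_ratio_intra_extra_class_nearest_neighbor_distance(toy))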
def is_behavior_learning_done(self):
    """Check if the optimization is finished.

    Returns
    -------
    finished : bool
        Is the learning of a behavior finished?
    """
    if self.it <= self.n_samples_per_update:
        return False

    if not np.all(np.isfinite(self.fitness)):
        return True

    # Check for invalid values
    if not (np.all(np.isfinite(self.invsqrtC)) and
            np.all(np.isfinite(self.cov)) and
            np.all(np.isfinite(self.mean)) and
            np.isfinite(self.var)):
        self.logger.info("Stopping: infs or nans")
        return True

    if (self.min_variance is not None and
            np.max(np.diag(self.cov)) * self.var <= self.min_variance):
        self.logger.info("Stopping: %g < min_variance" % self.var)
        return True

    max_dist = np.max(pdist(self.fitness[:, np.newaxis]))
    if max_dist < self.min_fitness_dist:
        self.logger.info("Stopping: %g < min_fitness_dist" % max_dist)
        return True

    cov_diag = np.diag(self.cov)
    if (self.max_condition is not None and
            np.max(cov_diag) > self.max_condition * np.min(cov_diag)):
        self.logger.info("Stopping: %g / %g > max_condition"
                         % (np.max(cov_diag), np.min(cov_diag)))
        return True

    return False
def __call__(self):
    if len(self.words) == 0 or len(self.vectors) == 0:
        return []
    distance_matrix = scidist.pdist(np.array(self.vectors), self.metric)
    linkage_matrix = hier.linkage(distance_matrix, self.linkage)
    dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix, self.words, self.vectors)
    clusterings = self._create_clusterings(dendrogram)
    return [[(node.label, node.vector) for node in _get_cluster_nodes(cluster)]
            for cluster in self._find_optimal_clustering(clusterings)]
from scipy.spatial import distance

def calculate_fitness(feature_vectors):
    # reward spread: mean pairwise distance plus the smallest pairwise distance
    pairwise_euclidean_distances = distance.pdist(feature_vectors, 'euclidean')
    fitness = (pairwise_euclidean_distances.mean() +
               pairwise_euclidean_distances.min())
    return fitness
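A usage sketch with made-up vectors: well-spread rows score higher than near-duplicates.

import numpy as np

spread = np.eye(4)
clumped = np.ones((4, 4)) + 1e-3 * np.random.rand(4, 4)
print(calculate_fitness(spread) > calculate_fitness(clumped))  # True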
import numpy as np
import pandas as pd
from scipy.sparse.csgraph import connected_components
from scipy.spatial.distance import pdist, squareform

def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)
    # Determine nodes within distance, replace by 1 (= adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix <= distance, 1, 0)
    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n, 3))
    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels == cluster_i)]
        center = np.mean(points, axis=0)
        new_candidates[cluster_i, :] = center
    x = new_candidates[:, 0]
    y = new_candidates[:, 1]
    z = new_candidates[:, 2]
    labels = [seriesuid] * len(x)
    class_name = [0] * len(x)
    data = list(zip(labels, x, y, z, class_name))
    new_candidates = pd.DataFrame(data, columns=CANDIDATES_COLUMNS)
    return new_candidates
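A usage sketch; CANDIDATES_COLUMNS is project-specific, so a plausible stand-in is defined here. Two tight clusters of 3-D points collapse to two centroids.

import numpy as np

CANDIDATES_COLUMNS = ['seriesuid', 'coordX', 'coordY', 'coordZ', 'class']
pts = np.vstack([np.random.rand(5, 3), 100 + np.random.rand(5, 3)])
merged = merge_candidates_scan(pts, seriesuid='1.2.3', distance=5.)
print(len(merged))  # 2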
def precompute_kernels(self, q):
    """
    Returns a tuple of kernel, kernel', kernel'' matrices at position q.
    """
    x = q.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))
    K = exp(-dists / (2 * self.kernel_scale ** 2))
    return (K,
            -K / (2 * self.kernel_scale ** 2),
            K / (4 * self.kernel_scale ** 4))
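A sketch exercising the kernels with a stand-in object, assuming numpy's exp, pdist and squareform are in scope as in the original module.

from types import SimpleNamespace
import numpy as np

lm = SimpleNamespace(npoints=5, dimension=2, kernel_scale=1.0)
q = np.random.rand(lm.npoints * lm.dimension)
K, dK, d2K = precompute_kernels(lm, q)
print(K.shape)  # (5, 5)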
def dq_Kqp_a(self, q, p, a, kernels):
    """
    Useful for the adjoint integration scheme.
    d_q (K_q p) . a = ...
    """
    # central finite difference of q -> K_q p in the direction a
    h = 1e-8
    Q0phA = q + h * a
    Q0mhA = q - h * a
    update_emp = (Landmarks.K(self, Q0phA, p, Landmarks.precompute_kernels(self, Q0phA))
                  - Landmarks.K(self, Q0mhA, p, Landmarks.precompute_kernels(self, Q0mhA))) / (2 * h)
    return update_emp
    # analytic version, kept for reference (unreachable after the return):
    """x = q.reshape((self.npoints, self.dimension))
    p = p.reshape((self.npoints, self.dimension))
    a = a.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))  # dists_ij = |x_i-x_j|^2
    # We have:
    #   [K_q p]_nd = sum_j { k(|x_n - x_j|^2) * p_j^d }
    # So that:
    #   grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
    grad = zeros((self.npoints, self.dimension))
    for d in range(self.dimension):
        diffs = atleast_2d(x[:, d]).T - x[:, d]  # diffs_ij = x_i^d - x_j^d
        # K_ij = 2 * (x_i^d - x_j^d) * k'(|x_i - x_j|^2) * p_j^d
        K = 2 * diffs * kernels[1] * p[:, d]
        grad[:, d] = a[:, d] * sum(K, 1)
    return grad.reshape((self.npoints * self.dimension,))"""
From ppdb_utils.py (project: Learning-sentence-representation-with-guidance-of-human-attention, author: wangshaonan)
import numpy as np
from scipy.spatial.distance import pdist, squareform

def getPairsFast(d, type):
    # getPairRand and getPairMixScore are defined elsewhere in the project
    X = []
    T = []
    pairs = []
    for i in range(len(d)):
        (p1, p2) = d[i]
        X.append(p1.representation)
        X.append(p2.representation)
        T.append(p1)
        T.append(p2)
    arr = pdist(X, 'cosine')
    arr = squareform(arr)
    # mask the diagonal and each example's own partner so argmin returns
    # the nearest *other* sentence
    for i in range(len(arr)):
        arr[i, i] = 1
        if i % 2 == 0:
            arr[i, i + 1] = 1
        else:
            arr[i, i - 1] = 1
    arr = np.argmin(arr, axis=1)
    for i in range(len(d)):
        (t1, t2) = d[i]
        p1 = None
        p2 = None
        if type == "MAX":
            p1 = T[arr[2 * i]]
            p2 = T[arr[2 * i + 1]]
        if type == "RAND":
            p1 = getPairRand(d, i)
            p2 = getPairRand(d, i)
        if type == "MIX":
            p1 = getPairMixScore(d, i, T[arr[2 * i]])
            p2 = getPairMixScore(d, i, T[arr[2 * i + 1]])
        pairs.append((p1, p2))
    return pairs
import numpy as np
from scipy.spatial.distance import pdist, squareform

def cao_juan_2009(topic_term_dists, num_topics):
    # pairwise cosine distances, normalised by the number of topic pairs
    cos_pdists = squareform(pdist(topic_term_dists, metric='cosine'))
    return np.sum(cos_pdists) / (num_topics * (num_topics - 1) / 2)

def deveaud_2014(topic_term_dists, num_topics):
    # jensen_shannon is a project-defined metric function
    jsd_pdists = squareform(pdist(topic_term_dists, metric=jensen_shannon))
    return np.sum(jsd_pdists) / (num_topics * (num_topics - 1))
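A usage sketch for the cosine-based metric (deveaud_2014 additionally needs the project's jensen_shannon function); the topic matrix is made up, with rows normalized to sum to 1.

import numpy as np

rng = np.random.default_rng(2)
topics = rng.random((4, 10))
topics /= topics.sum(axis=1, keepdims=True)
print(cao_juan_2009(topics, num_topics=4))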
import numpy as np
from scipy.spatial.distance import pdist

def check_embed_match(X_embed1, X_embed2):
    """
    Check whether two embeddings are almost the same by computing their
    normalized euclidean distances in the embedding space and checking the
    correlation.

    Inputs:
        - X_embed1, X_embed2: two Nxd matrices with coordinates in the embedding space

    Returns:
        - r: Pearson correlation coefficient between the normalized distances of the points
    """
    D_emb1 = pdist(X_embed1, 'euclidean')
    D_emb2 = pdist(X_embed2, 'euclidean')
    D_emb1 /= D_emb1.max()
    D_emb2 /= D_emb2.max()
    return np.corrcoef(D_emb1, D_emb2)[0, 1]
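A quick check with made-up data: a scaled and shifted copy has identical normalized distances, so the correlation should be essentially 1.

import numpy as np

X = np.random.rand(50, 2)
print(check_embed_match(X, 3.0 * X + 1.0))  # ~1.0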
from numpy import median, sqrt
from numpy.random import choice
from scipy.spatial.distance import pdist

def median_heuristic(y):
    """ Estimate RBF bandwidth using the median heuristic.

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.

    Returns
    -------
    bandwidth : float
        Estimated RBF bandwidth.
    """
    num_of_samples = y.shape[0]  # number of samples
    # if y contains more samples, it is subsampled to this cardinality
    num_of_samples_used = 100
    # subsample y if necessary (select 100 random rows):
    if num_of_samples > num_of_samples_used:
        idx = choice(num_of_samples, num_of_samples_used, replace=False)
        y = y[idx]  # row subsampling
    dist_vector = pdist(y)  # pairwise Euclidean distances
    bandwidth = median(dist_vector) / sqrt(2)
    return bandwidth
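A usage sketch: estimate an RBF bandwidth from a toy Gaussian sample.

import numpy as np

y = np.random.randn(500, 3)
print(median_heuristic(y))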