def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, use the median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))
    # compute the RBF kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)
    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)
def svgd_kernel(self, theta, h=-1):
    sq_dist = pdist(theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, use the median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))
    # compute the RBF kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)
    dxkxy = -np.matmul(Kxy, theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)
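# A minimal usage sketch (not from the original SVGD project): the second
# svgd_kernel variant above never touches `self`, so it can be exercised
# directly by passing None for self and a toy particle array for theta.
# The imports are the ones the snippet assumes at module level.
import numpy as np
from scipy.spatial.distance import pdist, squareform

theta = np.random.randn(50, 2)            # 50 particles in 2-D
Kxy, dxkxy = svgd_kernel(None, theta)     # h=-1 triggers the median-trick bandwidth
print(Kxy.shape, dxkxy.shape)             # (50, 50) (50, 2)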
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)
    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)
    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1], 0],
                    tsne[lenlist[i]:lenlist[i+1], 1],
                    c=palette[i], label=sublist[i])
    plt.legend()
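# A self-contained sketch of the same idea with random features instead of the
# recipe DataFrame (which is not shown here): embed points with t-SNE on a
# precomputed cosine-distance matrix. init='random' is passed because recent
# scikit-learn versions reject the default PCA initialisation when
# metric='precomputed'; older versions accept it as well.
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE

X = np.random.rand(120, 30)                       # 120 samples, 30 features
dist = squareform(pdist(X, metric='cosine'))
emb = TSNE(metric='precomputed', init='random').fit_transform(dist)
plt.scatter(emb[:, 0], emb[:, 1], s=10)
plt.title('t-SNE on precomputed cosine distances')
plt.show()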
# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
def hessian(self, x, d=None):
    """
    Computes Hessian matrix
    """
    d = calc_distances(x) if d is None else d
    if d.ndim == 1: d = squareform(d)
    H = np.zeros((3*len(x), 3*len(x)))
    n = self.n
    for i in range(len(x)):
        for j in range(len(x)):
            if j == i: continue
            dx = x[i] - x[j]
            r = d[i, j]
            h = n / r**(0.5*n+2) * ((n+2) * np.multiply.outer(dx, dx) - np.eye(3) * r)
            H[3*i:3*(i+1), 3*j:3*(j+1)] = -h
            H[3*i:3*(i+1), 3*i:3*(i+1)] += h
    return H
def construct_data_synthetic_Laplacian(D, lifetime, noise_var, N_train, N_test):
    # pick datapoint locations uniformly at random
    N = N_train + N_test
    X = np.random.rand(N, D)
    # construct kernel matrix (np.exp replaces the long-deprecated scipy.exp alias)
    K = np.exp(- lifetime * squareform(pdist(X, 'cityblock')))
    # sample the function at picked locations x
    y = np.linalg.cholesky(K).dot(np.random.randn(N)) + np.sqrt(noise_var) * np.random.randn(N)
    # pick training indices sequentially
    indices_train = range(0, N_train)
    indices_test = range(N_train, N)
    # split the data into train and test
    X_train = X[indices_train]
    X_test = X[indices_test]
    y_train = y[indices_train]
    y_test = y[indices_test]
    return X_train, y_train, X_test, y_test
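# Hedged usage sketch: draw a small synthetic dataset from the Laplacian-kernel
# prior defined above. The parameter values are illustrative, not taken from the
# source project; the imports are the ones the snippet assumes at module level.
import numpy as np
from scipy.spatial.distance import pdist, squareform

X_train, y_train, X_test, y_test = construct_data_synthetic_Laplacian(
    D=2, lifetime=1.0, noise_var=0.1, N_train=80, N_test=20)
print(X_train.shape, y_train.shape, X_test.shape)   # (80, 2) (80,) (20, 2)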
# SAMPLING
def compute_similarity_matrix(self, parent=None):
    clusters = list(self._models)
    n_clusters = len(clusters)
    X = np.vstack([self[cluster][0] for cluster in clusters])
    nX = l2_normalize(X)
    similarities = -squareform(pdist(nX, metric=self.distance))
    matrix = ValueSortedDict()
    for i, j in itertools.combinations(range(n_clusters), 2):
        matrix[clusters[i], clusters[j]] = similarities[i, j]
        matrix[clusters[j], clusters[i]] = similarities[j, i]
    return matrix
def compute_dcov_dcorr_statistics(y, alpha):
    """ Compute the statistics for distance covariance/correlation.

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.
    alpha : float
        0 < alpha < 2

    Returns
    -------
    c : (number of samples, number of samples)-ndarray
        Computed statistics (double-centered pairwise distance matrix).
    """
    d = squareform(pdist(y))**alpha
    ck = mean(d, axis=0)
    c = d - ck - ck[:, newaxis] + mean(ck)
    return c
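# Hedged follow-up sketch (not part of the original module): combining the
# double-centered statistics of two samples into the biased sample distance
# covariance and distance correlation, with alpha = 1. The numpy names and
# scipy imports below are the ones the snippet above assumes at module level.
import numpy as np
from numpy import mean, newaxis
from scipy.spatial.distance import pdist, squareform

x = np.random.randn(200, 3)
y = x[:, :1] ** 2 + 0.1 * np.random.randn(200, 1)   # nonlinearly dependent on x

cx = compute_dcov_dcorr_statistics(x, alpha=1.0)
cy = compute_dcov_dcorr_statistics(y, alpha=1.0)

dcov2 = np.mean(cx * cy)     # squared sample distance covariance
dvarx = np.mean(cx * cx)     # squared sample distance variances
dvary = np.mean(cy * cy)
dcorr = np.sqrt(dcov2 / np.sqrt(dvarx * dvary))
print(dcorr)                 # clearly above 0 because x and y are dependent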
def formClusters(dists, link, distance):
    """Form clusters based on hierarchical clustering of input distance matrix
    with linkage type and cutoff distance

    :param dists: numpy matrix of distances
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: list of cluster assignments
    """
    # Convert the square distance matrix to condensed form for linkage
    dists = squareform(dists)
    # Compute linkage
    links = linkage(dists, link)
    # import matplotlib.pyplot as plt
    # from scipy.cluster import hierarchy
    # plt.figure(figsize=(15,5))
    # p = hierarchy.dendrogram(links)
    # Break into clusters based on cutoff
    clusters = fcluster(links, distance, criterion='distance')
    return clusters
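# Hedged usage sketch: clustering two well-separated blobs with the helper
# above, using single linkage and an illustrative cutoff. The imports are the
# ones formClusters assumes at module level.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster

X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 10.0])
dists = squareform(pdist(X))                         # square (redundant) distance matrix
labels = formClusters(dists, link='single', distance=5.0)
print(sorted(set(labels)))                           # two clusters expected: [1, 2]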
def plot_hamming_dist(s, W, brec):
    masks = s[:, 0, :].T > 0
    x_hat = np.zeros(masks.shape)
    for ii in range(masks.shape[1]):
        Weff = W * masks[:, ii]
        x_hat[:, ii] = np.linalg.inv(np.eye(100) - Weff).dot(brec)
    fig = plt.figure()
    plt.pcolormesh(squareform(pdist(np.sign(x_hat[:, :]).T, metric='hamming')))  # ,vmax=.3)
    plt.colorbar()
    plt.ylim([0, x_hat.shape[1]])
    plt.xlim([0, x_hat.shape[1]])
    plt.axes().set_aspect('equal')
    plt.title('Hamming Distance Between Putative FPs')
    plt.ylabel('Time')
    plt.xlabel('Time')
    return fig
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X == Y, use the more efficient pdist call, which exploits symmetry
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert(shape(X)[1] == shape(Y)[1])
        dists = cdist(X, Y, 'euclidean')
    if self.nu == 0.5:
        # for nu = 1/2, the Matern class corresponds to the Ornstein-Uhlenbeck process
        K = (self.sigma**2.) * exp(-dists / self.width)
    elif self.nu == 1.5:
        K = (self.sigma**2.) * (1 + sqrt(3.) * dists / self.width) * exp(-sqrt(3.) * dists / self.width)
    elif self.nu == 2.5:
        K = (self.sigma**2.) * (1 + sqrt(5.) * dists / self.width + 5.0 * (dists**2.) / (3.0 * self.width**2.)) * exp(-sqrt(5.) * dists / self.width)
    else:
        raise NotImplementedError()
    return K
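# Hedged standalone sketch of the nu = 3/2 branch above, with illustrative
# sigma = 1.0 and width = 2.0 (the surrounding class and the GenericTests
# helper are not shown on this page).
import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.randn(30, 4)
dists = squareform(pdist(X, 'euclidean'))
sigma, width = 1.0, 2.0
K = sigma**2 * (1 + np.sqrt(3.) * dists / width) * np.exp(-np.sqrt(3.) * dists / width)
print(K.shape, np.allclose(K, K.T))   # (30, 30) True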
# From dataset.py in the neural-combinatorial-optimization-rl-tensorflow project (author: MichelDeudon)
def k_nearest_neighbor(self, sequence):
    # Calculate dist_matrix
    dist_array = pdist(sequence)
    dist_matrix = squareform(dist_array)
    # Construct tour
    new_sequence = [sequence[0]]
    current_city = 0
    visited_cities = [0]
    for i in range(1, len(sequence)):
        j = np.random.randint(0, min(len(sequence) - i, self.kNN))
        next_city = [index for index in dist_matrix[current_city].argsort() if index not in visited_cities][j]
        visited_cities.append(next_city)
        new_sequence.append(sequence[next_city])
        current_city = next_city
    return np.asarray(new_sequence)
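# Hedged standalone sketch (the surrounding class and self.kNN are not shown):
# the deterministic k = 1 special case of the routine above, i.e. a plain greedy
# nearest-neighbour tour over random 2-D cities.
import numpy as np
from scipy.spatial.distance import pdist, squareform

cities = np.random.rand(10, 2)
dist_matrix = squareform(pdist(cities))
tour, visited = [0], {0}
while len(tour) < len(cities):
    current = tour[-1]
    nxt = min((j for j in range(len(cities)) if j not in visited),
              key=lambda j: dist_matrix[current, j])
    tour.append(nxt)
    visited.add(nxt)
print(tour)   # a permutation of 0..9 starting at city 0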
# Generate random TSP-TW instance
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    # dict() keeps this working when all_pairs_shortest_path_length returns an iterator (networkx >= 2)
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest-point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
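# Hedged usage sketch on a small built-in graph. The imports are the ones
# create_hc assumes at module level; the number of clusters depends on the
# arbitrary t = 1.15 cutoff above.
import numpy
import networkx as nx
from collections import defaultdict
from scipy.cluster import hierarchy
from scipy.spatial import distance

G = nx.karate_club_graph()                 # nodes are the integers 0..33
partitions = create_hc(G)
print(len(partitions), [len(p) for p in partitions])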
def create_3D_distance_matrix(vox_ijk, epi_fname):
    """Compute distance between voxels in the volume.

    Parameters
    ----------
    vox_ijk : n x 3 array
        Indices of voxels included in the ROI.
    epi_fname : file path
        Path to image defining the volume space.

    Returns
    -------
    dmat : array
        Dense square distance matrix.
    """
    aff = nib.load(epi_fname).affine
    vox_ras = nib.affines.apply_affine(aff, vox_ijk)
    dmat = squareform(pdist(vox_ras))
    return dmat
def PQTrain(data, lenSubVec, numSubCenter):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
    centers = npy.zeros((numSubVec * numSubCenter, lenSubVec), dtype=npy.float32)
    distOfCenters = npy.zeros((numSubCenter, numSubCenter, numSubVec), dtype=npy.float32)
    objKmeans = KMeans(n_clusters=numSubCenter, init='k-means++', n_init=3, max_iter=100, tol=0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing " + str(ii) + "-th sub-vector")
        objKmeans.fit(data[:, ii*lenSubVec:(ii+1)*lenSubVec])
        centers[ii*numSubCenter:(ii+1)*numSubCenter, :] = objKmeans.cluster_centers_
        distOfCenters[:, :, ii] = squareform(pdist(objKmeans.cluster_centers_, metric="euclidean"))
    model = {"centers": centers, "distOfCenters": distOfCenters}
    return model
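# Hedged usage sketch with a tiny random dataset; the segment length and number
# of sub-centroids are illustrative. Assumes scikit-learn's KMeans and numpy
# imported as npy, as the snippet above expects at module level.
import numpy as npy
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform

data = npy.random.rand(500, 32).astype(npy.float32)
model = PQTrain(data, lenSubVec=8, numSubCenter=16)
print(model["centers"].shape, model["distOfCenters"].shape)   # (64, 8) (16, 16, 4)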
def _compute_centers(self, X, sparse, rs):
    """Generate centers, then compute tau, dF and dN vals"""
    super(GRBFRandomLayer, self)._compute_centers(X, sparse, rs)
    centers = self.components_['centers']
    sorted_distances = np.sort(squareform(pdist(centers)))
    self.dF_vals = sorted_distances[:, -1]
    self.dN_vals = sorted_distances[:, 1] / 100.0
    # self.dN_vals = 0.0002 * np.ones(self.dF_vals.shape)
    tauNum = np.log(np.log(self.grbf_lambda) /
                    np.log(1.0 - self.grbf_lambda))
    tauDenom = np.log(self.dF_vals / self.dN_vals)
    self.tau_vals = tauNum / tauDenom
    self._extra_args['taus'] = self.tau_vals
# get radii according to ref [1]
def kernel_matrix(svm_model, original_X):
    if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
        K = (svm_model.zeta + svm_model.gamma * np.dot(original_X, original_X.T)) ** svm_model.Q
    elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
        pairwise_dists = squareform(pdist(original_X, 'euclidean'))
        K = np.exp(-svm_model.gamma * (pairwise_dists ** 2))
    '''
    K = np.zeros((svm_model.data_num, svm_model.data_num))
    for i in range(svm_model.data_num):
        for j in range(svm_model.data_num):
            if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
                K[i, j] = Kernel.polynomial_kernel(svm_model, original_X[i], original_X[j])
            elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
                K[i, j] = Kernel.gaussian_kernel(svm_model, original_X[i], original_X[j])
    '''
    return K
def comp_distance(X, metric='euclidean'):
    """Compute distance matrix for data array X

    Parameters
    ----------
    X : np.ndarray
        Data array (rows store samples, columns store variables).
    metric : string
        For example 'euclidean' or 'sqeuclidean'; see sp.spatial.distance.pdist.

    Returns
    -------
    D : np.ndarray
        Distance matrix.
    """
    from scipy.spatial import distance
    return distance.squareform(distance.pdist(X, metric=metric))
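# Hedged usage sketch: a squared-Euclidean distance matrix for a toy array.
import numpy as np

D = comp_distance(np.random.rand(5, 3), metric='sqeuclidean')
print(D.shape, np.allclose(np.diag(D), 0.0))   # (5, 5) True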
def plot_clusters_igraph(responsibilities, color_groups):
    from scipy.spatial.distance import pdist, correlation, squareform
    from igraph import Graph, plot
    data = responsibilities[:, :2]
    Y = pdist(data, hellinger_distance)
    print(Y[:30], file=stderr)
    # return
    g = Graph()
    n = data.shape[0]
    g.add_vertices(n)
    colors = ["grey"] * n
    palette = list(colors_dict.values())
    for j, group in enumerate(color_groups):
        c = palette[j]
        for i in group:
            colors[i] = c
    l = g.layout_mds(dist=squareform(Y))
    plot(g, layout=l, vertex_color=colors, bbox=(1024, 1024), vertex_size=5)
# c&p from stackexchange
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
    assert_equal(n_clusters_1, n_clusters)
    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def group_by_proximity(self, k=10):
    if len(self.points) == 0:
        return {}
    X = numpy.array([[p.lat, p.lon] for p in self.points])
    distance_matrix = distance.squareform(distance.pdist(X))
    db = KMeans(n_clusters=k).fit(distance_matrix)
    # re-attach ids
    grouped_points = {}
    for i, k in enumerate(db.labels_):
        logger.debug('idx, label [%s, %s]', i, k)
        if k not in grouped_points:
            grouped_points[k] = []
        point = self.points[i]
        grouped_points[k].append({'id': point.uid, 'lat': point.lat, 'lon': point.lon})
    logger.info('Finished grouping into %d groups.', len(grouped_points))
    return grouped_points
def cluster_sequences(sequences, minsize=5):
    """
    Cluster the given sequences into groups of similar sequences.

    Return a triple that contains a pandas.DataFrame with the edit distances,
    the linkage result, and a list that maps sequence ids to their cluster id.
    If an entry is zero in that list, it means that the sequence is not part of
    a cluster.
    """
    matrix = distances(sequences)
    linkage = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(linkage))
    prev = linkage[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    cl = 1
    for n in inner:
        if n.dist > 0 and prev / n.dist < 0.8 \
                and n.left.count >= minsize and n.right.count >= minsize:
            for id in collect_ids(n.left):
                # Do not overwrite previously assigned ids
                if clusters[id] == 0:
                    clusters[id] = cl
            cl += 1
        prev = n.dist
    # At the end of the above loop, we have not processed the rightmost
    # subtree. In our experiments, it never contains true novel sequences,
    # so we omit it.
    return pd.DataFrame(matrix), linkage, clusters
def compute_db_index(matrix, kmeans):
    '''
    Compute Davies-Bouldin index, a measure of clustering quality.
    Faster and possibly more reliable than silhouette score.
    '''
    (n, m) = matrix.shape
    k = kmeans.n_clusters
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    centroid_dists = sp_dist.squareform(sp_dist.pdist(centers))
    # Avoid divide-by-zero
    centroid_dists[np.abs(centroid_dists) < MIN_CENTROID_DIST] = MIN_CENTROID_DIST
    wss = np.zeros(k)
    counts = np.zeros(k)
    for i in range(n):
        label = labels[i]
        # note: this is 2x faster than scipy sqeuclidean
        sqdist = np.square(matrix[i, :] - centers[label, :]).sum()
        wss[label] += sqdist
        counts[label] += 1
    # Handle empty clusters
    counts[counts == 0] = 1
    scatter = np.sqrt(wss / counts)
    mixitude = (scatter + scatter[:, np.newaxis]) / centroid_dists
    np.fill_diagonal(mixitude, 0.0)
    worst_case_mixitude = np.max(mixitude, axis=1)
    db_score = worst_case_mixitude.sum() / k
    return db_score
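# Hedged usage sketch: Davies-Bouldin score of a KMeans fit on two separated
# blobs. MIN_CENTROID_DIST is a module constant in the original code; an
# illustrative value is set here, and sp_dist is the alias the function above
# expects for scipy.spatial.distance.
import numpy as np
import scipy.spatial.distance as sp_dist
from sklearn.cluster import KMeans

MIN_CENTROID_DIST = 1e-6
matrix = np.vstack([np.random.randn(100, 5), np.random.randn(100, 5) + 5.0])
kmeans = KMeans(n_clusters=2, n_init=10).fit(matrix)
print(compute_db_index(matrix, kmeans))   # small value for well-separated clusters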
def run_orange3(Z, D):
    import Orange.clustering.hierarchical as orange_hier
    tree = orange_hier.tree_from_linkage(Z)
    start_time = time.time()
    orange_hier.optimal_leaf_ordering(tree, squareform(D))
    end_time = time.time()
    return end_time - start_time, None
def kernel_matrix(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]
    sq_dists = squareform(pdist(X, 'sqeuclidean'))
    K = np.exp(-sq_dists / self.scaling)
    return K
def k_multiple(self, X):
    """
    Efficient computation of kernel matrix without loops

    Effectively does the same as calling self.k on all pairs of the input
    """
    assert(X.ndim == 1)
    sq_dists = squareform(pdist(X.reshape(len(X), 1), 'sqeuclidean'))
    K = np.exp(-(sq_dists) / self.scaling)
    return K
def k_multiple_dim(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]
    sq_dists = squareform(pdist(X, 'sqeuclidean'))
    K = np.exp(-(sq_dists) / self.scaling)
    return K
def plot_bokeh(df, sublist, filename):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)
    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)
    # cannot use a seaborn palette for bokeh
    palette = ['red', 'green', 'blue', 'yellow']
    colors = []
    for i in range(len(sublist)):
        for j in range(lenlist[i+1] - lenlist[i]):
            colors.append(palette[i])
    # plot with bokeh
    output_file(filename)
    source = ColumnDataSource(
        data=dict(x=tsne[:, 0], y=tsne[:, 1],
                  cuisine=df_sub['cuisine'],
                  recipe=df_sub['recipeName']))
    hover = HoverTool(tooltips=[
        ("cuisine", "@cuisine"),
        ("recipe", "@recipe")])
    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")
    p.circle('x', 'y', size=10, source=source, fill_color=colors)
    show(p)
def gradient(self, x, D=None):
    """
    Return gradient
    """
    dx = np.array([np.subtract.outer(x[:, d], x[:, d]) for d in range(x.shape[1])])
    D = calc_distances(x) if D is None else D
    D = D**(-0.5*self.n - 1)
    D = squareform(D)
    g = -self.n * dx * D
    return g.sum(-1).T