def computeValDataDistanceMatrix(self):
    # TODO: use self.computeDescriptors(self.valdataDB) ?
    batchSize = self.cfgParams.batch_size
    nSamp = self.n_val_batches * batchSize
    descr = numpy.zeros((nSamp, self.descrNet.cfgParams.outputDim[1]))
    for i in range(self.n_val_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(self.valdataDB, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(self.valdataDB, i)
        d = self.tfComputeDescr(miniBatchIdx)
        descr[i*batchSize:(i+1)*batchSize] = d
    dst = scipy.spatial.distance.pdist(descr, 'euclidean')
    dst = scipy.spatial.distance.squareform(dst)
    return dst
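# The methods above and below share the same pattern: descriptors are collected batch
# by batch, then turned into a full pairwise distance matrix. A minimal, self-contained
# sketch of that last step, with a hypothetical random array in place of the network output:
#
# import numpy as np
# import scipy.spatial.distance
#
# descr = np.random.rand(6, 32)                                  # 6 hypothetical 32-dim descriptors
# condensed = scipy.spatial.distance.pdist(descr, 'euclidean')   # shape (15,): upper triangle only
# dst = scipy.spatial.distance.squareform(condensed)             # symmetric 6x6 matrix, zero diagonal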
def computeDistanceMatrix(self, test_set):
    batch_size = self.cfgParams.batch_size
    nSamp = test_set.numSamples
    descrLen = self.descrNet.cfgParams.outputDim[1]
    descr = numpy.zeros((nSamp, descrLen))
    n_test_batches = nSamp // batch_size  # integer division; a trailing partial batch is skipped
    for i in range(n_test_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(test_set, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(test_set, i)
        d = self.tfComputeDescr(miniBatchIdx)
        descr[i*batch_size:(i+1)*batch_size] = d
    print("distances done")
    dst = scipy.spatial.distance.pdist(descr, 'euclidean')
    dst = scipy.spatial.distance.squareform(dst)
    return dst
def fit(self, feat):
    # Compute affinity matrix using RBF kernel on pair-wise distances
    affinity = scipy.spatial.distance.pdist(np.array([f for id, f in feat]))
    sigma = -2 * np.var(affinity)
    affinity = np.exp(scipy.spatial.distance.squareform(affinity) / sigma)
    # Recursive clustering
    self.tree = { 'depth' : 0, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : None, 'items' : feat, 'affinity' : affinity }
    queue = []
    heapq.heappush(queue, (-1 * len(self.tree['items']), np.random.rand(), self.tree))
    while (self.tree['leafs'] < self.max_clusters) and (len(queue) > 0):
        if len(queue[0][2]['items']) <= self.min_cluster_size:
            break
        left, right, ncut_value = self.split(heapq.heappop(queue)[2])
        if ncut_value > self.T:
            break
        if (left is not None) and (right is not None):
            heapq.heappush(queue, (-1 * len(left['items']), np.random.rand(), left))
            heapq.heappush(queue, (-1 * len(right['items']), np.random.rand(), right))
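# Self-contained sketch of the affinity construction at the top of fit(), with random
# features standing in for the (id, feature) pairs. The distances are divided by a
# negative scale (-2 times their variance), so the exponent is negative and the result
# behaves like an RBF affinity.
#
# import numpy as np
# import scipy.spatial.distance
#
# X = np.random.rand(10, 5)                         # 10 hypothetical feature vectors
# d = scipy.spatial.distance.pdist(X)               # condensed Euclidean distances
# sigma = -2 * np.var(d)                            # negative scale, as in fit() above
# affinity = np.exp(scipy.spatial.distance.squareform(d) / sigma)  # values in (0, 1], 1 on the diagonal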
def get_score_funcs():
    """Helper to get the score functions."""
    from scipy import stats
    from scipy.spatial import distance
    score_funcs = Bunch()
    xy_arg_dist_funcs = [(n, f) for n, f in vars(distance).items()
                         if isfunction(f) and not n.startswith('_')]
    xy_arg_stats_funcs = [(n, f) for n, f in vars(stats).items()
                          if isfunction(f) and not n.startswith('_')]
    score_funcs.update(dict((n, _make_xy_sfunc(f))
                            for n, f in xy_arg_dist_funcs
                            if _get_args(f) == ['u', 'v']))
    score_funcs.update(dict((n, _make_xy_sfunc(f, ndim_output=True))
                            for n, f in xy_arg_stats_funcs
                            if _get_args(f) == ['x', 'y']))
    return score_funcs
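# For illustration, a standalone variant of the first filtering step. It is a sketch only:
# it uses inspect.getfullargspec instead of the helper _get_args used above, and it checks
# only the two leading arguments, so it is looser than the exact ['u', 'v'] match.
#
# from inspect import getfullargspec, isfunction
# from scipy.spatial import distance
#
# # Collect callables from scipy.spatial.distance whose first two arguments are (u, v),
# # i.e. the two-vector metrics such as euclidean, cosine, cityblock, ...
# metrics = {name: func for name, func in vars(distance).items()
#            if isfunction(func) and not name.startswith('_')
#            and getfullargspec(func).args[:2] == ['u', 'v']}
# print(sorted(metrics))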
def thrEstimation(self):
    x = 0.00
    dx = 0.05
    countsList = []
    x_list = []
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        countsList.append(counter[Best])
        x += dx
        x_list.append(x)
    dy = np.diff(countsList)
    for a, b in zip(x_list, dy):
        if b == max(dy):
            return a
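# Hypothetical standalone usage of the same threshold scan, assuming a scipy linkage
# matrix built from random data (thrEstimation() applies fcluster to self.Tree in the
# same way):
#
# import collections
# import numpy as np
# from scipy.cluster import hierarchy
# from scipy.spatial import distance
#
# X = np.random.rand(20, 4)                                    # hypothetical data
# Tree = hierarchy.linkage(distance.pdist(X), method='average')
# flat = hierarchy.fcluster(Tree, 0.5, criterion='distance')   # flat clusters at threshold 0.5
# best_label, best_size = collections.Counter(flat).most_common(1)[0]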
def distancematrix(data, ydata=None):
    if ydata is None:
        distances = scipy.spatial.distance.pdist(data, 'euclidean')
    else:
        distances = scipy.spatial.distance.cdist(data, ydata, 'euclidean')
    return distances.astype(numpy.float32)
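# Usage of the helper above: with one argument it returns the condensed pdist vector,
# with two it returns the full cdist matrix (shapes shown for hypothetical inputs).
#
# import numpy as np
#
# A = np.random.rand(5, 3)
# B = np.random.rand(4, 3)
# distancematrix(A).shape      # (10,)  -> condensed pdist over the 5 rows of A
# distancematrix(A, B).shape   # (5, 4) -> cdist between rows of A and rows of B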
def batch_pdist(data_slice):
    # Each data_slice has tuples consisting of two points that we need to
    # find the great circle distance between and their weight:
    partial_sum = 0
    for X, Y, weights in data_slice:
        dist = np.array([])
        zipped = zip(X, Y)
        for x, y in zipped:
            dist = np.append(dist, great_circle(x, y).km)
        partial_sum += np.sum(weights * dist)
    return partial_sum
    # return 10
def mean_pairwise_distance(X, weights=None, n_jobs=None, axis=0):
    """Function that returns the sum and mean of the pairwise distances of a 2D
    array X.
    Required arguments:
    X -- 2D array of points.
    Optional arguments:
    weights -- 1D array of counts or weights per point in X (default: 1s).
    n_jobs -- Number of cores to use for calculation (default: all).
    axis -- The axis of X corresponding to data elements (default: 0).
    """
    N = X.shape[axis]
    if weights is None:
        weights = np.ones((N,))
    if n_jobs is None:
        n_jobs = min(mp.cpu_count(), N)
    # Get the pairs and their weights to calculate the distances without
    # needing the whole of X, split it into roughly equal sub-arrays per cpu:
    pairs_split = np.array_split([(X[i:], X[:N - i], weights[i:] * weights[:N - i])
                                  for i in range(1, N)],
                                 n_jobs, axis=axis)
    # Create a pool for each cpu to send the batch_pdist function to each split.
    # Then, close the pool and wait for jobs to complete before continuing:
    pool = mp.Pool(processes=n_jobs)
    queue_sum = sum(pool.map(batch_pdist, pairs_split, chunksize=N // n_jobs))
    pool.close()
    pool.join()
    N = weights.sum()
    # Compute the number of combinations, add to the number of unique pairs
    # and use that as the denominator to calculate the mean pairwise distance:
    mean = queue_sum / (N * (N - 1.0) / 2.0)
    # If you do not want to include distance from an item to itself use:
    # mean = queue_sum / (((N - 1)**2 + (N + 1)) / 2.0)
    return queue_sum, mean
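# The shifted-slice pairing above, zip(X[i:], X[:N-i]) for i = 1..N-1, enumerates every
# unordered pair of rows exactly once, which is why N*(N-1)/2 appears as the denominator.
# A small self-contained check of that claim on the index pairs it generates:
#
# N = 5
# pairs = {(j + i, j) for i in range(1, N) for j in range(N - i)}
# assert len(pairs) == N * (N - 1) // 2   # 10 distinct unordered pairs for N = 5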
def distance(a, b):
    """Slow distance computation to simulate work."""
    return np.sum(np.sqrt(np.sum((a - b)**2, axis=1)))
# Parallel:
def checkFiltersDist(descrNet):
    wvals = descrNet.layer0.W.get_value()
    wvals = wvals.reshape((wvals.shape[0], numpy.prod(wvals.shape[1:])))
    dst = scipy.spatial.distance.pdist(wvals, 'cosine')
    dst = scipy.spatial.distance.squareform(dst)
    showDistanceMatrix(dst)
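# Minimal sketch of the same check on a hypothetical flattened weight matrix
# (showDistanceMatrix is a plotting helper from the surrounding project and is omitted):
#
# import numpy
# import scipy.spatial.distance
#
# wvals = numpy.random.rand(16, 5 * 5 * 3)            # 16 hypothetical filters, flattened
# dst = scipy.spatial.distance.squareform(
#     scipy.spatial.distance.pdist(wvals, 'cosine'))  # 16x16 cosine-distance matrix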
def split(self, node):
    # Perform normalized cut
    try:
        ind = SpectralClustering(2, affinity='precomputed', assign_labels='discretize').fit_predict(node['affinity'])
    except KeyboardInterrupt:
        raise
    except:
        return None, None, 0
    # Create left and right node
    mask1, mask2 = (ind == 0), (ind == 1)
    if not (np.any(mask1) and np.any(mask2)):
        return None, None, 0
    left = { 'depth' : node['depth'] + 1, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : node, 'items' : [f for i, f in enumerate(node['items']) if ind[i] == 0], 'affinity' : node['affinity'][np.ix_(mask1, mask1)] }
    right = { 'depth' : node['depth'] + 1, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : node, 'items' : [f for i, f in enumerate(node['items']) if ind[i] == 1], 'affinity' : node['affinity'][np.ix_(mask2, mask2)] }
    # Force the node with the lower minimum distance to the query to be the left node
    if ind[0] == 1:  # items are already sorted when passed to fit(), so we just need to look at the first item instead of re-computing all distances
        left, right = right, left
    # Modify parent
    node['children'] = [left, right]
    # Modify parent chain
    parent = node
    while parent is not None:
        parent['height'] += 1
        parent['size'] += 2
        parent['leafs'] += 1
        parent = parent['parent']
    return left, right, self.ncut_value(node['affinity'], ind)
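# Standalone sketch of the normalized-cut step, assuming an RBF-style affinity matrix
# like the one built in fit() and scikit-learn's SpectralClustering:
#
# import numpy as np
# import scipy.spatial.distance
# from sklearn.cluster import SpectralClustering
#
# X = np.random.rand(8, 3)                                        # hypothetical items
# affinity = np.exp(-scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X)))
# ind = SpectralClustering(2, affinity='precomputed',
#                          assign_labels='discretize').fit_predict(affinity)
# mask1, mask2 = (ind == 0), (ind == 1)   # row/column masks for the two child nodes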
def cosine_similarity(repr1, repr2):
    """Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity)."""
    if repr1 is None or repr2 is None:
        return 0
    assert not (np.isnan(repr2).any() or np.isinf(repr2).any())
    assert not (np.isnan(repr1).any() or np.isinf(repr1).any())
    sim = 1 - scipy.spatial.distance.cosine(repr1, repr2)
    if np.isnan(sim):
        # the similarity is nan if no term in the document is in the vocabulary
        return 0
    return sim
def euclidean_distance(repr1, repr2):
    """Calculates Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance)."""
    sim = np.sqrt(np.sum([np.power(p - q, 2) for (p, q) in zip(repr1, repr2)]))
    return sim
def variational_distance(repr1, repr2):
    """Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry)."""
    sim = np.sum([np.abs(p - q) for (p, q) in zip(repr1, repr2)])
    return sim
def bhattacharyya_distance(repr1, repr2):
    """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance)."""
    sim = -np.log(np.sum([np.sqrt(p * q) for (p, q) in zip(repr1, repr2)]))
    assert not np.isnan(sim), 'Error: Similarity is nan.'
    if np.isinf(sim):
        # the similarity is -inf if no term in the review is in the vocabulary
        return 0
    return sim
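# Example usage of the four measures above on two hypothetical term-distribution vectors
# (the functions assume numpy as np and scipy.spatial.distance are imported at module
# level; bhattacharyya_distance expects non-negative entries because of the square root):
#
# import numpy as np
#
# p = np.array([0.2, 0.5, 0.3])
# q = np.array([0.1, 0.6, 0.3])
# print(cosine_similarity(p, q))       # close to 1 for similar distributions
# print(euclidean_distance(p, q))      # sqrt(0.01 + 0.01 + 0.0) ~ 0.141
# print(variational_distance(p, q))    # 0.1 + 0.1 + 0.0 = 0.2
# print(bhattacharyya_distance(p, q))  # -log(sum of sqrt(p*q))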
def createLabels(self):
    self.labelList = []
    with open(self.ccFile) as f:
        for line in f:
            if line.strip() == 'Labels':
                break
        for line in f:
            if line.strip() == 'Correlation coefficients':
                break
            goodLine = line.split()
            self.labelList.append("%s" % (goodLine[2].strip('\n')))
    return self.labelList
#changed, now the distance is defined directly by ccCalc
def checkMultiplicity(self, thr):
    FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
    counter = collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]
    print('You are clustering with a threshold of %s' % (thr))
    print('The biggest cluster contains %s datasets from a total of %s' % (counter[Best], len(self.labelList)))
def completenessEstimation(self):
    x = 0.00
    dx = 0.05
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        x += dx
def minimalForCompleteness(self):
    print("Running estimator for minimal threshold for completeness")
    labels = self.createLabels()
    x = 0.00
    dx = 0.05
    countsList = {}
    x_list = []
    while x < 1:
        Arrays = {}
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        toProcess = [Best]
        y = 0
        for cluster, filename in zip(FlatC, labels):
            if cluster in toProcess:
                hklFile = any_reflection_file(filename)
                b = hklFile.as_miller_arrays()
                for column in b:
                    if column.is_xray_intensity_array():
                        Arrays[y] = column
                        break
                y += 1
        try:
            Arr = Arrays[0]
        except KeyError:
            # no intensity array collected for this threshold
            countsList[x] = 0
            x += dx
            continue
        for label in range(1, y):
            try:
                Arr = Arr.concatenate(Arrays[label])
            except Exception:
                pass
        countsList[x] = Arr.completeness()
        x += dx
    # return minimal for max
    L = []
    for key in countsList:
        if countsList[key] > 0.98:
            L.append(key)
    L.sort()
    return L[0]
def createDendrogram(self):
    X = hierarchy.dendrogram(self.Tree, color_threshold=self.threshold)
    #self.textOutput.append('Plotted Dendrogram. Colored at a %s threshold for distance'%(threshold))
    self.TreeCanvas.draw()
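# Minimal standalone sketch of the same dendrogram call, assuming a scipy linkage matrix
# and plain matplotlib instead of the class's canvas widget:
#
# import numpy as np
# import matplotlib.pyplot as plt
# from scipy.cluster import hierarchy
#
# Tree = hierarchy.linkage(np.random.rand(10, 4), method='average')
# hierarchy.dendrogram(Tree, color_threshold=0.7)   # color branches below the threshold
# plt.show()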
def mineHardNegativeTrainingPairsWithinMiniBatches(self):
    dnParams = self.descrNet.cfgParams
    batch_size = self.cfgParams.batch_size
    pairIdx = self.tvPairIdx
    #pairLabels = self.tvPairLabels
    y = self.tvY
    margin = self.pair_neg_margin
    diff = self.descrNet.output[pairIdx[:, 0]] - self.descrNet.output[pairIdx[:, 1]]
    dst = T.sum(diff**2, axis=1) / dnParams.outputDim[1]  # divide by number of outputs, such that the max distance is 1
    pairLabels = T.eq(y[pairIdx[:, 0]], y[pairIdx[:, 1]])  # same class / different class ?
    pair_cost = pairLabels*dst + (1-pairLabels)*T.sqr(T.maximum(0, margin - T.sqrt(dst)))
    # indices for all pairs of vectors in the minibatch
    pidx1, pidx2 = numpy.triu_indices(batch_size, 1)  #numpy.mask_indices(batch_size, numpy.triu, 1)
    pidx1 = pidx1.reshape((len(pidx1), 1))
    pidx2 = pidx2.reshape((len(pidx2), 1))
    comb_pairIdx = numpy.concatenate((pidx1, pidx2), axis=1).astype(numpy.int32)
    dm = self.dataManager
    if isinstance(self.tvX, list):
        givens = {tv: data[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size] for (tv, data) in zip(self.tvX, dm.tvsData_x)}
    else:
        givens = {self.tvX: dm.tvsData_x[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]}
    givens[self.y] = dm.tvsData_y[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]
    givens[pairIdx] = comb_pairIdx
    tf = theano.function(inputs=[self.tvIndex],
                         outputs=[pair_cost],
                         givens=givens)
    # for every sample get the index of the other sample with which together it forms the most expensive (highest cost) pair
    nSamp = self.n_train_batches*batch_size
    idx = numpy.zeros(nSamp, dtype=numpy.int32)
    labels = numpy.zeros(nSamp, dtype=numpy.int32)
    for i in range(self.n_train_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(self.traindataDB, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(self.traindataDB, i)
        c = tf(miniBatchIdx)
        c = scipy.spatial.distance.squareform(c[0])
        # find the max for each
        offset = i*batch_size
        maxIdx = numpy.argmax(c, axis=0) + offset
        idx[i*batch_size:(i+1)*batch_size] = maxIdx
        labels[i*batch_size:(i+1)*batch_size] = self.traindataDB.y[maxIdx] == self.traindataDB.y[i*batch_size:(i+1)*batch_size]
        #print(c)
    idx = numpy.concatenate((numpy.arange(nSamp, dtype=numpy.int32).reshape(nSamp, 1), idx.reshape(nSamp, 1)), axis=1)
    return idx, labels
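# The core lookup in the loop above is generic: squareform() unfolds the per-pair costs
# (one value per upper-triangle index pair) into a symmetric matrix, and an argmax per
# column picks the most expensive partner for each sample. A self-contained sketch with
# hypothetical random costs:
#
# import numpy
# import scipy.spatial.distance
#
# batch_size = 4
# pidx1, pidx2 = numpy.triu_indices(batch_size, 1)   # all 6 index pairs with i < j
# pair_cost = numpy.random.rand(len(pidx1))          # hypothetical contrastive costs
# c = scipy.spatial.distance.squareform(pair_cost)   # 4x4 symmetric cost matrix
# hardest = numpy.argmax(c, axis=0)                  # hardest partner for each sample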