def all_distances(self, l1, metric='cosine'):
    """Return a distance matrix with distances from each word in l1 to all words."""
    l1_vecs = self.word_vectors_matrix(l1)
    l1_labels = [self.label(e) for e in l1]
    dists = self.pairwise_distances(l1_vecs, self.vectors, metric=metric)
    return pd.DataFrame(dists, l1_labels, self.words)
def pair_distance(self, w1, w2, metric='cosine'):
    """Calculate the distance between two words."""
    distance = self.pairwise_distances(
        self.get_vector(w1),
        self.get_vector(w2), metric=metric)
    return distance[0, 0]
def matrix_distances(self, l1, l2=None, metric='cosine'):
    """Return a distance matrix with distances between pairs of words."""
    l1_vecs = self.word_vectors_matrix(l1)
    l1_labels = [self.label(e) for e in l1]
    if l2 is None:
        dists = self.pairwise_distances(l1_vecs, metric=metric)
        l2_labels = l1_labels  # self-distances: reuse the row labels for the columns
    else:
        l2_vecs = self.word_vectors_matrix(l2)
        l2_labels = [self.label(e) for e in l2]
        dists = self.pairwise_distances(l1_vecs, l2_vecs, metric=metric)
    return pd.DataFrame(dists, l1_labels, l2_labels)
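These three helpers share the same sklearn backend; a quick usage sketch (the wrapper object emb and its vocabulary are hypothetical, and label() is assumed to return the word itself):

# hypothetical embedding wrapper exposing the methods above
df = emb.matrix_distances(['cat', 'dog', 'car'], metric='cosine')
print(df.loc['cat', 'dog'])             # cosine distance between 'cat' and 'dog'
print(emb.pair_distance('cat', 'dog'))  # same value via the pairwise helper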
def computeProbabilities(X, perplexity=30.0, tolerance=1e-5):
    # Perform an initial dimensionality reduction
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)
    numSamples = X.shape[0]
    P = np.zeros((numSamples, numSamples))
    D = pairwise_distances(X, squared=True)
    for i in range(numSamples):
        indices = np.concatenate((np.arange(i), np.arange(i + 1, numSamples)))
        distancesFromI = D[i, indices]
        sigma = binarySearch(computePerplexity, distancesFromI, tolerance, perplexity)
        precision = 1.0 / sigma
        # Compute a "row" of matrix P: the probabilities w.r.t. point i
        PwrtI = np.exp(-distancesFromI * precision)
        PwrtI /= np.sum(PwrtI)
        # Insert a zero entry for i w.r.t. itself, then store the row
        PwrtI = np.concatenate((PwrtI[:i], [0.0], PwrtI[i:]))
        P[i, :] = PwrtI
    return P
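The computePerplexity and binarySearch helpers are referenced but not shown above; a minimal sketch under common t-SNE conventions (these bodies are assumptions, matching only the call signatures used in the snippet) could be:

def computePerplexity(distances, precision):
    # Hypothetical helper: perplexity of the Gaussian defined by `precision`
    P = np.exp(-distances * precision)
    sumP = np.sum(P)
    # Shannon entropy of the conditional distribution (in nats)
    H = np.log(sumP) + precision * np.sum(distances * P) / sumP
    return np.exp(H)  # natural log above, so exp(H) rather than 2**H

def binarySearch(fn, distances, tolerance, target, maxIter=50):
    # Hypothetical helper: find sigma so that fn(distances, 1/sigma) ~= target
    lo, hi, sigma = 1e-20, 1e20, 1.0
    for _ in range(maxIter):
        perp = fn(distances, 1.0 / sigma)
        if abs(perp - target) < tolerance:
            break
        if perp > target:   # distribution too flat: shrink sigma
            hi = sigma
            sigma = (lo + sigma) / 2.0
        else:               # distribution too peaked: grow sigma
            lo = sigma
            sigma = (sigma + hi) / 2.0 if hi < 1e20 else sigma * 2.0
    return sigma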
def main(args):
    PF, PL, GF, GL = _get_test_data(args.result_dir)
    D = pairwise_distances(GF, PF, metric=args.method, n_jobs=-2)
    gallery_labels_set = np.unique(GL)
    for label in PL:
        if label not in gallery_labels_set:
            print('Probe-id is out of Gallery-id sets.')
    Times = 100
    k = 20
    res = np.zeros(k)
    # Assumes gallery labels are contiguous integers 0..N-1
    gallery_labels_map = [[] for _ in range(gallery_labels_set.size)]
    for i, g in enumerate(GL):
        gallery_labels_map[g].append(i)
    for __ in range(Times):
        # Randomly select one gallery sample per label
        newD = np.zeros((gallery_labels_set.size, PL.size))
        for i, g in enumerate(gallery_labels_set):
            j = np.random.choice(gallery_labels_map[g])
            newD[i, :] = D[j, :]
        # Accumulate the CMC over the random gallery draws
        res += _cmc_core(newD, gallery_labels_set, PL, k)
    res /= Times
    for topk in [1, 5, 10, 20]:
        print('{:8}{:8.1%}'.format('top-' + str(topk), res[topk - 1]))
def getDist(feat1, feat2, metric):
    pair_num = len(feat1)
    import sklearn.metrics.pairwise as pw
    mt = pw.pairwise_distances(feat1, feat2, metric=metric)
    # Only the diagonal is needed: the distance between each aligned pair
    distance = np.empty((pair_num,))
    for i in range(pair_num):
        distance[i] = mt[i, i]
    return distance
# Extract feature via network.
graphssl.py source (project: graph-based-semi-supervised-learning, author: deerishi)
def constructCovarianceMatrix(self):
    # Construct the covariance matrix for the dataset, project the data into
    # the Mahalanobis space, and run label propagation over the resulting graph
    self.covarianceMatrix = np.cov(self.trainVectorsPCA.T)  # numpy treats rows as variables, hence the transpose
    self.inverseCovarianceMatrix = np.linalg.inv(self.covarianceMatrix)
    # Compute the Cholesky decomposition and transform the data into the new space
    self.L_cov = np.linalg.cholesky(self.covarianceMatrix)
    self.allDataCov = np.dot(self.allDataPCA, self.L_cov.T)
    projectedDigits = TSNE(random_state=randomState).fit_transform(self.allDataCov)
    plt.figure()
    plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
    plt.title('Data projected by Covariance Matrix in Mahalanobis metric')
    plt.savefig(pp, format='pdf')
    plt.close()
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        # Recompute the distances for every k; the original zeroed the same
        # matrix repeatedly, so later k values saw an already-sparsified graph
        self.pwdis = pairwise_distances(self.allDataCov)
        self.D = np.zeros(self.pwdis.shape)
        for i in range(self.pwdis.shape[0]):
            # Keep only the k nearest neighbours of node i; zero the rest
            allnearestNeighbours = np.argsort(self.pwdis[i])
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = np.sum(self.pwdis[i]) + 0.01  # small regulariser on the degree
        print('accuracy by using Covariance Matrix for Mahalanobis Distance for k =', k)
        accs.append(self.labelPropogation())
    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using Covariance Matrix in Mahalanobis metric')
    plt.savefig(pp, format='pdf')
graphssl.py source (project: graph-based-semi-supervised-learning, author: deerishi)
def constructEucleadianGaussianKernel(self):
    # Distances on the PCA-projected data, turned into a Gaussian affinity
    euclideanDistances = pairwise_distances(self.allDataPCA)
    maccs = []
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    for k in ks:
        sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
        accs = []
        for sigma in sigmas:
            # Start from a fresh copy each time; the original re-exponentiated
            # the already-transformed matrix on every sigma iteration
            self.pwdis = np.exp(-euclideanDistances / (2 * sigma * sigma))
            self.D = np.zeros(self.pwdis.shape)
            for i in range(self.pwdis.shape[0]):
                # pwdis now holds similarities, so keep the k LARGEST entries
                allnearestNeighbours = np.argsort(self.pwdis[i])[::-1]
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = np.sum(self.pwdis[i])
            print('accuracy for constructEucleadianGaussianKernel with k =', k, 'and sigma =', sigma)
            accs.append(self.labelPropogation())
        maccs.append(np.mean(accs))
    plt.figure()
    plt.plot(ks, maccs)
    plt.title('Accuracy vs k for Euclidean Gaussian Kernel')
    plt.savefig(pp, format='pdf')
    plt.close()
graphssl.py source (project: graph-based-semi-supervised-learning, author: deerishi)
def constructEucleadianGaussianKernelNoPca(self):
    # Same as above, but on the raw vectors: no PCA transformation is applied
    euclideanDistances = pairwise_distances(self.allVectors)
    maccs = []
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    for k in ks:
        sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
        accs = []
        for sigma in sigmas:
            # Start from a fresh copy each time; the original re-exponentiated
            # the already-transformed matrix on every sigma iteration
            self.pwdis = np.exp(-euclideanDistances / (2 * sigma * sigma))
            self.D = np.zeros(self.pwdis.shape)
            for i in range(self.pwdis.shape[0]):
                # pwdis now holds similarities, so keep the k LARGEST entries
                allnearestNeighbours = np.argsort(self.pwdis[i])[::-1]
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = np.sum(self.pwdis[i])
            print('accuracy for constructEucleadianGaussianKernelNoPca with k =', k, 'and sigma =', sigma)
            accs.append(self.labelPropogation())
        maccs.append(np.mean(accs))
    plt.figure()
    plt.plot(ks, maccs)
    plt.title('Accuracy vs k for Euclidean Gaussian Kernel (no PCA)')
    plt.savefig(pp, format='pdf')
    plt.close()
graphssl.py source (project: graph-based-semi-supervised-learning, author: deerishi)
def constructSimilartyMatrixCosine(self):
    # A simple k-nearest-neighbour graph based on cosine distance:
    # compute all pairwise cosine distances, keep the k nearest neighbours
    # of each node, then build the diagonal weight matrix D from the row sums
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        self.pwdis = pairwise_distances(self.allVectors, metric='cosine')
        self.D = np.zeros(self.pwdis.shape)
        for i in range(self.pwdis.shape[0]):
            # Keep only the k nearest neighbours of node i; zero the rest
            allnearestNeighbours = np.argsort(self.pwdis[i])
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = np.sum(self.pwdis[i])
        print('accuracy on non-PCA data using cosine and k =', k)
        accs.append(self.labelPropogation())
    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using cosine non PCA data')
    plt.savefig(pp, format='pdf')
    plt.close()
graphssl.py source (project: graph-based-semi-supervised-learning, author: deerishi)
def constructSimilartyMatrixCosinePCA(self):
    # A simple k-nearest-neighbour graph based on cosine distance over the
    # PCA-projected data: keep the k nearest neighbours of each node, then
    # build the diagonal weight matrix D from the row sums
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        self.pwdis = pairwise_distances(self.allDataPCA, metric='cosine')
        self.D = np.zeros(self.pwdis.shape)
        for i in range(self.pwdis.shape[0]):
            # Keep only the k nearest neighbours of node i; zero the rest
            allnearestNeighbours = np.argsort(self.pwdis[i])
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = np.sum(self.pwdis[i])
        print('Now computing accuracy for cosine metric on PCA data')
        accs.append(self.labelPropogation())
    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using cosine PCA data')
    plt.savefig(pp, format='pdf')
    plt.close()
    # We now have the weight-matrix graph based on the cosine distance
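Every construct* method above ends by calling self.labelPropogation(), which is not part of this excerpt; a generic sketch of iterative label propagation over a graph built this way (the name, signature, and clamping scheme are assumptions) might look like:

def label_propagation_sketch(W, D, y_labeled, labeled_idx, n_classes, n_iter=100):
    # W: (n, n) affinity matrix with only k-NN edges kept, as built above
    # D: diagonal matrix of the row sums of W
    P = W / np.maximum(np.diag(D), 1e-12)[:, None]      # row-normalise: P = D^-1 W
    F = np.zeros((W.shape[0], n_classes))
    F[labeled_idx] = np.eye(n_classes)[y_labeled]       # one-hot seed labels
    for _ in range(n_iter):
        F = P @ F                                       # propagate labels along edges
        F[labeled_idx] = np.eye(n_classes)[y_labeled]   # clamp the labelled nodes
    return F.argmax(axis=1)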
def calc_cosine_dist(text_a, text_b):
    return pairwise_distances(text_a, text_b, metric='cosine')[0][0]
generate_sklearn_tfidf_sim.py source (project: kaggle-quora-solution-8th, author: qqgeogor)
def calc_cosine_dist(text_a, text_b):
    return pairwise_distances(text_a, text_b, metric='cosine')[0][0]
generate_selftrained_glove_sim_dist_diff.py source (project: kaggle-quora-solution-8th, author: qqgeogor)
def calc_cosine_dist(text_a, text_b, metric='euclidean'):
    # Despite the name, the default metric here is 'euclidean'; the 1-D inputs
    # are wrapped in lists because pairwise_distances expects 2-D arrays
    return pairwise_distances([text_a], [text_b], metric=metric)[0][0]
def predict_proba(self, X):
    """
    Return a matrix of probability-like scores for each sample belonging
    to each class.
    The matrix has shape = [n_samples, n_classes], where n_samples is the
    size of the first dimension of the input matrix X and n_classes is the
    number of classes as determined from the parameter 'y' obtained during
    training.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Prediction matrix, where n_samples is the number of samples and
        n_features is the number of features.
    """
    probabilities = np.zeros((X.shape[0], self.y.shape[1]), dtype=np.float64)
    distances = pairwise_distances(X, self.centroids_, metric=self.metric)
    # To get probability-like values, ensure that the closer the distance
    # is to zero, the closer the score is to 1
    if self.metric == 'cosine':
        distances = 1 - distances
    else:
        # in the euclidean case, normalize by the largest distance
        # to get a value between 0 and 1
        distances = 1 - (distances / distances.max())
    # map back onto a matrix containing all labels
    probabilities[:, self._mem_original_mapping] = distances
    return probabilities
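A toy illustration of the euclidean distance-to-score conversion above (the values are invented):

import numpy as np

d = np.array([[0.0, 2.0], [1.0, 3.0]])  # distances from 2 samples to 2 centroids
scores = 1 - d / d.max()                 # nearest centroid gets the highest score
print(scores)                            # [[1.     0.3333] [0.6667 0.    ]]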
def assign_to_closest(X, centers, metric='euclidean'):
    return np.argmin(pairwise_distances(X, centers, metric=metric), axis=1)
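A quick check of assign_to_closest (the data and centers below are made up):

import numpy as np

X = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [4.9, 5.0]])
centers = np.array([[0.0, 0.0], [5.0, 5.0]])
print(assign_to_closest(X, centers))  # -> [0 0 1 1]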
def sq_cdist(A, B): return pairwise_distances(A, B, 'sqeuclidean')
# Sets of inputs
def sq_cdist(A, B): return pairwise_distances(A, B, 'sqeuclidean')
# Sets of inputs defining sizes
def sort(self, word):
    '''
    Sort all vocabulary words by cosine distance to the input word,
    in ascending order (nearest first).
    '''
    assert word in self.dictionary
    i = self.dictionary[word]
    vec = self.final_embeddings[i].reshape(1, -1)
    # Calculate pairwise cosine distances and flatten to 1-D
    pdist = pairwise_distances(self.final_embeddings, vec, metric='cosine').ravel()
    return [self.reverse_dictionary[j] for j in pdist.argsort()]
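Assuming an instance exposing dictionary, reverse_dictionary, and final_embeddings as in the snippet (the model variable and query word are hypothetical), usage might look like:

neighbours = model.sort('king')
print(neighbours[:10])  # the 10 nearest words; 'king' itself comes first at distance 0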