import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

def correlations(A, B, pc_n=100):
    # Overall Pearson correlation between the flattened matrices
    p = 1 - distance.correlation(A.flatten(), B.flatten())
    spear = spearmanr(A.flatten(), B.flatten())
    # Mean per-row (gene) correlation
    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i], B[i])
    pg = np.average(dist_genes[np.isfinite(dist_genes)])
    # Mean per-column (sample) correlation
    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:, i], B[:, i])
    ps = np.average(dist_sample[np.isfinite(dist_sample)])
    # Absolute cosine similarity between corresponding left singular vectors
    pc_dist = []
    if pc_n > 0:
        u0, s0, vt0 = np.linalg.svd(A)
        u, s, vt = np.linalg.svd(B)
        for i in range(pc_n):
            pc_dist.append(abs(1 - distance.cosine(u0[:, i], u[:, i])))
        pc_dist = np.array(pc_dist)
    return p, spear[0], pg, ps, pc_dist
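A minimal usage sketch with hypothetical data, assuming the imports above: comparing a matrix against a noisy copy of itself, so every reported correlation should come out close to 1.

A = np.random.rand(50, 30)            # made-up demo matrices
B = A + 0.01 * np.random.rand(50, 30)
p, rho, pg, ps, pc_dist = correlations(A, B, pc_n=5)
print(p, rho, pg, ps, pc_dist.shape)  # all values near 1.0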
Example source code for Python's cosine()
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rho is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.
    """
    count = 0
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count += 1
                continue
            gold.append((words, sim))
            # Note: cosine() returns a distance, so rho is expected to be negative
            predicted.append((words, cosine(v1, v2)))
    simlist = lambda ws: [s for w, s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    print("Words not found in WordVector:", count)
    return (rho, len(gold))
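For illustration, a hedged toy call with made-up vectors and reference pairs; wv here is a plain dict, and 'plane' is deliberately out of vocabulary so one pair is skipped.

toy_wv = {'cat': np.array([1.0, 0.0]),
          'dog': np.array([0.9, 0.1]),
          'car': np.array([0.0, 1.0])}
toy_ref = [(('cat', 'dog'), 9.0), (('cat', 'car'), 1.0), (('cat', 'plane'), 5.0)]
rho, n = evaluate1Word(toy_wv, toy_ref)  # rho over the 2 in-vocabulary pairs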
# unsupervised_labels.py (project: NETL-Automatic-Topic-Labelling-, author: sb1992)
def get_best_label(label_list, num):
    topic_ls = get_topic_lg(topic_list[num])
    val_dict = {}
    for item in label_list:
        # Extract letter trigrams for the label and normalize the counts
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys()) | set(label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        # Cosine similarity between topic and label trigram profiles
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))
        val_dict[item] = val
    # Rank the labels by similarity, best first
    list_sorted = sorted(val_dict.items(), key=lambda x: x[1], reverse=True)
    return [i[0] for i in list_sorted[:int(args.num_unsup_labels)]]
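get_best_label depends on project globals (topic_list, args, get_topic_lg) and cannot run standalone. Below is a self-contained sketch of the same letter-trigram idea; trigram_cosine_sim is a hypothetical helper, assuming Counter, np, and cosine are imported as in the snippet above.

def trigram_cosine_sim(s1, s2):
    # Cosine similarity between normalized letter-trigram count profiles
    c1 = Counter(s1[i:i+3] for i in range(len(s1) - 2))
    c2 = Counter(s2[i:i+3] for i in range(len(s2) - 2))
    keys = sorted(set(c1) | set(c2))
    v1 = np.array([c1.get(k, 0) for k in keys], dtype=float)
    v2 = np.array([c2.get(k, 0) for k in keys], dtype=float)
    return 1 - cosine(v1 / v1.sum(), v2 / v2.sum())

print(trigram_cosine_sim('machine learning', 'deep learning'))  # moderate: shared 'learning' trigrams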
def texts_tfidf(ids, important_texts, citations_texts):
    '''
    Generates tf-idf vectors for each text; cosine similarity between
    the vectors is computed separately (see texts_similarity below).
    '''
    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1, 2),
                            min_df=2)
    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()
    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()
    return terms1, terms2, freqs1, freqs2
def texts_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, float)  # zeros, so skipped rows read as 0 rather than garbage
    for i in range(npapers):
        # If one of the vectors is nil, skip it
        if (freqs1[i].sum() == 0.0) or (freqs2[i].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1.getrow(i).toarray()[0])
        fmap2 = to_dict(terms2, freqs2.getrow(i).toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
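to_dict and to_same_dimension are referenced above but not shown in this excerpt; here are plausible reconstructions, hedged, consistent only with how they are called.

def to_dict(terms, freqs):
    # Assumed behavior: {term: freq} map, keeping nonzero entries only
    return {t: f for t, f in zip(terms, freqs) if f != 0.0}

def to_same_dimension(terms, fmap1, fmap2):
    # Assumed behavior: align both maps onto the merged term list as dense vectors
    v1 = np.array([fmap1.get(t, 0.0) for t in terms])
    v2 = np.array([fmap2.get(t, 0.0) for t in terms])
    return v1, v2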
def random_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, float)
    for i in range(npapers):
        # Pair two randomly chosen papers as a baseline
        a = random.randint(0, npapers - 1)
        b = random.randint(0, npapers - 1)
        # If one of the vectors is nil, skip it
        if (freqs1[a].sum() == 0.0) or (freqs2[b].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1[a].toarray()[0])
        fmap2 = to_dict(terms2, freqs2[b].toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
def sanity_check(test_emb, train_emb, num_test):
    '''
    Sanity check on the cosine similarity calculations:
    finds the closest vector in the space by brute force.
    '''
    correct_list = []
    for i in range(num_test):
        smallest_norm = np.inf
        index = 0
        for j in range(len(train_emb)):
            norm = np.linalg.norm(train_emb[j] - test_emb[i])
            if norm < smallest_norm:
                smallest_norm = norm
                index = j
        correct_list.append(index)
    # Pad the list to make it the same length as test_emb
    for i in range(len(test_emb) - num_test):
        correct_list.append(-1)
    return correct_list
def token_similarity(self, words, rwords):
    words = set(words)
    rwords = set(rwords)
    word_vec = np.zeros(self.word_dim)
    rword_vec = np.zeros(self.word_dim)
    word_count = 0
    rword_count = 0
    # Average the embeddings of the (non-stopword) tokens on each side
    for word in words:
        if word in self.word_vec and word not in self.stopwords:
            word_vec += self.word_vec[word]
            word_count += 1
    for word in rwords:
        if word in self.word_vec:
            rword_vec += self.word_vec[word]
            rword_count += 1
    if word_count > 0:
        word_vec = word_vec / word_count
    if rword_count > 0:
        rword_vec = rword_vec / rword_count
    if word_count > 0 and rword_count > 0:
        # Note: cosine() is a distance, so smaller values mean more similar
        return cosine(word_vec, rword_vec)
    else:
        return 1
def nearest_words(self, word, top=20, display=False):
    """
    Find the nearest words to the word
    according to the cosine similarity.
    """
    W = self.W / np.linalg.norm(self.W, axis=0)
    if type(word) == str:
        vec = self.word_vector(word, W)
    else:
        vec = word / np.linalg.norm(word)
    cosines = (vec.T).dot(W)
    args = np.argsort(cosines)[::-1]
    nws = []
    for i in range(1, top + 1):  # start at 1 to skip the query word itself
        nws.append(self.inv_vocab[args[i]])
        if display:
            print(self.inv_vocab[args[i]], round(cosines[args[i]], 3))
    return nws
def argmax_fun(W, indices, argmax_type='levi'):
    """
    Analogy prediction over the column vectors of W; indices = (a, a*, b).
    cosine: b* = argmax cos(b*, b - a + a*)
    levi:   b* = argmax cos(b*, a*) * cos(b*, b) / (cos(b*, a) + eps)
    """
    if argmax_type == 'levi':
        W = W / np.linalg.norm(W, axis=0)
        words3 = W[:, indices]
        cosines = ((words3.T).dot(W) + 1) / 2
        obj = (cosines[1] * cosines[2]) / (cosines[0] + 1e-3)
        pred_idx = np.argmax(obj)
    elif argmax_type == 'cosine':
        words3_vec = W[:, indices].sum(axis=1) - 2 * W[:, indices[0]]  # b - a + a*
        W = W / np.linalg.norm(W, axis=0)
        words3_vec = words3_vec / np.linalg.norm(words3_vec)
        cosines = (words3_vec.T).dot(W)
        pred_idx = np.argmax(cosines)
    return pred_idx
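A worked toy analogy, with made-up embedding columns ordered [king, man, woman, queen]; solving man : king :: woman : ? should pick the 'queen' column.

W_toy = np.array([[0.9, 0.1, 0.1, 0.9],
                  [0.1, 0.9, 0.0, 0.0],
                  [0.0, 0.0, 0.9, 0.9]])
# indices ordered (a='man', a*='king', b='woman'), per the docstring above
print(argmax_fun(W_toy, [1, 0, 2], argmax_type='cosine'))  # 3, i.e. 'queen'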
def synonyms_by_synset(self, synset_name, topn=3):
    ssid = self.id_table[synset_name]
    doc = self.doc_matrix[ssid]
    found_indices = set([ssid])
    synonyms = []
    for _ in range(topn):
        # Linear scan for the nearest not-yet-selected row by cosine distance
        min_index = 0
        min_val = 10
        for i in range(self.doc_matrix.shape[0]):
            cos_dist = cosine(self.doc_matrix[i], doc)
            if i not in found_indices and cos_dist < min_val:
                min_index = i
                min_val = cos_dist
        found_indices.add(min_index)
        synonyms.append((self.definitions[min_index], min_val))
    return synonyms
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.
    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.asarray(sims), norm='l2')
        sils = 1 - sims
    elif method == 'dict_cosine':  # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i, {}), scores.get(j, {})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS':  # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i, {}), scores.get(j, {}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = next(iter(scores.values())).shape[0]
        sils = np.array([[cosine(scores.get(i, np.zeros(d)), scores.get(j, np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s\n' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
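dict_cosine_dist and dict_js_divergence are not included in this excerpt. A hedged reconstruction of the former, consistent with the {word: score} maps it receives (the all-empty and all-zero cases fall through to np.nan_to_num above):

def dict_cosine_dist(d1, d2):
    # Assumed behavior: cosine distance between two sparse {key: score}
    # maps, computed over the union of their keys.
    keys = sorted(set(d1) | set(d2))
    if not keys:
        return 1.0
    v1 = np.array([d1.get(k, 0.0) for k in keys])
    v2 = np.array([d2.get(k, 0.0) for k in keys])
    return cosine(v1, v2)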
def get_sentiment_sim(context_seqs, gen_seqs):
    '''Return the cosine similarity between the sentiment scores of each context
    and the corresponding generated sequences; the sentiment scores are given in spacy.'''
    gen_seqs = check_seqs_format(gen_seqs)
    emotion_types = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD']
    gen_sentiment_sim_scores = []
    for context_seq, gen_seqs_ in zip(context_seqs, gen_seqs):
        context_sentiment = lexicon_methods.emotional_valence(encoder(context_seq))
        # Add a tiny constant to avoid NaN when all scores are 0
        context_sentiment = numpy.array([context_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8
        sentiment_sim_scores = []
        for gen_seq in gen_seqs_:
            gen_sentiment = lexicon_methods.emotional_valence(encoder(gen_seq))
            gen_sentiment = numpy.array([gen_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8
            sentiment_sim = 1 - cosine(context_sentiment, gen_sentiment)
            sentiment_sim_scores.append(sentiment_sim)
        gen_sentiment_sim_scores.append(sentiment_sim_scores)
    gen_sentiment_sim_scores = numpy.array(gen_sentiment_sim_scores)
    return {'sentiment_sim_scores': gen_sentiment_sim_scores,
            'mean_sentiment_sim_scores': numpy.mean(gen_sentiment_sim_scores)}
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)
    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
def find_similar_words(wordvecs):
    """ Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    from sklearn.metrics import pairwise_distances

    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine')  # or metric='euclidean'
    id2word = {}
    for key, value in wordvecs.word_idx_map.items():
        assert value not in id2word
        id2word[value] = key
    while True:
        word = input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print('%s not in the vocabulary.' % word)
            continue  # w_id is undefined here, so skip to the next word
        sim_w_id = pairwise_sim_mat[w_id - 1].argsort()[-10:][::-1]
        print(' '.join(id2word[i + 1] for i in sim_w_id))
def generate_answer(self, msg_text, chat_id):
    # [best distance so far, index of the best-matching message]
    minimum_index = [1 - (10 ** (-5)), -1]
    if chat_id in self.vectorizer:
        t = self.vectorizer[chat_id].transform([msg_text]).toarray()[0]
    else:
        return ""
    for i, t2 in enumerate(self.mat[chat_id].toarray()):
        w = cosine(t, t2)
        if abs(w) <= minimum_index[0]:
            if minimum_index[0] == abs(w):  # equal weight, take the longer message
                if len(self.speech[chat_id][0][i]) > len(self.speech[chat_id][0][minimum_index[1]]):
                    minimum_index[1] = i
            else:  # not equal, take the lower weight
                minimum_index[0] = w
                minimum_index[1] = i
    if minimum_index[1] == -1 or minimum_index[0] > 0.85:  # no message found or score too bad
        return ""
    from_sent_id = self.speech[chat_id][1][minimum_index[1]]
    # Reply with the next message in the log that came from a different sender
    for i in range(1, 5):
        try:
            if from_sent_id != self.speech[chat_id][1][minimum_index[1] + i]:
                return self.speech[chat_id][0][minimum_index[1] + i]
        except IndexError:
            return ""
    return ""
def most_similar(self, word, num_similar=5):
    idx = self._w2idx[word]
    y = list(range(self._matrix.shape[0]))
    y.pop(idx)  # exclude the query word itself
    # (distance, index) pairs, kept sorted so [-1] is always the current worst
    most_similar = [(1, 0)] * num_similar
    for i in y:
        dist = cosine(self._matrix[idx], self._matrix[i])
        if dist < most_similar[-1][0]:
            most_similar.pop()
            most_similar.append((dist, i))
            most_similar.sort()  # re-sort so the comparison above stays valid
    most_similar = [(distance, self._idx2w[i]) for (distance, i) in most_similar]
    return most_similar
def all_col_dist(m):
    # Pairwise cosine distances between all columns of m
    D = m.shape[1]
    d = np.zeros((D, D))
    for i in range(D):
        div = m[:, i]
        for j in range(D):
            djv = m[:, j]
            d[j][i] = cosine(div, djv)
    return d
def choose_best_action(self, list_of_words):
    min_distance = 3
    best_matching_action = None
    tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
    current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
    for action, centroid in tf_idf_shelve[CENTROID].items():
        distance = cosine(centroid, current_sentence_centroid)
        print(action, distance)
        if distance <= min_distance:
            min_distance = distance
            best_matching_action = action
    tf_idf_shelve.close()
    return current_sentence_centroid, best_matching_action, min_distance
def calAvgSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    # AvgSimC: expected cosine similarity over all sense pairs,
    # weighted by each word's sense probabilities
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        thisCos = []
        p1 = senseScore1[t]
        p2 = senseScore2[t]
        for i in range(len(senseVec1[t])):
            for j in range(len(senseVec2[t])):
                thisCos.append((1 - cosine(senseVec1[t][i], senseVec2[t][j])) * p1[i] * p2[j])
        avgCos.append(np.sum(thisCos))
    return spearmanr(test_score, avgCos)[0]
def calMaxSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    # MaxSimC: cosine similarity between each word's single most probable sense
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        i = np.argmax(senseScore1[t])
        j = np.argmax(senseScore2[t])
        thisCos = 1 - cosine(senseVec1[t][i], senseVec2[t][j])
        avgCos.append(thisCos)
    return spearmanr(test_score, avgCos)[0]
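A hedged toy check of both metrics, with two test items and made-up two-sense vectors; in this construction both metrics rank the items in the gold order, so each call returns rho = 1.0.

sv_a = [np.array([[1.0, 0.0], [0.0, 1.0]]), np.array([[1.0, 0.0], [0.0, 1.0]])]
sv_b = [np.array([[1.0, 0.0], [0.0, 1.0]]), np.array([[0.0, 1.0], [1.0, 0.0]])]
pr = [np.array([0.9, 0.1]), np.array([0.5, 0.5])]   # hypothetical sense probabilities
gold = [2.0, 1.0]
print(calAvgSimC(gold, sv_a, pr, sv_b, pr))  # 1.0
print(calMaxSimC(gold, sv_a, pr, sv_b, pr))  # 1.0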
# sentence-similarity.py (project: visually-grounded-speech, author: gchrupala)
def cosine_similarity(a, b):
    # Returns the cosine similarity between a and b
    return 1.0 - cosine(a, b)
def cosine_similarities(a, b, transform):
    """
    Returns a list of cosine similarities between lists of vectors
    a and b. The z_score transformation is applied if transform == True.
    """
    a = numpy.stack(a)
    b = numpy.stack(b)
    # Transform if requested
    if transform:
        print("transforming")
        # z_score is written to apply the same scale to a and b
        a, b = z_score(a, b)
    print("calculating cosine dists")
    cos = [cosine_similarity(a[i], b[i]) for i in range(len(a))]
    return cos
def delta(u, v):
    """ cosine ° sigmoid
    >>> delta([0.2], [0.3])
    0.5
    >>> delta([0.3], [0.2])
    0.5
    >>> delta([0.1,0.9], [-0.9,0.1]) == delta([-0.9,0.1], [0.1,0.9])
    True
    """
    # TODO scale with a and c
    return expit(cosine(u, v))
def reduncy(sen_vec, doc_vec):
    # Redundancy (sic): similarity of the sentence to the rest of the document
    return 1 - cosine(sen_vec, (doc_vec - sen_vec))

def relavence(sen_vec, doc_vec):
    # Relevance (sic): similarity of the sentence to the whole document
    return 1 - cosine(sen_vec, doc_vec)
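These two scores read like the ingredients of an MMR-style sentence selector. A hedged sketch of how they might be combined; mmr_score and the lambda weighting are illustrative, not from the original project.

def mmr_score(sen_vec, doc_vec, lam=0.7):
    # Hypothetical trade-off: reward relevance to the whole document,
    # penalize redundancy with the rest of it (lam is a made-up default)
    return lam * relavence(sen_vec, doc_vec) - (1 - lam) * reduncy(sen_vec, doc_vec)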
def compute_distance(query_channel, channel, mean_vec, distance_type='eucos'):
    """ Compute the specified distance type between channels of the mean vector and the query image.
    In the caffe library, the FC8 layer consists of 10 channels. Here, we compute the distance
    of each channel (from the query image) to the respective channel of the
    Mean Activation Vector. In the paper, we considered the hybrid distance eucos, which
    combines euclidean and cosine distance for bounding open space. Alternatively,
    other distances such as euclidean or cosine can also be used.
    Input:
    --------
    query_channel: particular FC8 channel of the query image
    channel: channel number under consideration
    mean_vec: mean activation vector
    Output:
    --------
    query_distance: distance between the respective channels
    """
    query_distance = None  # stays None for unknown distance types
    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200. + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        print("distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance
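A minimal usage sketch with made-up shapes, assuming scipy.spatial.distance is imported as spd (as the function body implies) and numpy as np.

mean_vec = np.random.rand(10, 5)   # hypothetical: 10 channels x 5 activations each
query_channel = np.random.rand(5)
print(compute_distance(query_channel, 3, mean_vec))            # eucos (default)
print(compute_distance(query_channel, 3, mean_vec, 'cosine'))  # cosine only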
def cmp_vectors(v1, v2):
    # c = cosine(normed(v1), normed(v2))
    # c = cosine(v1, v2)
    c = v1 @ v2
    return c
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        # Whitening should be implemented in wvlib to support use together
        # with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)
    return words, vectors, wv, options