import numpy

def make_dist(vectors, options):
    if options.metric != 'cosine':
        # 'metrics' is a name -> distance-function mapping defined elsewhere in the module.
        return vectors, metrics[options.metric]
    else:
        # Normalize once up front so cosine distance reduces to 1 - dot(u, v).
        vectors = [v / numpy.linalg.norm(v) for v in vectors]
        return vectors, lambda u, v: 1 - numpy.dot(u, v)
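A minimal usage sketch with hypothetical data (SimpleNamespace stands in for the caller's real options object, and the metrics table is a one-entry stand-in): after normalizing once, the returned lambda agrees with scipy's cosine distance.

from types import SimpleNamespace
import numpy
from scipy.spatial.distance import cosine

metrics = {'euclidean': lambda u, v: numpy.linalg.norm(u - v)}
vecs = [numpy.array([3.0, 4.0]), numpy.array([1.0, 0.0])]
vecs, dist = make_dist(vecs, SimpleNamespace(metric='cosine'))
print(dist(vecs[0], vecs[1]))          # 0.4
print(cosine([3.0, 4.0], [1.0, 0.0]))  # 0.4, same value from scipy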
Python cosine() usage examples from open-source projects
def cosine(v1, v2):
    # Despite the name, this returns the cosine *similarity*, not the distance.
    return numpy.dot(v1 / numpy.linalg.norm(v1), v2 / numpy.linalg.norm(v2))
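A quick sanity check with made-up vectors: parallel vectors score 1.0, orthogonal vectors 0.0.

import numpy
print(cosine(numpy.array([1.0, 0.0]), numpy.array([2.0, 0.0])))  # 1.0 (same direction)
print(cosine(numpy.array([1.0, 0.0]), numpy.array([0.0, 5.0])))  # 0.0 (orthogonal)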
train_svm_model.py (project: NETL-Automatic-Topic-Labelling-, author: sb1992)
def get_lt_ranks(lab_list, num):
    topic_ls = get_topic_lt(topic_list[num])  # Letter-trigram distribution for the topic terms.
    val_list = []
    final_list = []
    for item in lab_list:
        trigrams = [item[i:i + 3] for i in range(0, len(item) - 2)]  # Letter trigrams for the candidate label.
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total  # Normalize counts to a probability distribution.
        tot_keys = list(set(topic_ls) | set(label_cnt))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            listtopic.append(topic_ls.get(elem, 0.0))
            listlabel.append(label_cnt.get(elem, 0.0))
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))  # Cosine similarity (scipy's cosine is a distance).
        val_list.append((item, val))
    rank_val = [i[1] for i in val_list]
    arr = np.array(rank_val)
    order = arr.argsort()
    ranks = order.argsort()  # Rank of each candidate by its similarity score.
    for i, elem in enumerate(val_list):
        final_list.append((elem[0], ranks[i], int(num)))
    return final_list
# Generates letter trigram feature
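For intuition, a self-contained sketch of the same letter-trigram comparison on made-up strings (trigram_dist is a hypothetical helper mirroring the normalization loop above):

from collections import Counter
import numpy as np
from scipy.spatial.distance import cosine

def trigram_dist(s):
    cnt = Counter(s[i:i + 3] for i in range(len(s) - 2))
    total = sum(cnt.values(), 0.0)
    return {k: v / total for k, v in cnt.items()}

a, b = trigram_dist('neural network'), trigram_dist('neural networks')
keys = sorted(set(a) | set(b))
va = np.array([a.get(k, 0.0) for k in keys])
vb = np.array([b.get(k, 0.0) for k in keys])
print(1 - cosine(va, vb))  # close to 1.0 for near-identical strings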
supervised_labels.py (project: NETL-Automatic-Topic-Labelling-, author: sb1992)
This file's get_lt_ranks is identical to the train_svm_model.py version shown above, apart from comment wording: it builds letter-trigram distributions for the topic and each candidate label, scores candidates by cosine similarity, and returns the ranked list as the letter-trigram feature.
def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):
    if method == 'cosine':
        minim = []  # (distance, index, vector) triples
        for i, v in enumerate(df):
            # Skip the base words; they are usually the closest.
            if id_to_word[i] in skip_words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i, v))
        minim = sorted(minim, key=lambda t: t[0])
        # Return a list of (word, cosine distance, vector) tuples.
        return [(id_to_word[minim[i][1]], minim[i][0], minim[i][2]) for i in range(num_results)]
    else:
        raise ValueError('{} is not an accepted method'.format(method))
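A toy invocation (hypothetical three-word vocabulary; df stands in for whatever matrix of embedding rows the caller loaded):

import numpy as np
from scipy.spatial.distance import cosine

id_to_word = {0: 'king', 1: 'queen', 2: 'apple'}
df = np.array([[0.9, 0.1], [0.8, 0.2], [0.1, 0.9]])  # toy embedding rows
query = np.array([0.85, 0.15])
print(find_nearest({'king'}, query, id_to_word, df, num_results=2))
# -> [('queen', <small distance>, ...), ('apple', <large distance>, ...)]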
def turn(gs, word_to_id, id_to_word, df, soft_score):
    gs['turn_number'] += 1
    names = list(gs['players'].keys())
    current_player = names[(gs['turn_number'] % len(names) - 1)]
    while True:
        expr = input('{}, please enter a word expression:\n> '.format(current_player))
        try:
            vec, skip_words = eval_expression(expr, word_to_id, id_to_word, df)
        except Exception as err:
            print(err)
            continue
        break
    answers = {}
    for name in gs['players']:
        while True:
            word = input('{}, please enter your answer: '.format(name))
            if word in word_to_id:
                answers[name] = df[word_to_id[word]]
                break
            else:
                print('{} is not in the dataset, please try another word.'.format(word))
    answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
    # Transform answers from vectors to cosine distances from the computer's answer.
    for k, v in answers.items():
        answers[k] = cosine(v, answer_vec)
    winner = min(answers, key=answers.get)  # smallest distance wins
    if not soft_score:
        gs['players'][winner] += 1
    else:
        for name in answers:
            gs['players'][name] += round(answers[name], 2)
    print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
    print('{} wins this round.'.format(colored(winner, 'green')))
    print_standings(gs)
def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):
    if method == 'cosine':
        minim = []  # (distance, index) pairs
        for i, v in enumerate(df):
            # Skip the base words; they are usually the closest.
            if id_to_word[i] in words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i))
        minim = sorted(minim, key=lambda t: t[0])
        # Return a list of (word, cosine distance) tuples.
        return [(id_to_word[minim[i][1]], minim[i][0]) for i in range(num_results)]
    else:
        raise ValueError('{} is not an accepted method'.format(method))
def find_similar_words_by_vector(self, vector: np.ndarray, n: int = 10):
    vocabulary = self._vocabulary
    # Sort the whole vocabulary by cosine distance to the query and keep the n closest.
    similar_ids = sorted(range(0, vocabulary.size),
                         key=lambda id: cosine(self._vectors[id], vector))[:n]
    return [vocabulary.to_word(id) for id in similar_ids]
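Sorting the full vocabulary with a Python-level key makes one scipy call per comparison; a vectorized variant (a sketch with the same assumed vocabulary.to_word interface, taking the vector matrix directly) computes all distances in one matrix product:

import numpy as np

def find_similar_words_fast(vectors, vocabulary, vector, n=10):
    # Cosine distance against every row at once: 1 - (V @ q) / (||V_rows|| * ||q||).
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(vector)
    dists = 1 - (vectors @ vector) / norms
    return [vocabulary.to_word(int(i)) for i in np.argsort(dists)[:n]]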
def computeDistance(X, Y, method):
    if method == 'cosine':
        dist = spdistance.cosine(X, Y)
    else:
        raise ValueError('unsupported distance method: {}'.format(method))
    if dist < 0:
        print('WARNING: distance between X {} and Y {} = {} < 0, method: '
              '{}'.format(X, Y, dist, method))
    return dist
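Example calls with made-up vectors (spdistance is assumed to be scipy.spatial.distance; a negative result can only arise from floating-point error):

import numpy as np
from scipy.spatial import distance as spdistance

print(computeDistance(np.array([1.0, 2.0]), np.array([2.0, 4.0]), 'cosine'))  # ~0.0, parallel
print(computeDistance(np.array([1.0, 0.0]), np.array([0.0, 1.0]), 'cosine'))  # 1.0, orthogonal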
def runNN(descriptors, labels, parallel, nprocs):
    """
    Compute nearest neighbors from the given descriptors and labels.
    """
    distance_method = {"cosine": 'cosine'}
    ret_matrix = None
    for name, method in distance_method.items():
        dist_matrix = computeDistances(descriptors, method,
                                       parallel, nprocs)
        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix
    return ret_matrix
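computeDistances is defined elsewhere in the project; a minimal serial stand-in (my sketch, ignoring the parallel/nprocs options) could use scipy's pairwise-distance helpers:

from scipy.spatial.distance import pdist, squareform

def computeDistances(descriptors, method, parallel=False, nprocs=1):
    # Full pairwise distance matrix; 'method' is any metric name pdist accepts, e.g. 'cosine'.
    return squareform(pdist(descriptors, metric=method))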
def compare_tweet_with_storage(tweet, storage=None, bow=False):
    if storage is None:
        if not os.path.isfile(os.path.join(config.data_folder, config.model_file)):
            raise FileNotFoundError('Model was not found!')
        else:
            storage = pickle.load(open(os.path.join(config.data_folder, config.model_file), 'rb'))
    print(tweet)
    transformed_tweet = transform_tweet(tweet, bow)
    print([x[0] for x in transformed_tweet], [np.sum(y) for y in (x[2] for x in transformed_tweet)])
    scores = {}
    for i, (entity, entity_type, vector_array) in enumerate(transformed_tweet):
        temp_score = 0.0
        for j, (tweetid, item) in enumerate(storage[storage['Entity'] == entity].iterrows()):
            if bow:
                # Bag-of-clusters overlap score.
                clusterids = np.unique(list(vector_array.keys()) + list(item['Vector array'].keys()))
                vector1 = np.zeros([len(clusterids)])
                vector2 = np.zeros([len(clusterids)])
                for k, cid in enumerate(clusterids):
                    vector1[k] = vector_array.get(cid, 0)
                    vector2[k] = item['Vector array'].get(cid, 0)
                temp_score = np.max([1.0 * np.sum(np.logical_and(vector1, vector2)) / np.min([np.sum(vector1), np.sum(vector2)]), temp_score])
            else:
                if SPLIT:
                    # Average cosine similarity over the three vector parts, skipping NaNs.
                    result = [1 - cosine(vector_array[x], item['Vector array'][x]) for x in range(3)]
                    isnan = np.isnan(result)
                    res = 0.0
                    for v in range(3):
                        if not isnan[v]:
                            res += result[v]
                    res = 1.0 * res / (np.sum(isnan == False) + 10 ** (-10))
                    temp_score = np.max([res, temp_score])
                else:
                    temp_score = np.max([1 - cosine(vector_array, item['Vector array']), temp_score])
                    print(1 - cosine(vector_array, item['Vector array']), entity, tweet, str(tweetid))
        scores.update({entity: temp_score})
    return combine_scores(scores)
generate_pretrained_glove_sim_dist_diff_idf.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def calc_glove_sim(row, embedder, idf_dict):
    '''
    Calculate GloVe similarity and the difference of the IDF-weighted
    centroids of question1/question2.
    '''
    a2 = [x for x in remove_punctuation(row['question1']).lower().split() if x in embedder]
    b2 = [x for x in remove_punctuation(row['question2']).lower().split() if x in embedder]
    if len(a2) == 0 or len(b2) == 0:
        # No in-vocabulary tokens on one side; return sentinel values.
        return (-1, -1, np.zeros(300))
    vectorA = np.zeros(300)
    for w in a2:
        coef = idf_dict.get(w, idf_dict['default_idf'])
        vectorA += coef * embedder[w]
    vectorA /= len(a2)
    vectorB = np.zeros(300)
    for w in b2:
        coef = idf_dict.get(w, idf_dict['default_idf'])
        vectorB += coef * embedder[w]
    vectorB /= len(b2)
    vector_diff = (vectorA - vectorB)
    glove_sim = cosine(vectorA, vectorB)  # scipy's cosine distance, despite the name
    glove_vdiff_dist = np.sqrt(np.sum(vector_diff ** 2))
    return (glove_sim, glove_vdiff_dist, vector_diff)
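A toy invocation under stated assumptions: the embedder is a hypothetical dict of random 300-d vectors rather than real GloVe embeddings, and remove_punctuation is stubbed as the identity.

import numpy as np
from scipy.spatial.distance import cosine

def remove_punctuation(s):  # stub for this sketch; the project strips punctuation here
    return s

rng = np.random.default_rng(0)
embedder = {w: rng.normal(size=300) for w in ['what', 'is', 'python', 'java']}
idf_dict = {'python': 3.2, 'java': 3.1, 'default_idf': 1.0}
row = {'question1': 'What is Python', 'question2': 'What is Java'}
sim, diff_dist, diff_vec = calc_glove_sim(row, embedder, idf_dict)
print(sim, diff_dist)  # cosine distance between centroids, and their Euclidean gap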
check_similar_sentence.py (project: sentence_similarity, author: MorinoseiMorizo)
def cosine_similarity(a, b):
    # Note: dis.cosine is scipy's cosine *distance* (1 - similarity), so this
    # wrapper's name is misleading; smaller values mean more similar.
    return dis.cosine(a, b)
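If an actual similarity is wanted, the conversion is a single subtraction:

from scipy.spatial import distance as dis
import numpy as np

a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0])
print(dis.cosine(a, b))      # 0.0: distance says "identical direction"
print(1 - dis.cosine(a, b))  # 1.0: actual cosine similarity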
eval.py (project: Multi-view-neural-acoustic-words-embeddings, author: opheadacheh)
def acous_text_eval(m, sess, data, lengths, text_data, text_lengths, matches, config):
    # Embed the acoustic inputs batch by batch.
    embeddings = []
    now = 0
    while now < len(data):
        embedding = sess.run(m.final_state, {m.input_x1: data[now: now + config.eval_batch_size],
                                             m.input_x1_lengths: lengths[now: now + config.eval_batch_size]})
        embeddings.append(embedding)
        now += config.eval_batch_size
    X = np.vstack(embeddings)
    # Embed the corresponding text inputs the same way.
    text_embeddings = []
    now = 0
    while now < len(data):
        text_embedding = sess.run(m.word_state, {m.input_c1: text_data[now: now + config.eval_batch_size],
                                                 m.input_c1_lengths: text_lengths[now: now + config.eval_batch_size]})
        text_embeddings.append(text_embedding)
        now += config.eval_batch_size
    Y = np.vstack(text_embeddings)
    # Cross-view cosine distances for every acoustic/text pair with i < j.
    distances = []
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            distances.append(cosine(X[i], Y[j]))
    distances = np.asarray(distances)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    print("Average precision:", ap)
    print("Precision-recall breakeven:", prb)
    return ap
def sem_clust(self, w2p, simsdict):
    ''' Baseline SEMCLUST method (dynamic thresholding), based on:
    Marianna Apidianaki, Emilia Verzeni, and Diana McCarthy. Semantic
    Clustering of Pivot Paraphrases. In LREC 2014.
    Builds a graph where nodes are words, and edges connect words that
    have a connection in <w2p>. Weights edges by the values given in
    <simsdict>.
    :param w2p: word -> {paraphrase: score} dictionary, used to decide which nodes to connect with edges
    :param simsdict: word -> {paraphrase: score} OR word -> vector, used for edge weights
    :return:
    '''
    self.reset_sense_clustering()
    wordlist = list(self.pp_dict.keys())
    oov = [w for w in wordlist if w not in w2p or w not in simsdict]
    if len(oov) > 0:
        sys.stderr.write('WARNING: Paraphrases %s are OOV. '
                         'Removing from ppset.\n' % str(oov))
        wordlist = list(set(wordlist) - set(oov))
    if len(wordlist) == 1:
        self.add_sense_cluster([wordlist[0]])
        return
    # Using cosine similarity of word-paraphrase vectors:
    if not isinstance(next(iter(simsdict.values())), dict):
        similarities = np.array([[1 - cosine(simsdict[i], simsdict[j])
                                  for j in wordlist] for i in wordlist])
    else:
        similarities = np.array([[(1 - dict_cosine_dist(simsdict[i], simsdict[j]))
                                  for j in wordlist] for i in wordlist])
    gr = sem_clust.toGraph(similarities, wordlist, self.target_word, w2p)
    for c in nx.connected_components(gr):
        self.add_sense_cluster(c)
def dict_cosine_dist(u, v):
    # Cosine distance between two sparse {feature: value} dictionaries.
    features = sorted(set(u.keys()) | set(v.keys()))
    uvec = np.array([u.get(f, 0.0) for f in features])
    vvec = np.array([v.get(f, 0.0) for f in features])
    return cosine(uvec, vvec)
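For example, with two made-up sparse feature dictionaries:

import numpy as np
from scipy.spatial.distance import cosine

u = {'cat': 1.0, 'dog': 2.0}
v = {'dog': 2.0, 'fish': 1.0}
print(dict_cosine_dist(u, v))  # ~0.2, since the vectors overlap only on 'dog'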
def get_similarity(self, w1, w2):
    # Out-of-vocabulary words get a sentinel similarity of -0.5.
    if w1 not in self.wv or w2 not in self.wv:
        return -0.5
    sim = 1.0 - cos_dist(self.wv[w1], self.wv[w2])
    return sim
def predict(self, seq1, seq2, pred_method='multiply', unigram_probs=None):
    '''Right now this function only handles getting the probability for one sequence pair.'''
    if self.flat_input:
        if self.embedded_input:
            seq1 = seq1[None]
        else:
            seq1 = get_vector_batch([seq1], vector_length=self.lexicon_size + 1)
    else:
        seq1 = get_seq_batch([seq1], max_length=self.n_timesteps)
    probs = self.model.predict_on_batch(seq1)[0]
    if self.flat_output:
        if unigram_probs is not None:
            probs = probs / unigram_probs ** 0.66
            probs[numpy.isinf(probs)] = 0.0  # replace inf from division by zero-probability unigrams
        seq2 = get_vector_batch([seq2], vector_length=self.lexicon_size + 1)
        # prob = 1 - cosine(seq2, probs)  # unused alternative: score by cosine similarity
        probs = probs[seq2[0].astype('bool')]
    else:
        seq2 = get_seq_batch([seq2], padding='post', max_length=self.n_timesteps)
        probs = probs[numpy.arange(self.n_timesteps), seq2]
        probs = probs[seq2 > 0]
    if pred_method == 'multiply':
        prob = numpy.sum(numpy.log(probs))  # sum of log probs = log of the product
    elif pred_method == 'mean':
        prob = numpy.mean(numpy.log(probs))
    elif pred_method == 'last':
        prob = numpy.log(probs[-1])
    elif pred_method == 'max':
        prob = numpy.log(numpy.max(probs))
    else:
        raise ValueError('unknown pred_method: {}'.format(pred_method))
    return prob
def predict(self, seq1, seq2):
    seq1 = seq1 + 1e-8
    seq2 = seq2 + 1e-8  # smooth to avoid NaN
    score = 1 - cosine(seq1, seq2)
    return score
def get_word2vec_sim(context_seq, gen_seq):
    '''Return the word2vec cosine similarity between the context and a generated sequence
    (where the word2vec representation for a sequence is just the average of its word vectors).'''
    word_pairs = get_word_pairs(context_seq, gen_seq)
    if word_pairs:
        pair_scores = [similarity.word2vec(encoder(word1), encoder(word2)) for word1, word2 in word_pairs]
    else:
        # No word pairs between context and generated sequence
        # (e.g. the generated sequence might be punctuation only).
        pair_scores = [0]
    word2vec_sim = numpy.mean(pair_scores)
    return word2vec_sim