Python cosine_similarity() usage examples (source code)
def compare_pic(self, feature1, feature2):
    # Cosine *distance* matrix between the two feature sets
    # (pw is sklearn.metrics.pairwise); similarity would be 1 - distance.
    predicts = pw.pairwise_distances(feature2, feature1, metric='cosine')
    # predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
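A minimal usage sketch for compare_pic, assuming `pw` is `sklearn.metrics.pairwise` and the features are row vectors (one row per image):

import numpy as np
import sklearn.metrics.pairwise as pw

gallery = np.random.rand(4, 128)  # 4 gallery images, 128-d features
probe = np.random.rand(1, 128)    # 1 probe image

# Same call the method makes: cosine *distances*, shape (1, 4).
dists = pw.pairwise_distances(probe, gallery, metric='cosine')
print(1.0 - dists)  # convert distances to cosine similarities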
def cosine(x1, x2):
    # Compute the cosine similarity between two (already aligned) vectors.
    # new_x1, new_x2 = common(x1, x2)  # optional: restrict to common ratings
    num = x1.dot(x2)
    denom = sqrt(x1.dot(x1) * x2.dot(x2))
    try:
        return float(num) / denom
    except ZeroDivisionError:
        # at least one of the vectors is all zeros
        return 0
    # return cosine_similarity(x1, x2)[0][0]
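For example, with plain numpy arrays (sqrt comes from the math module):

import numpy as np
from math import sqrt

x1 = np.array([1.0, 2.0, 3.0])
x2 = np.array([1.0, 0.0, 3.0])
print(cosine(x1, x2))           # ~0.845
print(cosine(x1, np.zeros(3)))  # 0 (zero vector handled by the except branch)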
def vec_cos_sim(token_input, operation_input):
    # operation_input looks like "<ref_vector><operator><threshold>",
    # e.g. "0.6,0.8,0>=0.5"; two-character operators are tried first so
    # that ">=" is not mistaken for ">".
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            parts = operation_input.split(opr_sign)
            ref_vector_string = parts[0]
            cond_value_string = parts[1]
            operation_string = opr_sign
            break
    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            sim = cosine_similarity(token_vector, ref_vector)
            if operation_string in ("=", "=="):
                return sim == cond_value
            elif operation_string == "<":
                return sim < cond_value
            elif operation_string == ">":
                return sim > cond_value
            elif operation_string == ">=":
                return sim >= cond_value
            elif operation_string == "<=":
                return sim <= cond_value
            elif operation_string in ("!=", "<>"):
                return sim != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
        return False
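A hedged usage sketch; change_string_to_vector is not shown in the snippet, so a plausible comma-separated parser and a scalar cosine_similarity are stubbed in here:

import numpy as np

def change_string_to_vector(s):  # hypothetical stand-in for the real helper
    return np.array([float(v) for v in s.split(',')])

def cosine_similarity(a, b):     # scalar version, as the comparisons above expect
    return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(vec_cos_sim('1,0,0', '0.6,0.8,0>=0.5'))  # cos = 0.6 >= 0.5 -> True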
def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    docs = []
    sim = pw.cosine_similarity(corpus_vectors, query_vectors)
    order = np.argsort(sim, axis=0)[::-1]  # corpus rows sorted by descending similarity
    for i in range(len(query_vectors)):
        docs.append(order[:, i][0:n_docs])
    return np.array(docs)
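A small usage sketch with tf-idf vectors (pw is sklearn.metrics.pairwise; the query matrix is densified so len() gives the number of queries):

import numpy as np
import sklearn.metrics.pairwise as pw
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat", "dogs bark loudly", "cats and kittens sat"]
vec = TfidfVectorizer().fit(corpus)
corpus_vectors = vec.transform(corpus)
query_vectors = vec.transform(["a cat sat down"]).toarray()
print(closest_docs_by_index(corpus_vectors, query_vectors, n_docs=2))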
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        # negate distances so argsort's largest entries are the *nearest* points
        sim = -euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # get the `top` highest-similarity indices
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
def find_nearest_word(self, represent, topk: int = 10, stopwords: list = None):
    """
    Find the words closest (by cosine similarity) to the given representation.
    :param represent: a word in the vocabulary, or a vector with the same
        dimension as the embeddings
    :param topk: number of nearest words to return
    :param stopwords: words to exclude from the results
    :return: list of [word, similarity] pairs
    """
    stopwords = [] if stopwords is None else list(stopwords)
    if isinstance(represent, str) and represent in self:
        array1 = self[represent]
        stopwords.append(represent)
    elif isinstance(represent, np.ndarray):
        array1 = represent
    else:
        raise NotImplementedError
    result_cos = cosine_similarity(np.reshape(array1, (1, array1.shape[-1])), self._matrix)
    result_cos = np.reshape(result_cos, result_cos.shape[-1])
    result_sort = result_cos.argsort()[-topk:][::-1]
    return [[self.idx2word[idx], result_cos[idx]]
            for idx in result_sort
            if self.idx2word[idx] not in stopwords
            and not any(stop.startswith(self.idx2word[idx]) for stop in stopwords)]
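A self-contained sketch of the kind of container this method lives on (hypothetical: a dict-like embedding store exposing the _matrix and idx2word attributes the method expects), just to show the call shape:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class TinyEmbeddings(dict):
    # Hypothetical stand-in: word -> vector lookup plus _matrix / idx2word.
    def __init__(self, words, dim=200):
        super().__init__()
        self._matrix = np.random.rand(len(words), dim)
        self.idx2word = dict(enumerate(words))
        for i, w in enumerate(words):
            self[w] = self._matrix[i]

emb = TinyEmbeddings(['king', 'queen', 'man', 'woman'])
print(find_nearest_word(emb, 'king', topk=3))  # [[word, similarity], ...]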
def simCalcMatrix(docs):
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)  # finds the tfidf score with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
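For example (requires sklearn's TfidfVectorizer and cosine_similarity):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the quick brown fox", "a lazy dog", "quick brown dogs run"]
S = simCalcMatrix(docs)
print(S.shape)  # (3, 3); S[i, j] is the cosine similarity of doc i and doc j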
def generateSimMatrix(phraseList):
    # print('Num elements', len(phraseList), phraseList)
    all_elements = []
    for element in phraseList:
        if len(element.strip()) == 0:
            all_elements.append(' ')
        else:
            all_elements.append(element.strip())
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(all_elements)  # finds the tfidf score with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
def calu_cosin_num(word_q, word_a, model):
    # Look up both words in the embedding model (inputs are utf-8 byte
    # strings); return 0 if either word is out of vocabulary.
    try:
        q_vector = model[word_q.decode('utf-8')]
        a_vector = model[word_a.decode('utf-8')]
    except KeyError:
        return 0
    cosine_similarity_num = cosine_similarity(np.array(q_vector).reshape(1, -1),
                                              np.array(a_vector).reshape(1, -1))
    return float(cosine_similarity_num)
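A hedged sketch with gensim word vectors (the file path is a placeholder; inputs are byte strings because of the decode calls above):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # placeholder path
print(calu_cosin_num(b'king', b'queen', model))  # 0 if either word is OOV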
def cosine_affinity(X):
    epsilon = 1e-8
    S = cosine_similarity(X)
    S[S > 1] = 1.0  # rounding problems
    S += 1 + epsilon  # shift into (epsilon, 2 + epsilon] so all affinities are positive
    # sanity checks
    assert not (S < 0).any()
    assert not np.isnan(S).any()
    assert not np.isinf(S).any()
    return S
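Since the output is strictly positive, it can be fed straight into a clusterer that accepts a precomputed affinity, e.g.:

import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity

X = np.random.rand(20, 8)
S = cosine_affinity(X)
labels = SpectralClustering(n_clusters=3, affinity='precomputed').fit_predict(S)
print(labels)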
def decision_function(self, graphs):
    vecs = self.vectorizer.transform(graphs)
    return cosine_similarity(self.reference_vec, vecs)
Source file: server_query_images.py, from project rekognition-image-search-engine (author: awslabs)
def search():
    qry = request.args.get('query', '')
    # build a one-hot query vector over the tf-idf vocabulary
    test = np.zeros(tfidf[0].shape)
    for word in qry.split(' '):
        # validate word
        if len(word) < 2 or word in stop_words:
            continue
        try:
            idx = features.index(word)
            test[0][idx] = 1
        except ValueError:
            pass
    cosine_similarities = cosine_similarity(test, tfidf).flatten()
    # highest-similarity indices first; note [:-100:-1] yields 99, not 100, results
    related_docs_indices = cosine_similarities.argsort()[:-100:-1]
    MAX = 100
    data = []
    related_docs_indices = related_docs_indices[:MAX]
    for img in indices[related_docs_indices]:
        file_path = "/Users/smallya/workspace/Rekognition-personal-searchengine/" + img
        data.append(file_path)
    print(related_docs_indices)
    return json.dumps(data)
def score_topics(source_id, topics_desc_dict):
    token_dict = {}
    indices = {}
    res_dict = {}
    index = 0
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    for tid, text in topics_desc_dict.items():
        token_dict[tid] = text.lower().translate(remove_punctuation_map)
    for tok in token_dict.keys():
        indices[tok] = index
        index += 1
    main_index = indices[source_id]
    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)
    for tok, ind in indices.items():
        if tok == source_id:  # skip the source topic itself
            continue
        res_dict[tok] = res[0][ind]
    return res_dict
def score_outlinks(main_text, title_list):
    main_title = "current_selected_topic"
    token_dict = {}
    len_titles = {}
    indices = {}
    res_dict = {}
    index = 0
    for title in title_list:
        lowers = title.lower().replace("_", " ").replace("-", " ")
        len_titles[title] = len(lowers.split(" "))
        token_dict[title] = lowers
    len_titles[main_title] = 1
    token_dict[main_title] = main_text
    for tok in token_dict.keys():
        indices[tok] = index
        index += 1
    main_index = indices[main_title]
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)
    for tok, ind in indices.items():
        if tok == main_title:
            continue
        # normalize by title length so long titles are not favored
        res_dict[tok] = res[0][ind] * 100 / len_titles[tok]
    return res_dict
def predict(self, X):
    """Predict the class labels for the provided data

    Parameters
    ----------
    X : np.ndarray, shape = [n_samples]

    Returns
    -------
    y : np.array of shape [n_samples]
        Class labels for each data sample.
    """
    if not self.fitted:
        raise NotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
    n_samples = len(X)
    frequencies = np.zeros((n_samples, self.n_all_words_))
    for i in range(n_samples):
        words_unique, words_counts = np.unique(X[i], return_counts=True)
        for j, word in enumerate(self.all_words_):
            if word in words_unique:
                frequencies[i, j] = words_counts[np.where(words_unique == word)[0]]
    self.frequencies_ = frequencies
    y_pred = cosine_similarity(frequencies, self.tf_idf_array_).argmax(axis=1)
    return y_pred
def knowsim_experiment(scope, scope_name, type_list, count, newLabels, tau=1, kNeighbors=10, label_num=5):
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    repeats = 50
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]
    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)
        # make a kNN cosine-similarity graph for this entity type
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)
                graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)
        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)
        # add meta-path-based similarity to the knowsim
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph
    knowsim = knowsim.tocsr()
    print('running lp')
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}
    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param, repeatTimes=repeats, trainNumbers=label_num, classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' + str(label_num).zfill(3) + '_', newIds=newIds)
    return ssl.get_mean()
def generate_laplacian_score(X_ent, X_word, kNeighbors):
    # Generate a kNN cosine-similarity graph over X_word
    n = X_ent.shape[0]
    m = X_ent.shape[1]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)
            graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)
    # graph Laplacian L = D - W
    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph
    # Laplacian score of each feature (column) of X_ent
    laplacian_score = np.zeros(m)
    for i in range(m):
        f_tilde = X_ent[:, i] - (float(X_ent[:, i].transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
        score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
        laplacian_score[i] = score
    return laplacian_score
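A usage sketch; X_ent should be a scipy sparse matrix (or np.matrix), because the scoring loop relies on `*` meaning matrix multiplication:

import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

X_word = np.random.rand(30, 40)                    # features used to build the graph
X_ent = sparse.csr_matrix(np.random.rand(30, 12))  # features being scored
scores = generate_laplacian_score(X_ent, X_word, kNeighbors=5)
print(scores.shape)  # (12,) -- one Laplacian score per X_ent column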