Python cosine_similarity() usage examples
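
The snippets below are collected from open-source projects and show cosine_similarity() from sklearn.metrics.pairwise in practice. As a baseline, here is a minimal sketch of the API itself (the arrays are made up for illustration):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Rows are samples, columns are features.
X = np.array([[1.0, 0.0], [0.5, 0.5]])
Y = np.array([[1.0, 1.0]])

# Returns an (n_samples_X, n_samples_Y) matrix of cosine similarities.
print(cosine_similarity(X, Y))  # [[0.7071...], [1.0]]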

eval.py (project: image-classifier, author: gustavkkk)
import sklearn.metrics.pairwise as pw

def compare_pic(self, feature1, feature2):
    # cosine *distance* (1 - cosine similarity) between the two feature sets
    predicts = pw.pairwise_distances(feature2, feature1, metric='cosine')
    # predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
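
Note that pairwise_distances with metric='cosine' returns 1 - cosine_similarity, so smaller values mean more similar. A quick sketch with made-up feature batches:

import numpy as np
import sklearn.metrics.pairwise as pw

f1 = np.random.rand(3, 128)  # hypothetical CNN feature vectors
f2 = np.random.rand(5, 128)
dist = pw.pairwise_distances(f2, f1, metric='cosine')  # shape (5, 3)
sim = pw.cosine_similarity(f2, f1)                     # shape (5, 3)
assert np.allclose(dist, 1.0 - sim)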
eval-all.py (project: image-classifier, author: gustavkkk) defines the same compare_pic as eval.py above.
qmath.py (project: RecQ, author: Coder-Yu)
from math import sqrt

def cosine(x1, x2):
    # compute the cosine similarity between two vectors
    # (optionally restrict to common ratings first: new_x1, new_x2 = common(x1, x2))
    numerator = x1.dot(x2)  # renamed from `sum` to avoid shadowing the built-in
    denom = sqrt(x1.dot(x1) * x2.dot(x2))
    try:
        return float(numerator) / denom
    except ZeroDivisionError:
        return 0

    # return cosine_similarity(x1, x2)[0][0]
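
A quick check with NumPy vectors chosen for illustration:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
print(cosine(a, b))            # 1.0, parallel vectors
print(cosine(a, np.zeros(3)))  # 0, the zero vector is caught by the except branch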
vector_opr.py (project: tokenquery, author: ramtinms)
from sklearn.metrics.pairwise import cosine_similarity

def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    # Longer operators first so that e.g. '<=' is not matched as '<'.
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            # compute the similarity once, then apply the requested comparison
            sim = cosine_similarity(token_vector, ref_vector)
            if operation_string in ('=', '=='):
                return sim == cond_value
            elif operation_string == '<':
                return sim < cond_value
            elif operation_string == '>':
                return sim > cond_value
            elif operation_string == '>=':
                return sim >= cond_value
            elif operation_string == '<=':
                return sim <= cond_value
            elif operation_string in ('!=', '<>'):
                return sim != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
        return False  # the original fell through and returned None here
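
A hypothetical call, assuming tokenquery's change_string_to_vector parses a vector string into the 2-D shape cosine_similarity expects (the string format shown here is an assumption, not the library's documented syntax):

vec_cos_sim('0.1 0.2 0.7', '0.1 0.2 0.7>=0.99')  # True when similarity >= 0.99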
evaluate.py (project: adversarial-document-model, author: AYLIEN)
import numpy as np
import sklearn.metrics.pairwise as pw

def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    docs = []
    # sim[i, j] = similarity of corpus doc i to query j
    sim = pw.cosine_similarity(corpus_vectors, query_vectors)
    order = np.argsort(sim, axis=0)[::-1]  # most similar docs first
    for i in range(len(query_vectors)):
        docs.append(order[:, i][0:n_docs])
    return np.array(docs)
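
Usage sketch with random data (shapes are assumptions):

import numpy as np

corpus = np.random.rand(100, 50)  # 100 document vectors
queries = np.random.rand(3, 50)   # 3 query vectors
top5 = closest_docs_by_index(corpus, queries, 5)
print(top5.shape)  # (3, 5): indices of the 5 closest docs per query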
kmeans.py (project: ref-extract, author: brandonrobertz)
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        sim = euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # indices of the `top` largest values, best first
    # (note: for euclidean this picks the *largest* distances, i.e. the
    # farthest rows; it matches "closest" only for the similarity metric)
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
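
Usage sketch (the embedding matrix and labels are made up):

import numpy as np

X = np.random.rand(1000, 64)                  # e.g. word embeddings
labels = ['word%d' % i for i in range(1000)]  # one label per row
query = np.random.rand(64)
print(closest_label(X, labels, query, dist='cosine', top=5))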
word2vec.py (project: bot2017Fin, author: AllanYiin)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_nearest_word(self, represent, topk: int = 10, stopwords: list = None):
    """
    Find the words closest to the given representation (a word or a vector).
    :param stopwords: words to exclude from the results
    :param represent: a vocabulary word, or an embedding vector
    :param topk: number of nearest words to return
    :return: list of [word, cosine similarity] pairs
    """
    stopwords = list(stopwords) if stopwords else []  # avoid the mutable default argument of the original
    if isinstance(represent, str) and represent in self:
        array1 = self[represent]
        stopwords.append(represent)
    elif isinstance(represent, np.ndarray):
        array1 = represent
    else:
        raise NotImplementedError
    result_cos = cosine_similarity(np.reshape(array1, (1, array1.shape[-1])), self._matrix)
    result_cos = np.reshape(result_cos, result_cos.shape[-1])
    result_sort = result_cos.argsort()[-topk:][::-1]
    return [[self.idx2word[idx], result_cos[idx]]
            for idx in result_sort
            if self.idx2word[idx] not in stopwords
            and not any(stop.startswith(self.idx2word[idx]) for stop in stopwords)]
WGGraph.py (project: AbTextSumm, author: StevenLOL)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def simCalcMatrix(docs):
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)  # tf-idf scores with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
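
Usage sketch with toy documents:

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "completely unrelated text"]
sims = simCalcMatrix(docs)
print(sims.shape)  # (3, 3), symmetric, with 1.0 on the diagonal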
WGGraph.py (project: AbTextSumm, author: StevenLOL)
def generateSimMatrix(phraseList):
    # imports as in simCalcMatrix above
    all_elements = []
    for element in phraseList:
        if len(element.strip()) == 0:
            all_elements.append(' ')  # keep empty phrases as a single space
        else:
            all_elements.append(element.strip())
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(all_elements)  # tf-idf scores with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
get_near_word.py (project: CCIR, author: xiaogang00)
# Python 2 code: str.decode assumes UTF-8 byte strings.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calu_cosin_num(word_q, word_a, model):
    try:
        q_vector = model[word_q.decode('utf-8')]
        a_vector = model[word_a.decode('utf-8')]
    except KeyError:
        return 0  # out-of-vocabulary word
    cosine_similarity_num = cosine_similarity(np.array(q_vector).reshape(1, -1),
                                              np.array(a_vector).reshape(1, -1))
    return float(cosine_similarity_num)
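
A usage sketch, assuming model behaves like a gensim KeyedVectors lookup (the file path is a placeholder; Python 2 to match the snippet):

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
print calu_cosin_num('king', 'queen', model)  # similarity, or 0 if out of vocabulary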
metacluster.py (project: word2vec_pipeline, author: NIHOPA)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_affinity(X):
    epsilon = 1e-8
    S = cosine_similarity(X)
    S[S > 1] = 1.0    # clip floating-point rounding errors above 1
    S += 1 + epsilon  # shift from [-1, 1] to (0, 2] so every affinity is positive

    # Sanity checks
    assert not (S < 0).any()
    assert not np.isnan(S).any()
    assert not np.isinf(S).any()

    return S
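
The shift matters because algorithms that treat the matrix as an affinity generally require non-negative entries. A quick check with random data:

X = np.random.rand(10, 5)
A = cosine_affinity(X)
print(A.min() > 0)  # True: strictly positive affinities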
estimators.py (project: GraphLearn, author: smautner)
def decision_function(self, graphs):
    # similarity of each vectorized graph to the stored reference vector
    vecs = self.vectorizer.transform(graphs)
    return cosine_similarity(self.reference_vec, vecs)
server_query_images.py (project: rekognition-image-search-engine, author: awslabs)
def search():
    # Flask endpoint (Python 2): build a one-hot query vector over the
    # tf-idf vocabulary, then rank indexed images by cosine similarity.
    qry = request.args.get('query', '')
    test = np.zeros(tfidf[0].shape)

    for word in qry.split(' '):
        # skip very short words and stop words
        if len(word) < 2 or word in stop_words:
            continue
        try:
            idx = features.index(word)
            test[0][idx] = 1
        except ValueError:
            pass

    cosine_similarities = cosine_similarity(test, tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-100:-1]  # indices sorted by similarity, best first

    MAX = 100
    data = []
    related_docs_indices = related_docs_indices[:MAX]

    for img in indices[related_docs_indices]:
        file_path = "/Users/smallya/workspace/Rekognition-personal-searchengine/" + img
        data.append(file_path)

    print related_docs_indices
    return json.dumps(data)
score_process.py (project: Diggly-Back-End, author: WikiDiggly)
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def score_topics(source_id, topics_desc_dict):
    # Python 2 code (iteritems); keys() and values() iterate in the same
    # order below as long as token_dict is not modified in between.
    token_dict = {}
    indices = {}
    res_dict = {}
    index = 0

    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    for tid, text in topics_desc_dict.iteritems():
        lowers = text.lower()
        no_punctuation = lowers.translate(remove_punctuation_map)
        token_dict[tid] = no_punctuation

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[source_id]

    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == source_id:  # the original compared tok to main_index (an int), which never matched
            continue
        res_dict.update({tok: res[0][ind]})

    return res_dict
score_process.py (project: Diggly-Back-End, author: WikiDiggly)
def score_outlinks(main_text, title_list):
    main_title = "current_selected_topic"
    token_dict = {}
    len_titles = {}
    indices = {}
    res_dict = {}
    index = 0

    for title in title_list:
        lowers = title.lower().replace("_", " ").replace("-", " ")
        len_titles.update({title: len(lowers.split(" "))})
        token_dict[title] = lowers

    len_titles[main_title] = 1
    token_dict[main_title] = main_text

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[main_title]

    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == main_title:
            continue
        # scale to a percentage and normalize by title length
        res_dict.update({tok: res[0][ind] * 100 / len_titles[tok]})

    return res_dict
classification.py (project: pyts, author: johannfaouzi)
def predict(self, X):
    """Predict the class labels for the provided data.

    Parameters
    ----------
    X : np.ndarray, shape = [n_samples]

    Returns
    -------
    y : np.array of shape [n_samples]
        Class labels for each data sample.
    """
    if not self.fitted:
        raise NotFittedError("Estimator not fitted, call `fit` before exploiting the model.")

    # Build a word-frequency matrix over the vocabulary seen at fit time.
    n_samples = len(X)
    frequencies = np.zeros((n_samples, self.n_all_words_))
    for i in range(n_samples):
        words_unique, words_counts = np.unique(X[i], return_counts=True)
        for j, word in enumerate(self.all_words_):
            if word in words_unique:
                frequencies[i, j] = words_counts[np.where(words_unique == word)[0][0]]

    self.frequencies_ = frequencies

    # Nearest tf-idf class profile by cosine similarity.
    y_pred = cosine_similarity(frequencies, self.tf_idf_array_).argmax(axis=1)

    return y_pred
main.py (project: semihin, author: HKUST-KnowComp)
def knowsim_experiment(scope, scope_name, type_list, count, newLabels, tau=1, kNeighbors=10, label_num=5):
    # Python 2 code (print statement, pickle loaded from a text-mode file).
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]

    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)

        # make a kNN similarity graph from pairwise cosine similarities
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # alternative weighting: np.exp(-(1 - cosX[i, j]) / 0.03)
                graph[j, i] = cosX[i, j]

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # add meta-path-based similarity to the knowsim
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print 'running lp'
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}

    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param, repeatTimes=50, trainNumbers=label_num, classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' + str(label_num).zfill(3) + '_', newIds=newIds)
    return ssl.get_mean()
feature_grid_search.py (project: semihin, author: HKUST-KnowComp)
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

def generate_laplacian_score(X_ent, X_word, kNeighbors):
    # Build a kNN cosine-similarity graph over the word vectors.
    n = X_ent.shape[0]
    m = X_ent.shape[1]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            graph[i, j] = cosX[i, j]  # alternative weighting: np.exp(-(1 - cosX[i, j]) / 0.03)
            graph[j, i] = cosX[i, j]

    # Graph Laplacian L = D - W
    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph

    # Laplacian score of each feature: center by the D-weighted mean, then
    # take f'Lf / f'Df; lower scores respect the graph structure better.
    laplacian_score = np.zeros(m)
    for i in range(m):
        f_tilde = X_ent[:, i] - (float(X_ent[:, i].transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
        score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
        laplacian_score[i] = score

    return laplacian_score
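
A usage sketch, assuming X_ent is an np.matrix (so that * above is matrix multiplication) and X_word is a plain array; the shapes are made up:

import numpy as np

X_word = np.random.rand(30, 20)           # 30 samples, 20 word features
X_ent = np.matrix(np.random.rand(30, 8))  # 8 entity features to score
scores = generate_laplacian_score(X_ent, X_word, kNeighbors=5)
print(scores.shape)  # (8,): one Laplacian score per entity feature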

