python类cosine()的实例源码

pairdist.py 文件源码 项目:BioNLP-2016 作者: cambridgeltl 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def make_dist(vectors, options):
    """Return ``(vectors, distance_function)`` for ``options.metric``.

    For every metric other than ``'cosine'`` the vectors are handed back
    untouched and the distance function is looked up in ``metrics``.  For
    ``'cosine'`` every vector is divided by its L2 norm up front, so the
    returned distance function only needs ``1 - dot(u, v)``.
    """
    if options.metric == 'cosine':
        # Normalise once here instead of on every distance evaluation.
        unit_vectors = [vec / numpy.linalg.norm(vec) for vec in vectors]

        def unit_cosine_distance(u, v):
            return 1 - numpy.dot(u, v)

        return unit_vectors, unit_cosine_distance
    return vectors, metrics[options.metric]
evalrank.py 文件源码 项目:BioNLP-2016 作者: cambridgeltl 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def cosine(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Each vector is scaled to unit L2 norm and the two unit vectors are dotted.
    """
    unit1 = v1 / numpy.linalg.norm(v1)
    unit2 = v2 / numpy.linalg.norm(v2)
    return numpy.dot(unit1, unit2)
train_svm_model.py 文件源码 项目:NETL-Automatic-Topic-Labelling- 作者: sb1992 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def get_lt_ranks(lab_list,num):
    """Rank candidate labels by letter-trigram agreement with one topic.

    The topic's letter-trigram weights come from
    ``get_topic_lt(topic_list[num])``.  For every label in ``lab_list`` the
    relative frequencies of its letter trigrams are computed, both
    distributions are laid out over the union of their trigrams, and
    ``1 - cosine(topic, label)`` is taken as the label's score.

    NOTE(review): the score is a similarity only if ``cosine`` is a cosine
    *distance* (e.g. SciPy's) -- confirm against the file's imports.
    NOTE(review): a label shorter than three characters has no trigrams, so
    its vector is all zeros -- confirm how ``cosine`` behaves in that case.

    Returns a list of ``(label, rank, int(num))`` tuples in the order of
    ``lab_list``; rank 0 belongs to the label with the smallest score.
    """
    topic_ls = get_topic_lt(topic_list[num])
    val_list = []
    for item in lab_list:
        # Letter trigrams of the candidate label, turned into relative frequencies.
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        # Set union instead of ``keys() + keys()``: dict views cannot be
        # concatenated on Python 3, while the union works on 2 and 3 alike.
        tot_keys = list(set(topic_ls.keys()) | set(label_cnt.keys()))
        listtopic = [topic_ls[elem] if elem in topic_ls else 0.0 for elem in tot_keys]
        listlabel = [label_cnt[elem] if elem in label_cnt else 0.0 for elem in tot_keys]
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))
        val_list.append((item, val))

    # argsort of argsort converts the scores into 0-based rank positions.
    arr = np.array([score for _, score in val_list])
    ranks = arr.argsort().argsort()
    return [(label, ranks[i], int(num)) for i, (label, _) in enumerate(val_list)]

# Generates letter trigram feature
supervised_labels.py 文件源码 项目:NETL-Automatic-Topic-Labelling- 作者: sb1992 项目源码 文件源码 阅读 54 收藏 0 点赞 0 评论 0
def get_lt_ranks(lab_list,num):
    """Rank candidate labels by letter-trigram agreement with one topic.

    ``get_topic_lt(topic_list[num])`` supplies the letter-trigram weights of
    the topic terms.  Each candidate label in ``lab_list`` is converted to
    the relative frequencies of its own letter trigrams; topic and label are
    then expressed over the union of their trigrams and scored with
    ``1 - cosine(topic, label)``.

    NOTE(review): the score is a similarity only if ``cosine`` is a cosine
    *distance* (e.g. SciPy's) -- confirm against the file's imports.
    NOTE(review): a label shorter than three characters has no trigrams, so
    its vector is all zeros -- confirm how ``cosine`` behaves in that case.

    Returns a list of ``(label, rank, int(num))`` tuples in the order of
    ``lab_list``; rank 0 belongs to the label with the smallest score.
    """
    topic_ls = get_topic_lt(topic_list[num])  # Letter trigrams of the topic terms.
    val_list = []
    for item in lab_list:
        # Trigrams of the label candidate, normalised to relative frequencies.
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        # ``keys() + keys()`` only works on Python 2 lists; a set union gives
        # the same key set on both Python 2 and Python 3.
        tot_keys = list(set(topic_ls.keys()) | set(label_cnt.keys()))
        listtopic = [topic_ls[elem] if elem in topic_ls else 0.0 for elem in tot_keys]
        listlabel = [label_cnt[elem] if elem in label_cnt else 0.0 for elem in tot_keys]
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))
        val_list.append((item, val))

    # Double argsort maps every score to its 0-based position in sorted order.
    arr = np.array([score for _, score in val_list])
    ranks = arr.argsort().argsort()
    return [(label, ranks[i], int(num)) for i, (label, _) in enumerate(val_list)]

# This calls the above method to get letter trigram feature.
word_game.py 文件源码 项目:GloVe-experiments 作者: brannondorsey 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):
    """Return the rows of ``df`` closest to ``vec``.

    Rows whose word (``id_to_word[row_index]``) is in ``skip_words`` are
    ignored; these are the words of the query itself, which would otherwise
    usually come out closest.

    Returns a list of ``(word, distance, row_vector)`` tuples ordered by
    increasing ``cosine(vec, row)``.  At most ``num_results`` entries are
    returned; fewer when not enough candidate rows exist.

    Raises ``Exception`` when ``method`` is anything other than ``'cosine'``.
    """
    if method != 'cosine':
        raise Exception('{} is not an excepted method parameter'.format(method))

    candidates = []  # (distance, row index, row vector)
    for i, v in enumerate(df):
        if id_to_word[i] in skip_words:
            continue
        candidates.append((cosine(vec, v), i, v))
    candidates.sort(key=lambda entry: entry[0])
    # Slice instead of indexing range(num_results): asking for more results
    # than there are candidates must not raise IndexError.
    top = candidates[:max(num_results, 0)]
    return [(id_to_word[i], dist, v) for dist, i, v in top]
word_game.py 文件源码 项目:GloVe-experiments 作者: brannondorsey 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def turn(gs, word_to_id, id_to_word, df, soft_score):
    """Play one interactive round and update the scores stored in ``gs``.

    The current player is prompted for a word expression, every player is
    prompted for an answer word, the row of ``df`` nearest to the evaluated
    expression is looked up with ``find_nearest``, and each answer is scored
    by ``cosine(answer_vector, nearest_vector)``.  The player with the
    smallest value is announced as the winner.

    ``gs`` is mutated in place: ``gs['turn_number']`` is incremented and the
    values in ``gs['players']`` are increased.  Nothing is returned.
    """

    gs['turn_number'] += 1
    names = list(gs['players'].keys())
    # ``%`` binds tighter than ``-``, so the index runs from -1 to len(names) - 2;
    # index -1 selects the last name.
    current_player = names[(gs['turn_number'] % len(names) - 1)]
    # Keep prompting until the expression evaluates without raising.
    while True:
        expr = input('{}, please enter a word expression:\n> '.format(current_player))
        try:
            # NOTE(review): ``word_to_id`` is passed twice and ``id_to_word`` is not
            # passed at all -- confirm against eval_expression's signature.
            vec, skip_words = eval_expression(expr, word_to_id, word_to_id, df)
        except Exception as err:
            print(err)
            continue
        break

    # Collect one answer vector per player; re-prompt until the word is known.
    answers = {}
    for name in gs['players']:
        while True:
            word = input('{}, please enter your answer: '.format(name))
            if word in word_to_id:
                answers[name] = df[word_to_id[word]]
                break
            else:
                print('{} is not in the dataset, please another word.'.format(word))

    # num_results defaults to 1 here, so [0] is the single nearest row.
    answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
    # transform answers from vectors to distances
    for k, v in answers.items():
        answers[k] = cosine(v, answer_vec)

    # The smallest value wins the round.
    winner = min(answers, key=answers.get)

    if not soft_score:
        gs['players'][winner] += 1
    else:
        # NOTE(review): every player is credited with the rounded value itself, so a
        # larger value earns more points while the smallest value wins the round --
        # confirm this is the intended scoring.
        for name in answers:
            gs['players'][name] += round(answers[name], 2)

    print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
    print('{} wins this round.'.format(colored(winner, 'green')))
    print_standings(gs)
word_arithmetic.py 文件源码 项目:GloVe-experiments 作者: brannondorsey 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):
    """Return the words whose rows in ``df`` are closest to ``vec``.

    Rows whose word (``id_to_word[row_index]``) is in ``words`` are skipped;
    these are the base words of the query, which would otherwise usually
    come out closest.

    Returns a list of ``(word, distance)`` tuples ordered by increasing
    ``cosine(vec, row)``.  At most ``num_results`` entries are returned;
    fewer when not enough candidate rows exist.

    Raises ``Exception`` when ``method`` is anything other than ``'cosine'``.
    """
    if method != 'cosine':
        raise Exception('{} is not an excepted method parameter'.format(method))

    candidates = []  # (distance, row index)
    for i, v in enumerate(df):
        if id_to_word[i] in words:
            continue
        candidates.append((cosine(vec, v), i))
    candidates.sort(key=lambda entry: entry[0])
    # Slice instead of indexing range(num_results): asking for more results
    # than there are candidates must not raise IndexError.
    top = candidates[:max(num_results, 0)]
    return [(id_to_word[i], dist) for dist, i in top]
adhoc.py 文件源码 项目:workspace 作者: nojima 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def find_similar_words_by_vector(self, vector: np.ndarray, n: int = 10):
        """Return the ``n`` vocabulary words whose vectors are nearest to ``vector``.

        All ids in ``range(0, self._vocabulary.size)`` are ordered by
        ``cosine(self._vectors[id], vector)`` in ascending order, the first
        ``n`` ids are kept and mapped to words with ``to_word``.
        """
        vocabulary = self._vocabulary

        def distance_to_query(word_id):
            return cosine(self._vectors[word_id], vector)

        ranked_ids = sorted(range(0, vocabulary.size), key=distance_to_query)
        return [vocabulary.to_word(word_id) for word_id in ranked_ids[:n]]
evaluate.py 文件源码 项目:wi_wacv14 作者: VChristlein 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def computeDistance(X, Y, method):
    """Return the distance between ``X`` and ``Y`` for the given ``method``.

    Only ``'cosine'`` is implemented (via ``spdistance.cosine``).  A warning
    is printed when the computed distance is negative.

    Raises ``ValueError`` for any other ``method``; previously such a call
    failed further down because ``dist`` was never assigned.
    """
    if method == 'cosine':
        dist = spdistance.cosine(X,Y)
    else:
        raise ValueError('unsupported distance method: {}'.format(method))

    if dist < 0:
        print ('WARNING: distance between X {} and Y {} = {} < 0, method: '
                         '{}'.format(X, Y, dist, method))

    return dist
evaluate.py 文件源码 项目:wi_wacv14 作者: VChristlein 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def runNN(descriptors, labels, parallel, nprocs):
    """
    Compute nearest-neighbour statistics for ``descriptors`` given ``labels``.

    For every configured distance method (currently only ``'cosine'``) the
    distance matrix is computed with ``computeDistances`` and handed to
    ``computeStats`` together with ``labels``.

    Returns the distance matrix of the last method processed, or ``None``
    if no method is configured.
    """

    distance_method = { "cosine": 'cosine' }
    ret_matrix = None
    # ``items()`` instead of ``iteritems()``: the latter does not exist on
    # Python 3, while ``items()`` works on both major versions.
    for name, method in distance_method.items():
        dist_matrix = computeDistances(descriptors, method,
                                       parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix
utils.py 文件源码 项目:aihackathon 作者: nicoheidtke 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def compare_tweet_with_storage(tweet, storage=None, bow=False):
    """Score the entities of ``tweet`` against previously stored tweets.

    ``tweet`` is transformed with ``transform_tweet(tweet, bow)`` into
    ``(entity, entity_type, vector_array)`` triples.  For every entity the
    rows of ``storage`` with the same ``'Entity'`` value are compared with
    it, and the best score found (starting from 0.0) is kept:

    * ``bow=True``: both ``'Vector array'`` values are treated as mappings
      keyed by cluster id; the score is the number of ids that are non-zero
      in both, divided by the smaller of the two vector sums.
    * ``bow=False`` with ``SPLIT`` set: the mean of ``1 - cosine`` over the
      three sub-vectors, ignoring NaN results.
    * otherwise: ``1 - cosine`` of the two vectors.

    When ``storage`` is ``None`` it is unpickled from
    ``config.data_folder``/``config.model_file``.

    Returns ``combine_scores(scores)`` where ``scores`` maps entity to its
    best score.  Raises ``IOError`` when the model file is needed but
    missing.
    """
    if storage is None:
        model_path = os.path.join(config.data_folder, config.model_file)
        if not os.path.isfile(model_path):
            # A bare string cannot be raised; use a real exception type.
            raise IOError('Model was not found!')
        # NOTE: unpickling runs arbitrary code -- only load trusted model files.
        with open(model_path, 'rb') as model_file:
            storage = pickle.load(model_file)
    print(tweet)
    transformed_tweet = transform_tweet(tweet, bow)
    print([x[0] for x in transformed_tweet], [np.sum(y) for y in (x[2] for x in transformed_tweet)])
    scores = {}
    for entity, entity_type, vector_array in transformed_tweet:
        temp_score = 0.0
        for tweetid, item in storage[storage['Entity'] == entity].iterrows():
            stored_vectors = item['Vector array']
            if bow:
                # list(...) + list(...) works on Python 2 and 3; concatenating
                # dict key views directly only works on Python 2.
                clusterids = np.unique(list(vector_array.keys()) + list(stored_vectors.keys()))
                vector1 = np.zeros([len(clusterids)])
                vector2 = np.zeros([len(clusterids)])
                for k, cid in enumerate(clusterids):
                    vector1[k] = vector_array.get(cid, 0)
                    vector2[k] = stored_vectors.get(cid, 0)
                shared = np.sum(np.logical_and(vector1, vector2))
                smaller_total = np.min([np.sum(vector1), np.sum(vector2)])
                temp_score = np.max([1.0 * shared / smaller_total, temp_score])
            else:
                if SPLIT:
                    result = [1 - cosine(vector_array[x], stored_vectors[x]) for x in range(3)]
                    isnan = np.isnan(result)
                    res = 0.0
                    for v in range(3):
                        if not isnan[v]:
                            res+=result[v]
                    # The tiny epsilon keeps the division defined when all three are NaN.
                    res = 1.0 * res/(np.sum(isnan==False)+10**(-10))
                    temp_score = np.max([res, temp_score])
                else:
                    # Compute the similarity once and reuse it for the score and the log line.
                    similarity = 1 - cosine(vector_array, stored_vectors)
                    temp_score = np.max([similarity, temp_score])
                    print(similarity, entity, tweet, str(tweetid))
        scores[entity] = temp_score
    return combine_scores(scores)
generate_pretrained_glove_sim_dist_diff_idf.py 文件源码 项目:kaggle-quora-solution-8th 作者: qqgeogor 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def _idf_weighted_mean(tokens, embedder, idf_dict):
    """Sum ``idf * embedding`` over ``tokens`` and divide by the token count."""
    total = np.zeros(300)
    for token in tokens:
        weight = idf_dict[token] if token in idf_dict else idf_dict['default_idf']
        total += weight * embedder[token]
    # NOTE(review): with no tokens this divides by zero -- confirm callers
    # cope with the resulting values.
    total /= len(tokens)
    return total


def calc_glove_sim(row,embedder,idf_dict):
    '''
    Compare the IDF-weighted embedding centers of question1 and question2.

    Each question is stripped of punctuation, lower-cased and split on
    whitespace; only tokens present in ``embedder`` are kept.  Tokens missing
    from ``idf_dict`` use ``idf_dict['default_idf']`` as their weight.

    Returns ``(glove_sim, glove_vdiff_dist, vector_diff)``: the result of
    ``cosine`` on the two centers, the Euclidean length of their difference,
    and the difference vector itself.

    NOTE(review): if ``cosine`` is a cosine *distance* (e.g. SciPy's), then
    ``glove_sim`` is a distance despite its name -- confirm.
    '''
    tokens_q1 = [tok for tok in remove_punctuation(row['question1']).lower().split() if tok in embedder]
    tokens_q2 = [tok for tok in remove_punctuation(row['question2']).lower().split() if tok in embedder]

    center_q1 = _idf_weighted_mean(tokens_q1, embedder, idf_dict)
    center_q2 = _idf_weighted_mean(tokens_q2, embedder, idf_dict)

    vector_diff = (center_q1 - center_q2)
    glove_sim = cosine(center_q1, center_q2)
    glove_vdiff_dist = np.sqrt(np.sum(vector_diff**2))
    return (glove_sim, glove_vdiff_dist, vector_diff)
check_similar_sentence.py 文件源码 项目:sentence_similarity 作者: MorinoseiMorizo 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def cosine_similarity(a, b):
    """Return ``dis.cosine(a, b)`` unchanged.

    NOTE(review): if ``dis`` is ``scipy.spatial.distance``, the value is the
    cosine *distance* (``1 - similarity``) despite this function's name --
    confirm what callers expect.
    """
    value = dis.cosine(a, b)
    return value
eval.py 文件源码 项目:Multi-view-neural-acoustic-words-embeddings 作者: opheadacheh 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def acous_text_eval(m, sess, data, lengths, text_data, text_lengths, matches, config):
    """Evaluate acoustic embeddings against text embeddings.

    ``data``/``lengths`` are run through ``m.final_state`` and
    ``text_data``/``text_lengths`` through ``m.word_state`` in batches of
    ``config.eval_batch_size``.  ``cosine(X[i], Y[j])`` is then collected for
    every pair ``i < j`` and the values are split by ``matches`` into same
    and different pairs for ``samediff.average_precision``.

    Prints the average precision and the precision-recall breakeven point
    and returns the average precision.

    NOTE(review): the text batches are also driven by ``len(data)``, which
    presumes ``text_data`` has the same number of items -- confirm.
    NOTE(review): ``matches`` is presumably a boolean array aligned with the
    ``i < j`` pair order used below -- confirm against the caller.
    """
    batch_size = config.eval_batch_size
    n_items = len(data)

    embeddings = []
    for start in range(0, n_items, batch_size):
        stop = start + batch_size
        embeddings.append(sess.run(m.final_state, {m.input_x1: data[start:stop],
                                                   m.input_x1_lengths: lengths[start:stop]}))
    X = np.vstack(embeddings)

    text_embeddings = []
    for start in range(0, n_items, batch_size):
        stop = start + batch_size
        text_embeddings.append(sess.run(m.word_state, {m.input_c1: text_data[start:stop],
                                                       m.input_c1_lengths: text_lengths[start:stop]}))
    Y = np.vstack(text_embeddings)

    distances = np.asarray([cosine(X[i], Y[j])
                            for i in range(n_items)
                            for j in range(i + 1, n_items)])
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    # A single formatted argument prints the same text on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Average precision: {}".format(ap))
    print("Precision-recall breakeven: {}".format(prb))
    return ap
paraphrase.py 文件源码 项目:cluster_paraphrases 作者: acocos 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def sem_clust(self, w2p, simsdict):
        ''' Baseline SEMCLUST method (dynamic thresholding), based on:

        Marianna Apidianaki, Emilia Verzeni, and Diana McCarthy. Semantic
        Clustering of Pivot Paraphrases. In LREC 2014.

        Builds a graph where nodes are words, and edges connect words that
        have a connection in <w2p>. Weights edges by the values given in
        <simsdict>. Every connected component of that graph is added as one
        sense cluster.

        Words missing from <w2p> or <simsdict> are reported on stderr and
        dropped. If exactly one word remains it becomes its own cluster.
        :param w2p: word -> {paraphrase: score} dictionary, used to decide which nodes to connect with edges
        :param simsdict: word -> {paraphrase: score} OR word -> vector, used for edge weights
        :return: None; results are recorded through self.add_sense_cluster
        '''
        self.reset_sense_clustering()
        # Materialise the keys: a Python 3 dict view cannot be indexed below.
        wordlist = list(self.pp_dict.keys())

        oov = [w for w in wordlist if w not in w2p or w not in simsdict]
        if len(oov) > 0:
            sys.stderr.write('WARNING: Paraphrases %s are OOV. '
                             'Removing from ppset.\n' % str(oov))
            wordlist = list(set(wordlist) - set(oov))

        if len(wordlist) == 1:
            self.add_sense_cluster([wordlist[0]])
            return

        # The first entry decides whether simsdict holds vectors or score
        # dictionaries. next(iter(...)) works on Python 2 and 3, and
        # isinstance also accepts dict subclasses.
        first_value = next(iter(simsdict.values()))
        if not isinstance(first_value, dict):
            # Using cosine similarity of word-paraphrase vectors:
            similarities = np.array([[1-cosine(simsdict[i], simsdict[j])
                                      for j in wordlist] for i in wordlist])
        else:
            similarities = np.array([[(1-dict_cosine_dist(simsdict[i], simsdict[j]))
                                      for j in wordlist] for i in wordlist])

        gr = sem_clust.toGraph(similarities, wordlist, self.target_word, w2p)

        for c in nx.connected_components(gr):
            self.add_sense_cluster(c)
paraphrase.py 文件源码 项目:cluster_paraphrases 作者: acocos 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def dict_cosine_dist(u,v):
    """Return ``cosine`` of two sparse vectors given as feature -> value dicts.

    Both dicts are expanded to dense arrays over the sorted union of their
    keys, with 0.0 for a feature that a dict lacks.
    """
    features = sorted(set(u.keys()) | set(v.keys()))
    dense_u = np.array([u.get(name, 0.0) for name in features])
    dense_v = np.array([v.get(name, 0.0) for name in features])
    return cosine(dense_u, dense_v)
model.py 文件源码 项目:MorphForest 作者: j-luo93 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def get_similarity(self, w1, w2):
        """Return ``1.0 - cos_dist`` of the vectors stored for ``w1`` and ``w2``.

        Returns -0.5 when either word is missing from ``self.wv``.
        """
        vectors = self.wv
        if not (w1 in vectors and w2 in vectors):
            return -0.5
        return 1.0 - cos_dist(vectors[w1], vectors[w2])
classifier.py 文件源码 项目:narrative-prediction 作者: roemmele 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def predict(self, seq1, seq2, pred_method='multiply', unigram_probs=None):
        '''Score ``seq2`` as a continuation of ``seq1`` using ``self.model``.

        Right now this function only handles getting the score for one
        sequence pair.

        ``seq1`` is batched according to ``self.flat_input`` and
        ``self.embedded_input`` and passed to ``self.model.predict_on_batch``.
        The output values that correspond to the items of ``seq2`` are then
        selected and reduced according to ``pred_method``:

        * ``'multiply'``: sum of the log values (the log of their product)
        * ``'mean'``: mean of the log values
        * ``'last'``: log of the last selected value
        * ``'max'``: log of the largest selected value

        With flat output and ``unigram_probs`` given, the model output is
        first divided by ``unigram_probs ** 0.66`` and infinite results are
        replaced by 0.0.

        Raises ``ValueError`` for an unknown ``pred_method``; previously this
        surfaced only at the final ``return`` as an unbound local variable.
        '''
        if pred_method not in ('multiply', 'mean', 'last', 'max'):
            # Fail before running the model rather than after.
            raise ValueError('unknown pred_method: {}'.format(pred_method))

        if self.flat_input:
            if self.embedded_input:
                seq1 = seq1[None]
            else:
                seq1 = get_vector_batch([seq1], vector_length=self.lexicon_size+1)
        else:
            seq1 = get_seq_batch([seq1], max_length=self.n_timesteps)

        probs = self.model.predict_on_batch(seq1)[0]

        if self.flat_output:
            if unigram_probs is not None:
                probs = probs / unigram_probs ** 0.66
                probs[numpy.isinf(probs)] = 0.0 #replace inf
            seq2 = get_vector_batch([seq2], vector_length=self.lexicon_size+1)
            # Keep only the entries switched on in seq2's vector.
            probs = probs[seq2[0].astype('bool')]

        else:
            seq2 = get_seq_batch([seq2], padding='post', max_length=self.n_timesteps)

            # Pick, per timestep, the value of the item that occurs in seq2,
            # then drop the positions where seq2 is not positive.
            probs = probs[numpy.arange(self.n_timesteps), seq2]
            probs = probs[seq2 > 0]

        if pred_method == 'multiply':
            prob = numpy.sum(numpy.log(probs))
        elif pred_method == 'mean':
            prob = numpy.mean(numpy.log(probs))
        elif pred_method == 'last':
            prob = numpy.log(probs[-1])
        else:  # 'max'
            prob = numpy.log(numpy.max(probs))
        return prob
classifier.py 文件源码 项目:narrative-prediction 作者: roemmele 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def predict(self, seq1, seq2):
        """Return ``1 - cosine(seq1, seq2)`` after adding 1e-8 to both inputs.

        The small constant is added to every element to avoid NaN results.
        """
        smoothed1 = seq1 + 1e-8
        smoothed2 = seq2 + 1e-8
        return 1 - cosine(smoothed1, smoothed2)
generation_metrics.py 文件源码 项目:narrative-prediction 作者: roemmele 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def get_word2vec_sim(context_seq, gen_seq):
    '''Return the mean word2vec score over the word pairs of two sequences.

    The pairs come from ``get_word_pairs(context_seq, gen_seq)``; each pair is
    scored with ``similarity.word2vec`` on the encoded words and the scores
    are averaged.  When there are no pairs (e.g. the generated sequence might
    be punctuation only) the result is the mean of ``[0]``.'''
    word_pairs = get_word_pairs(context_seq, gen_seq)
    if not word_pairs:
        # Nothing to compare: fall back to a single zero score.
        return numpy.mean([0])
    pair_scores = [similarity.word2vec(encoder(first), encoder(second))
                   for first, second in word_pairs]
    return numpy.mean(pair_scores)


问题


面经


文章

微信
公众号

扫码关注公众号