Python cosine() usage examples (source code)

analyze_predictions.py (project: CS-SMAF, author: brian-cleary)
def correlations(A,B,pc_n=100):
    p = (1 - distance.correlation(A.flatten(),B.flatten()))
    spear = spearmanr(A.flatten(),B.flatten())
    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i],B[i])
    pg = (np.average(dist_genes[np.isfinite(dist_genes)]))
    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:,i],B[:,i])
    ps = (np.average(dist_sample[np.isfinite(dist_sample)]))
    pc_dist = []
    if pc_n > 0:
        u0,s0,vt0 = np.linalg.svd(A)
        u,s,vt = np.linalg.svd(B)
        for i in range(pc_n):
            pc_dist.append(abs(1 - distance.cosine(u0[:,i],u[:,i])))
        pc_dist = np.array(pc_dist)
    return p,spear[0],pg,ps,pc_dist
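The snippet assumes imports from its surrounding file. A minimal invocation sketch (matrix shapes and names are made up):

import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

A = np.random.rand(50, 20)             # e.g. ground-truth expression matrix
B = A + 0.1 * np.random.rand(50, 20)   # a reconstruction of A
p, spear, pg, ps, pc_dist = correlations(A, B, pc_n=10)
print(p)  # overall correlation of the flattened matrices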
evalrank.py (project: BioNLP-2016, author: cambridgeltl)
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rwo is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.
    """
    count=0
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            #print words[0],words[1]
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count+=1
                continue
            #print words
            gold.append((words, sim))
            predicted.append((words, cosine(v1, v2)))

    simlist = lambda ws: [s for w,s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    print "Word not found in WordVector",count
    return (rho, len(gold))
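A hedged usage sketch with a toy vocabulary; it assumes `cosine(v1, v2)` returns a similarity score (if scipy's cosine distance were used instead, the ranking would be inverted):

import numpy as np
from scipy.stats import spearmanr

def cosine(v1, v2):  # assumed similarity helper, not scipy's distance
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

wv = {'cat': np.array([1.0, 0.1]), 'dog': np.array([0.9, 0.2]),
      'car': np.array([0.1, 1.0])}
reference = [(('cat', 'dog'), 0.9), (('cat', 'car'), 0.2), (('dog', 'car'), 0.1)]
rho, n = evaluate1Word(wv, reference)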
unsupervised_labels.py (project: NETL-Automatic-Topic-Labelling-, author: sb1992)
def get_best_label(label_list,num):
    topic_ls = get_topic_lg(topic_list[num])
    val_dict = {}
    for item in label_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)] #Extracting letter trigram for label
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys() + label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic),np.array(listlabel))   # Cosine Similarity
        val_dict[item] = val
    list_sorted=sorted(val_dict.items(), key=lambda x:x[1], reverse = True) # Sorting the labels by rank
    return [i[0] for i in list_sorted[:int(args.num_unsup_labels)]]
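For intuition, the letter-trigram step above can be reproduced standalone; in this sketch the helper name `trigram_profile` is mine, but it scores two strings the same way:

from collections import Counter
import numpy as np
from scipy.spatial.distance import cosine

def trigram_profile(s):
    cnt = Counter([s[i:i+3] for i in range(len(s) - 2)])
    total = float(sum(cnt.values()))
    return dict((k, v / total) for k, v in cnt.items())

p1, p2 = trigram_profile("topic modeling"), trigram_profile("topic models")
keys = sorted(set(p1) | set(p2))
v1 = np.array([p1.get(k, 0.0) for k in keys])
v2 = np.array([p2.get(k, 0.0) for k in keys])
print(1 - cosine(v1, v2))  # letter-trigram cosine similarity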
clouds.py (project: KDDCUP2016, author: hugochan)
def texts_tfidf(ids, important_texts, citations_texts) :
    '''
    Generates tf-idf vectors for each text collection; cosine similarity
    between the vectors is computed downstream (see texts_similarity).
    '''

    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1,2),
                            min_df=2)

    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()

    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()

    return terms1, terms2, freqs1, freqs2
clouds.py (project: KDDCUP2016, author: hugochan)
def texts_similarity(terms1, terms2, freqs1, freqs2) :

    # Merge all terms
    terms = list(set(terms1 + terms2))

    npapers = freqs1.shape[0]
    sims = np.empty(npapers, np.float)

    for i in xrange(npapers) :

        # If one of the vectors is nil, skip it
        if (freqs1[i].sum()==0.0) or (freqs2[i].sum()==0.0) :
            continue

        # Changes representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1.getrow(i).toarray()[0])
        fmap2 = to_dict(terms2, freqs2.getrow(i).toarray()[0])

        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)

        sims[i] = 1.0-cosine(vec1, vec2)

    return sims
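The helpers `to_dict` and `to_same_dimension` are not shown in this excerpt; a plausible minimal reconstruction, inferred from how they are called above:

import numpy as np

def to_dict(terms, freqs):
    # Represent a dense row as a {term: freq} map, dropping zeros
    return dict((t, f) for t, f in zip(terms, freqs) if f != 0.0)

def to_same_dimension(terms, fmap1, fmap2):
    # Align both maps onto the same merged term order
    vec1 = np.array([fmap1.get(t, 0.0) for t in terms])
    vec2 = np.array([fmap2.get(t, 0.0) for t in terms])
    return vec1, vec2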
clouds.py (project: KDDCUP2016, author: hugochan)
def random_similarity(terms1, terms2, freqs1, freqs2) :

    # Merge all terms
    terms = list(set(terms1 + terms2))

    npapers = freqs1.shape[0]
    sims = np.empty(npapers, np.float)

    for i in xrange(npapers) :
        a = random.randint(0,npapers-1)  #@UndefinedVariable
        b = random.randint(0,npapers-1)  #@UndefinedVariable

        # If one of the vectors is nil, skip it
        if (freqs1[a].sum()==0.0) or (freqs2[b].sum()==0.0) :
            continue

        # Changes representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1[a].toarray()[0])
        fmap2 = to_dict(terms2, freqs2[b].toarray()[0])

        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)

        sims[i] = 1.0-cosine(vec1, vec2)

    return sims
tfidf_retrieval.py (project: ADEM, author: mike-n-7)
def sanity_check(test_emb, train_emb, num_test):
    '''
    Sanity check on the cosine similarity calculations
    Finds the closest vector in the space by brute force
    '''
    correct_list = []
    for i in xrange(num_test):
        smallest_norm = np.infty
        index = 0
        for j in xrange(len(train_emb)):
            norm = np.linalg.norm(train_emb[j] - test_emb[i])
            if norm < smallest_norm:
                smallest_norm = norm
                index = j
        correct_list.append(index)
    # Pad the list to make it the same length as test_emb
    for i in xrange(len(test_emb) - num_test):
        correct_list.append(-1)
    return correct_list
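A toy check (shapes invented): when the first test vectors are copied from the training set, the brute-force search should recover their own indices:

import numpy as np

train_emb = np.random.rand(100, 8)
test_emb = np.vstack([train_emb[:5], np.random.rand(3, 8)])
print(sanity_check(test_emb, train_emb, num_test=5))
# expected: [0, 1, 2, 3, 4, -1, -1, -1]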
ranker.py (project: scanner, author: cheng6076)
def token_similarity(self, words ,rwords):
        words = set(words)
        rwords = set(rwords)
        word_vec = np.zeros(self.word_dim)
        rword_vec = np.zeros(self.word_dim)
        word_count = 0
        rword_count = 0
        for word in words:
            if self.word_vec.has_key(word) and word not in self.stopwords:
                word_vec += self.word_vec[word]
                word_count += 1
        for word in rwords:
            if self.word_vec.has_key(word):
                rword_vec += self.word_vec[word]
                rword_count += 1
        if word_count > 0:
            word_vec = word_vec / word_count
        if rword_count > 0:
            rword_vec = rword_vec / rword_count
        if word_count>0 and rword_count>0:
            return cosine(word_vec, rword_vec)
        else:
            return 1
word2vec_as_MF.py (project: ro_sgns, author: AlexGrinch)
def nearest_words(self, word, top=20, display=False):
        """
        Find the nearest words to the word 
        according to the cosine similarity.
        """

        W = self.W / np.linalg.norm(self.W, axis=0)   
        if (type(word)==str):
            vec = self.word_vector(word, W)
        else:
            vec = word / np.linalg.norm(word)

        cosines = (vec.T).dot(W)
        args = np.argsort(cosines)[::-1]       

        nws = []
        for i in xrange(1, top+1):
            nws.append(self.inv_vocab[args[i]])
            if (display):
                print self.inv_vocab[args[i]], round(cosines[args[i]],3)

        return nws
functions.py (project: ro_sgns, author: AlexGrinch)
def argmax_fun(W, indices, argmax_type='levi'):
    """
    cosine: b* = argmax cosine(b*, b - a + a*) 
    levi: b* = argmax cos(b*,a*)cos(b*,b)/(cos(b*,a)+eps)
    """

    if (argmax_type == 'levi'):
        W = W / np.linalg.norm(W, axis=0)
        words3 = W[:, indices]
        cosines = ((words3.T).dot(W) + 1) / 2
        obj = (cosines[1] * cosines[2]) / (cosines[0] + 1e-3)
        pred_idx = np.argmax(obj)

    elif (argmax_type == 'cosine'):
        words3_vec = W[:, indices].sum(axis=1) - 2*W[:, indices[0]]
        W = W / np.linalg.norm(W, axis=0)
        words3_vec = words3_vec / np.linalg.norm(words3_vec)
        cosines = (words3_vec.T).dot(W)
        pred_idx = np.argmax(cosines)

    return pred_idx
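Both branches implement word-analogy objectives over column embeddings: 'cosine' is the additive b - a + a* rule, and 'levi' resembles the multiplicative 3CosMul objective of Levy and Goldberg. A toy call with a random matrix, assuming columns of W are word vectors and indices = [a, b, a*]:

import numpy as np

W = np.random.rand(50, 1000)   # embedding_dim x vocab_size
a, b, a_star = 3, 17, 42       # vocabulary indices of the analogy terms
print(argmax_fun(W, [a, b, a_star], argmax_type='levi'))
print(argmax_fun(W, [a, b, a_star], argmax_type='cosine'))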
thesaurus.py (project: abc, author: daemon)
def synonyms_by_synset(self, synset_name, topn=3):
    ssid = self.id_table[synset_name]
    doc = self.doc_matrix[ssid]
    found_indices = set([ssid])
    synonyms = []
    for _ in range(topn):
      min_index = 0
      min_val = 10
      for i in range(self.doc_matrix.shape[0]):
        cos_dist = cosine(self.doc_matrix[i], doc)
        if i not in found_indices and cos_dist < min_val:
          min_index = i
          min_val = cos_dist
      found_indices.add(min_index)
      synonyms.append((self.definitions[min_index], min_val))
    return synonyms
paraphrase.py (project: cluster_paraphrases, author: acocos)
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method =='direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1-sims
    elif method == 'dict_cosine': # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i,{}),scores.get(j,{})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS': # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i,{}),scores.get(j,{}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = scores.values()[0].shape[0]
        sils = np.array([[cosine(scores.get(i,np.zeros(d)), scores.get(j,np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
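`dict_cosine_dist` and `dict_js_divergence` are defined elsewhere in the project; a minimal reconstruction of the cosine variant over sparse {word: score} maps, as its usage above suggests:

import math

def dict_cosine_dist(d1, d2):
    # Cosine distance between two sparse {key: weight} vectors
    dot = sum(v * d2.get(k, 0.0) for k, v in d1.items())
    n1 = math.sqrt(sum(v * v for v in d1.values()))
    n2 = math.sqrt(sum(v * v for v in d2.values()))
    if n1 == 0.0 or n2 == 0.0:
        return 1.0  # treat empty vectors as maximally distant
    return 1.0 - dot / (n1 * n2)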
generation_metrics.py (project: narrative-prediction, author: roemmele)
def get_sentiment_sim(context_seqs, gen_seqs):
    '''return the cosine similarity between the sentiment scores of each context and the corresponding generated sequence;
    the sentiment scores are computed from spaCy-processed text'''
    gen_seqs = check_seqs_format(gen_seqs)
    emotion_types = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD']
    gen_sentiment_sim_scores = []
    for context_seq, gen_seqs_ in zip(context_seqs, gen_seqs):
        context_sentiment = lexicon_methods.emotional_valence(encoder(context_seq))
        context_sentiment = numpy.array([context_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8 #add tiny number to avoid NaN when all scores are 0
        sentiment_sim_scores = []
        for gen_seq in gen_seqs_:
            gen_sentiment = lexicon_methods.emotional_valence(encoder(gen_seq))
            gen_sentiment = numpy.array([gen_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8 #add tiny number to avoid NaN when all scores are 0
            sentiment_sim = 1 - cosine(context_sentiment, gen_sentiment)
            sentiment_sim_scores.append(sentiment_sim)
        gen_sentiment_sim_scores.append(sentiment_sim_scores)

    gen_sentiment_sim_scores = numpy.array(gen_sentiment_sim_scores)
    return {'sentiment_sim_scores': gen_sentiment_sim_scores, 'mean_sentiment_sim_scores': numpy.mean(gen_sentiment_sim_scores)}
test_pairwise.py (project: Parallel-SGD, author: angadgill)
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine is kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
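The identity under test is easy to verify by hand on a tiny matrix:

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import normalize

X = np.array([[3.0, 4.0], [1.0, 0.0]])
K_cos = pairwise_kernels(X, metric='cosine')
K_lin = pairwise_kernels(normalize(X), metric='linear')
print(np.allclose(K_cos, K_lin))  # True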
prep_wikiqa_data.py (project: answer-triggering, author: jiez-osu)
def find_similar_words(wordvecs):
    """ Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    from sklearn.metrics import pairwise_distances
    from scipy.spatial.distance import cosine
    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine',
                                              # metric='euclidean',
                                              )

    id2word = {}
    for key, value in wordvecs.word_idx_map.iteritems():
        assert(value not in id2word)
        id2word[value] = key
    while True:
        word = raw_input("Enter a word ('STOP' to quit): ")
        if word == 'STOP': break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print '%s not in the vocabulary.' % word
            continue
        sim_w_id  = pairwise_sim_mat[w_id-1].argsort()[-10:][::-1]
        for i in sim_w_id:
            print id2word[i+1],
        print ''
abb1t.py (project: Abb1t, author: k-freeman)
def generate_answer(self, msg_text, chat_id):
        minimum_index=[1-(10**(-5)),-1] # min value / minimum index
        if chat_id in self.vectorizer:
            t=self.vectorizer[chat_id].transform([msg_text]).toarray()[0]
        else:
            reply=""
            return
        for i,t2 in enumerate(self.mat[chat_id].toarray()):
            w=cosine(t,t2)
            if abs(w)<=minimum_index[0]:
                if minimum_index[0] == abs(w): # equal weight, lets take the longer message
                    if len(self.speech[chat_id][0][i]) > len(self.speech[chat_id][0][minimum_index[1]]):
                        minimum_index[1] = i
                else: #not equal, take the lower weight
                    minimum_index[0] = w
                    minimum_index[1] = i

        if minimum_index[1]==-1 or minimum_index[0]>0.85: # no message found or score too bad
            return ""

        from_sent_id = self.speech[chat_id][1][minimum_index[1]]
        for i in range(1,5):
            try:
                if from_sent_id != self.speech[chat_id][1][minimum_index[1]+i]:
                    return self.speech[chat_id][0][minimum_index[1]+i]
            except IndexError:
                return ""
        return ""
WordVecs.py (project: sota_sentiment, author: jbarnesspain)
def most_similar(self, word, num_similar=5):
        idx = self._w2idx[word]
        y = list(range(self._matrix.shape[0]))
        y.pop(idx)
        most_similar = [(1,0)] * num_similar
        for i in y:
            dist = 0
            dist = cosine(self._matrix[idx], self._matrix[i])
            if dist < most_similar[-1][0]:
                most_similar.pop()
                most_similar.append((dist,i))
                most_similar = sorted(most_similar)
        most_similar = [(distance, self._idx2w[i]) for (distance, i) in most_similar]
        return most_similar
lsa.py (project: uci-statnlp, author: sameersingh)
def all_col_dist(m):
    D = m.shape[1]
    d = np.zeros((D,D))
    for i in xrange(D):
        div = m[:,i]
        for j in xrange(D):
            djv = m[:,j]
            d[j][i] = cosine(div,djv)
    return d
Assistant.py (project: Personal_AI_Assistant, author: PratylenClub)
def choose_best_action(self, list_of_words):
        min_distance = 3
        best_matching_action = None
        tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
        current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
        for action,centroid in tf_idf_shelve[CENTROID].iteritems():
            distance = cosine(centroid,current_sentence_centroid)
            print action,distance
            if distance <= min_distance:
                min_distance = distance
                best_matching_action = action
        tf_idf_shelve.close()
        return current_sentence_centroid, best_matching_action, min_distance
test_eol.py (project: Personal_AI_Assistant, author: PratylenClub)
def choose_best_action(self, list_of_words):
        min_distance = 3
        best_matching_action = None
        tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
        current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
        for action,centroid in tf_idf_shelve[CENTROID].iteritems():
            distance = cosine(centroid,current_sentence_centroid)
            print action,distance
            if distance <= min_distance:
                min_distance = distance
                best_matching_action = action
        tf_idf_shelve.close()
        return current_sentence_centroid, best_matching_action, min_distance
utils.py (project: MUSE, author: MiuLab)
def calAvgSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
  assert(len(senseVec1)==len(senseVec2))
  avgCos = []
  for t in xrange(len(senseVec1)):
    thisCos = []
    p1 = (senseScore1[t])
    p2 = (senseScore2[t])
    for i in xrange(len(senseVec1[t])):
      for j in xrange(len(senseVec2[t])):
        thisCos.append((1-cosine(senseVec1[t][i],senseVec2[t][j]))*p1[i]*p2[j])
    avgCos.append(np.sum(thisCos))
  return spearmanr(test_score, avgCos)[0]
utils.py (project: MUSE, author: MiuLab)
def calMaxSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
  assert(len(senseVec1)==len(senseVec2))
  avgCos = []
  for t in xrange(len(senseVec1)):
    i = np.argmax(senseScore1[t])
    j = np.argmax(senseScore2[t])
    thisCos = (1-cosine(senseVec1[t][i],senseVec2[t][j])) 
    avgCos.append(thisCos)
  return spearmanr(test_score, avgCos)[0]
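Both metrics score multi-prototype (multi-sense) embeddings against human similarity judgments: AvgSimC weights every cross-sense pair by its context probability, while MaxSimC keeps only the most probable sense on each side. A toy invocation (all shapes invented):

import numpy as np

n_pairs, n_senses, dim = 10, 3, 5
senseVec1 = np.random.rand(n_pairs, n_senses, dim)
senseVec2 = np.random.rand(n_pairs, n_senses, dim)
senseScore1 = np.random.rand(n_pairs, n_senses)
senseScore2 = np.random.rand(n_pairs, n_senses)
test_score = np.random.rand(n_pairs)   # human similarity ratings
print(calAvgSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2))
print(calMaxSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2))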
sentence-similarity.py (project: visually-grounded-speech, author: gchrupala)
def cosine_similarity(a, b):
    # returns cosine similarity between a and b
    return 1.0-cosine(a, b)
sentence-similarity.py (project: visually-grounded-speech, author: gchrupala)
def cosine_similarities(a, b, transform):
    """
    returns list of cosine similarities between lists of vectors
    a and b. The z_score transformation is applied if transform == True
    """
    a = numpy.stack(a)
    b = numpy.stack(b)
    #transform if requested
    if transform:
        print "transforming"
        # z_score is written to apply same scale to a and b
        a, b = z_score(a, b)
    print "calculating cosine dists"
    cos = [cosine_similarity(a[i], b[i]) for i in range(len(a))]
    return cos
eqlm.py (project: vec4ir, author: lgalke)
def delta(u, v):
    """ cosine ° sigmoid
    >>> delta([0.2], [0.3])
    0.5
    >>> delta([0.3], [0.2])
    0.5
    >>> delta([0.1,0.9], [-0.9,0.1]) == delta([-0.9,0.1], [0.1,0.9])
    True
    """
    # TODO scale with a and c
    return expit(cosine(u, v))
lr.py (project: NeuralSum, author: cheng6076)
def reduncy(sen_vec, doc_vec):
        return 1 - cosine(sen_vec, (doc_vec - sen_vec))
lr.py (project: NeuralSum, author: cheng6076)
def relavence(sen_vec, doc_vec): 
        return 1 - cosine(sen_vec, doc_vec)
openmax_utils.py (project: OSDN, author: abhijitbendale)
def compute_distance(query_channel, channel, mean_vec, distance_type = 'eucos'):
    """ Compute the specified distance type between chanels of mean vector and query image.
    In caffe library, FC8 layer consists of 10 channels. Here, we compute distance
    of distance of each channel (from query image) with respective channel of
    Mean Activation Vector. In the paper, we considered a hybrid distance eucos which
    combines euclidean and cosine distance for bouding open space. Alternatively,
    other distances such as euclidean or cosine can also be used. 

    Input:
    --------
    query_channel: Particular FC8 channel of query image
    channel: channel number under consideration
    mean_vec: mean activation vector

    Output:
    --------
    query_distance : Distance between respective channels

    """

    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200. + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        print "distance type not known: enter either of eucos, euclidean or cosine"
    return query_distance
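A hedged invocation sketch; `spd` is assumed to be scipy.spatial.distance, and the Mean Activation Vector is stacked one row per channel:

import numpy as np
import scipy.spatial.distance as spd

mean_vec = np.random.rand(10, 512)    # 10 FC8 channels x feature dim
query_channel = np.random.rand(512)   # one channel of the query image
print(compute_distance(query_channel, 3, mean_vec, distance_type='eucos'))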
time_similarity.py (project: vsmlib, author: undertherain)
def cmp_vectors(v1, v2):
    # c = cosine(normed(v1), normed(v2))
    # c = cosine(v1, v2)
    c = v1 @ v2
    return c
pairdist.py (project: BioNLP-2016, author: cambridgeltl)
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support together with
        # approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options

