import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

def correlations(A, B, pc_n=100):
    # Overall Pearson correlation between the flattened matrices
    p = 1 - distance.correlation(A.flatten(), B.flatten())
    spear = spearmanr(A.flatten(), B.flatten())
    # Mean per-row (gene) correlation
    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i], B[i])
    pg = np.average(dist_genes[np.isfinite(dist_genes)])
    # Mean per-column (sample) correlation
    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:, i], B[:, i])
    ps = np.average(dist_sample[np.isfinite(dist_sample)])
    # Absolute cosine similarity between corresponding left singular vectors
    pc_dist = []
    if pc_n > 0:
        u0, s0, vt0 = np.linalg.svd(A)
        u, s, vt = np.linalg.svd(B)
        for i in range(pc_n):
            pc_dist.append(abs(1 - distance.cosine(u0[:, i], u[:, i])))
        pc_dist = np.array(pc_dist)
    return p, spear[0], pg, ps, pc_dist
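A minimal usage sketch with hypothetical data, assuming the imports above: comparing a matrix against a noisy copy of itself, so every reported correlation should come out close to 1.

A = np.random.rand(50, 30)            # made-up demo matrices
B = A + 0.01 * np.random.rand(50, 30)
p, rho, pg, ps, pc_dist = correlations(A, B, pc_n=5)
print(p, rho, pg, ps, pc_dist.shape)  # all values near 1.0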
Example source code for Python's cosine()
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rho is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.
    """
    count = 0
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count += 1
                continue
            gold.append((words, sim))
            # Note: cosine() returns a distance, so rho is expected to be negative
            predicted.append((words, cosine(v1, v2)))
    simlist = lambda ws: [s for w, s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    print("Words not found in WordVector:", count)
    return (rho, len(gold))
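For illustration, a hedged toy call with made-up vectors and reference pairs; wv here is a plain dict, and 'plane' is deliberately out of vocabulary so one pair is skipped.

toy_wv = {'cat': np.array([1.0, 0.0]),
          'dog': np.array([0.9, 0.1]),
          'car': np.array([0.0, 1.0])}
toy_ref = [(('cat', 'dog'), 9.0), (('cat', 'car'), 1.0), (('cat', 'plane'), 5.0)]
rho, n = evaluate1Word(toy_wv, toy_ref)  # rho over the 2 in-vocabulary pairs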
# unsupervised_labels.py (project: NETL-Automatic-Topic-Labelling-, author: sb1992)
def get_best_label(label_list, num):
    topic_ls = get_topic_lg(topic_list[num])
    val_dict = {}
    for item in label_list:
        # Extract letter trigrams for the label and normalize the counts
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys()) | set(label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        # Cosine similarity between topic and label trigram profiles
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))
        val_dict[item] = val
    # Rank the labels by similarity, best first
    list_sorted = sorted(val_dict.items(), key=lambda x: x[1], reverse=True)
    return [i[0] for i in list_sorted[:int(args.num_unsup_labels)]]
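get_best_label depends on project globals (topic_list, args, get_topic_lg) and cannot run standalone. Below is a self-contained sketch of the same letter-trigram idea; trigram_cosine_sim is a hypothetical helper, assuming Counter, np, and cosine are imported as in the snippet above.

def trigram_cosine_sim(s1, s2):
    # Cosine similarity between normalized letter-trigram count profiles
    c1 = Counter(s1[i:i+3] for i in range(len(s1) - 2))
    c2 = Counter(s2[i:i+3] for i in range(len(s2) - 2))
    keys = sorted(set(c1) | set(c2))
    v1 = np.array([c1.get(k, 0) for k in keys], dtype=float)
    v2 = np.array([c2.get(k, 0) for k in keys], dtype=float)
    return 1 - cosine(v1 / v1.sum(), v2 / v2.sum())

print(trigram_cosine_sim('machine learning', 'deep learning'))  # moderate: shared 'learning' trigrams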
def texts_tfidf(ids, important_texts, citations_texts):
    '''
    Generates tf-idf vectors for each text; cosine similarity between
    the vectors is computed separately (see texts_similarity below).
    '''
    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1, 2),
                            min_df=2)
    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()
    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()
    return terms1, terms2, freqs1, freqs2
def texts_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, float)  # zeros, so skipped rows read as 0 rather than garbage
    for i in range(npapers):
        # If one of the vectors is nil, skip it
        if (freqs1[i].sum() == 0.0) or (freqs2[i].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1.getrow(i).toarray()[0])
        fmap2 = to_dict(terms2, freqs2.getrow(i).toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
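to_dict and to_same_dimension are referenced above but not shown in this excerpt; here are plausible reconstructions, hedged, consistent only with how they are called.

def to_dict(terms, freqs):
    # Assumed behavior: {term: freq} map, keeping nonzero entries only
    return {t: f for t, f in zip(terms, freqs) if f != 0.0}

def to_same_dimension(terms, fmap1, fmap2):
    # Assumed behavior: align both maps onto the merged term list as dense vectors
    v1 = np.array([fmap1.get(t, 0.0) for t in terms])
    v2 = np.array([fmap2.get(t, 0.0) for t in terms])
    return v1, v2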
def random_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, float)
    for i in range(npapers):
        # Pair two randomly chosen papers as a baseline
        a = random.randint(0, npapers - 1)
        b = random.randint(0, npapers - 1)
        # If one of the vectors is nil, skip it
        if (freqs1[a].sum() == 0.0) or (freqs2[b].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1[a].toarray()[0])
        fmap2 = to_dict(terms2, freqs2[b].toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
def sanity_check(test_emb, train_emb, num_test):
    '''
    Sanity check on the cosine similarity calculations:
    finds the closest vector in the space by brute force.
    '''
    correct_list = []
    for i in range(num_test):
        smallest_norm = np.inf
        index = 0
        for j in range(len(train_emb)):
            norm = np.linalg.norm(train_emb[j] - test_emb[i])
            if norm < smallest_norm:
                smallest_norm = norm
                index = j
        correct_list.append(index)
    # Pad the list to make it the same length as test_emb
    for i in range(len(test_emb) - num_test):
        correct_list.append(-1)
    return correct_list
def token_similarity(self, words, rwords):
    words = set(words)
    rwords = set(rwords)
    word_vec = np.zeros(self.word_dim)
    rword_vec = np.zeros(self.word_dim)
    word_count = 0
    rword_count = 0
    # Average the embeddings of the (non-stopword) tokens on each side
    for word in words:
        if word in self.word_vec and word not in self.stopwords:
            word_vec += self.word_vec[word]
            word_count += 1
    for word in rwords:
        if word in self.word_vec:
            rword_vec += self.word_vec[word]
            rword_count += 1
    if word_count > 0:
        word_vec = word_vec / word_count
    if rword_count > 0:
        rword_vec = rword_vec / rword_count
    if word_count > 0 and rword_count > 0:
        # Note: cosine() is a distance, so smaller values mean more similar
        return cosine(word_vec, rword_vec)
    else:
        return 1
def nearest_words(self, word, top=20, display=False):
    """
    Find the nearest words to the word
    according to the cosine similarity.
    """
    W = self.W / np.linalg.norm(self.W, axis=0)
    if type(word) == str:
        vec = self.word_vector(word, W)
    else:
        vec = word / np.linalg.norm(word)
    cosines = (vec.T).dot(W)
    args = np.argsort(cosines)[::-1]
    nws = []
    for i in range(1, top + 1):  # start at 1 to skip the query word itself
        nws.append(self.inv_vocab[args[i]])
        if display:
            print(self.inv_vocab[args[i]], round(cosines[args[i]], 3))
    return nws
def argmax_fun(W, indices, argmax_type='levi'):
    """
    Analogy prediction over the column vectors of W; indices = (a, a*, b).
    cosine: b* = argmax cos(b*, b - a + a*)
    levi:   b* = argmax cos(b*, a*) * cos(b*, b) / (cos(b*, a) + eps)
    """
    if argmax_type == 'levi':
        W = W / np.linalg.norm(W, axis=0)
        words3 = W[:, indices]
        cosines = ((words3.T).dot(W) + 1) / 2
        obj = (cosines[1] * cosines[2]) / (cosines[0] + 1e-3)
        pred_idx = np.argmax(obj)
    elif argmax_type == 'cosine':
        words3_vec = W[:, indices].sum(axis=1) - 2 * W[:, indices[0]]  # b - a + a*
        W = W / np.linalg.norm(W, axis=0)
        words3_vec = words3_vec / np.linalg.norm(words3_vec)
        cosines = (words3_vec.T).dot(W)
        pred_idx = np.argmax(cosines)
    return pred_idx
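A worked toy analogy, with made-up embedding columns ordered [king, man, woman, queen]; solving man : king :: woman : ? should pick the 'queen' column.

W_toy = np.array([[0.9, 0.1, 0.1, 0.9],
                  [0.1, 0.9, 0.0, 0.0],
                  [0.0, 0.0, 0.9, 0.9]])
# indices ordered (a='man', a*='king', b='woman'), per the docstring above
print(argmax_fun(W_toy, [1, 0, 2], argmax_type='cosine'))  # 3, i.e. 'queen'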
def synonyms_by_synset(self, synset_name, topn=3):
    ssid = self.id_table[synset_name]
    doc = self.doc_matrix[ssid]
    found_indices = set([ssid])
    synonyms = []
    for _ in range(topn):
        # Linear scan for the nearest not-yet-selected row by cosine distance
        min_index = 0
        min_val = 10
        for i in range(self.doc_matrix.shape[0]):
            cos_dist = cosine(self.doc_matrix[i], doc)
            if i not in found_indices and cos_dist < min_val:
                min_index = i
                min_val = cos_dist
        found_indices.add(min_index)
        synonyms.append((self.definitions[min_index], min_val))
    return synonyms
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.
    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.asarray(sims), norm='l2')
        sils = 1 - sims
    elif method == 'dict_cosine':  # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i, {}), scores.get(j, {})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS':  # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i, {}), scores.get(j, {}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = next(iter(scores.values())).shape[0]
        sils = np.array([[cosine(scores.get(i, np.zeros(d)), scores.get(j, np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s\n' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
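dict_cosine_dist and dict_js_divergence are not included in this excerpt. A hedged reconstruction of the former, consistent with the {word: score} maps it receives (the all-empty and all-zero cases fall through to np.nan_to_num above):

def dict_cosine_dist(d1, d2):
    # Assumed behavior: cosine distance between two sparse {key: score}
    # maps, computed over the union of their keys.
    keys = sorted(set(d1) | set(d2))
    if not keys:
        return 1.0
    v1 = np.array([d1.get(k, 0.0) for k in keys])
    v2 = np.array([d2.get(k, 0.0) for k in keys])
    return cosine(v1, v2)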
def get_sentiment_sim(context_seqs, gen_seqs):
    '''Return the cosine similarity between the sentiment scores of each context
    and the corresponding generated sequences; the sentiment scores are given in spacy.'''
    gen_seqs = check_seqs_format(gen_seqs)
    emotion_types = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD']
    gen_sentiment_sim_scores = []
    for context_seq, gen_seqs_ in zip(context_seqs, gen_seqs):
        context_sentiment = lexicon_methods.emotional_valence(encoder(context_seq))
        # Add a tiny constant to avoid NaN when all scores are 0
        context_sentiment = numpy.array([context_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8
        sentiment_sim_scores = []
        for gen_seq in gen_seqs_:
            gen_sentiment = lexicon_methods.emotional_valence(encoder(gen_seq))
            gen_sentiment = numpy.array([gen_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8
            sentiment_sim = 1 - cosine(context_sentiment, gen_sentiment)
            sentiment_sim_scores.append(sentiment_sim)
        gen_sentiment_sim_scores.append(sentiment_sim_scores)
    gen_sentiment_sim_scores = numpy.array(gen_sentiment_sim_scores)
    return {'sentiment_sim_scores': gen_sentiment_sim_scores,
            'mean_sentiment_sim_scores': numpy.mean(gen_sentiment_sim_scores)}
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)
    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
def find_similar_words(wordvecs):
    """ Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    from sklearn.metrics import pairwise_distances

    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine')  # or metric='euclidean'
    id2word = {}
    for key, value in wordvecs.word_idx_map.items():
        assert value not in id2word
        id2word[value] = key
    while True:
        word = input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print('%s not in the vocabulary.' % word)
            continue  # w_id is undefined here, so skip to the next word
        sim_w_id = pairwise_sim_mat[w_id - 1].argsort()[-10:][::-1]
        print(' '.join(id2word[i + 1] for i in sim_w_id))
def generate_answer(self, msg_text, chat_id):
    # [best distance so far, index of the best-matching message]
    minimum_index = [1 - (10 ** (-5)), -1]
    if chat_id in self.vectorizer:
        t = self.vectorizer[chat_id].transform([msg_text]).toarray()[0]
    else:
        return ""
    for i, t2 in enumerate(self.mat[chat_id].toarray()):
        w = cosine(t, t2)
        if abs(w) <= minimum_index[0]:
            if minimum_index[0] == abs(w):  # equal weight, take the longer message
                if len(self.speech[chat_id][0][i]) > len(self.speech[chat_id][0][minimum_index[1]]):
                    minimum_index[1] = i
            else:  # not equal, take the lower weight
                minimum_index[0] = w
                minimum_index[1] = i
    if minimum_index[1] == -1 or minimum_index[0] > 0.85:  # no message found or score too bad
        return ""
    from_sent_id = self.speech[chat_id][1][minimum_index[1]]
    # Reply with the next message in the log that came from a different sender
    for i in range(1, 5):
        try:
            if from_sent_id != self.speech[chat_id][1][minimum_index[1] + i]:
                return self.speech[chat_id][0][minimum_index[1] + i]
        except IndexError:
            return ""
    return ""
def most_similar(self, word, num_similar=5):
    idx = self._w2idx[word]
    y = list(range(self._matrix.shape[0]))
    y.pop(idx)  # exclude the query word itself
    # (distance, index) pairs, kept sorted so [-1] is always the current worst
    most_similar = [(1, 0)] * num_similar
    for i in y:
        dist = cosine(self._matrix[idx], self._matrix[i])
        if dist < most_similar[-1][0]:
            most_similar.pop()
            most_similar.append((dist, i))
            most_similar.sort()  # re-sort so the comparison above stays valid
    most_similar = [(distance, self._idx2w[i]) for (distance, i) in most_similar]
    return most_similar
def all_col_dist(m):
    # Pairwise cosine distances between all columns of m
    D = m.shape[1]
    d = np.zeros((D, D))
    for i in range(D):
        div = m[:, i]
        for j in range(D):
            djv = m[:, j]
            d[j][i] = cosine(div, djv)
    return d
def choose_best_action(self, list_of_words):
    min_distance = 3
    best_matching_action = None
    tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
    current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
    for action, centroid in tf_idf_shelve[CENTROID].items():
        distance = cosine(centroid, current_sentence_centroid)
        print(action, distance)
        if distance <= min_distance:
            min_distance = distance
            best_matching_action = action
    tf_idf_shelve.close()
    return current_sentence_centroid, best_matching_action, min_distance
def calAvgSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    # AvgSimC: expected cosine similarity over all sense pairs,
    # weighted by each word's sense probabilities
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        thisCos = []
        p1 = senseScore1[t]
        p2 = senseScore2[t]
        for i in range(len(senseVec1[t])):
            for j in range(len(senseVec2[t])):
                thisCos.append((1 - cosine(senseVec1[t][i], senseVec2[t][j])) * p1[i] * p2[j])
        avgCos.append(np.sum(thisCos))
    return spearmanr(test_score, avgCos)[0]
def calMaxSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    # MaxSimC: cosine similarity between each word's single most probable sense
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        i = np.argmax(senseScore1[t])
        j = np.argmax(senseScore2[t])
        thisCos = 1 - cosine(senseVec1[t][i], senseVec2[t][j])
        avgCos.append(thisCos)
    return spearmanr(test_score, avgCos)[0]
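A hedged toy check of both metrics, with two test items and made-up two-sense vectors; in this construction both metrics rank the items in the gold order, so each call returns rho = 1.0.

sv_a = [np.array([[1.0, 0.0], [0.0, 1.0]]), np.array([[1.0, 0.0], [0.0, 1.0]])]
sv_b = [np.array([[1.0, 0.0], [0.0, 1.0]]), np.array([[0.0, 1.0], [1.0, 0.0]])]
pr = [np.array([0.9, 0.1]), np.array([0.5, 0.5])]   # hypothetical sense probabilities
gold = [2.0, 1.0]
print(calAvgSimC(gold, sv_a, pr, sv_b, pr))  # 1.0
print(calMaxSimC(gold, sv_a, pr, sv_b, pr))  # 1.0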
# sentence-similarity.py (project: visually-grounded-speech, author: gchrupala)
def cosine_similarity(a, b):
    # Returns the cosine similarity between a and b
    return 1.0 - cosine(a, b)
def cosine_similarities(a, b, transform):
    """
    Returns a list of cosine similarities between lists of vectors
    a and b. The z_score transformation is applied if transform == True.
    """
    a = numpy.stack(a)
    b = numpy.stack(b)
    # Transform if requested
    if transform:
        print("transforming")
        # z_score is written to apply the same scale to a and b
        a, b = z_score(a, b)
    print("calculating cosine dists")
    cos = [cosine_similarity(a[i], b[i]) for i in range(len(a))]
    return cos
def delta(u, v):
    """ cosine ° sigmoid
    >>> delta([0.2], [0.3])
    0.5
    >>> delta([0.3], [0.2])
    0.5
    >>> delta([0.1,0.9], [-0.9,0.1]) == delta([-0.9,0.1], [0.1,0.9])
    True
    """
    # TODO scale with a and c
    return expit(cosine(u, v))
def reduncy(sen_vec, doc_vec):
    # Redundancy (sic): similarity of the sentence to the rest of the document
    return 1 - cosine(sen_vec, (doc_vec - sen_vec))

def relavence(sen_vec, doc_vec):
    # Relevance (sic): similarity of the sentence to the whole document
    return 1 - cosine(sen_vec, doc_vec)
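These two scores read like the ingredients of an MMR-style sentence selector. A hedged sketch of how they might be combined; mmr_score and the lambda weighting are illustrative, not from the original project.

def mmr_score(sen_vec, doc_vec, lam=0.7):
    # Hypothetical trade-off: reward relevance to the whole document,
    # penalize redundancy with the rest of it (lam is a made-up default)
    return lam * relavence(sen_vec, doc_vec) - (1 - lam) * reduncy(sen_vec, doc_vec)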
def compute_distance(query_channel, channel, mean_vec, distance_type='eucos'):
    """ Compute the specified distance type between channels of the mean vector and the query image.
    In the caffe library, the FC8 layer consists of 10 channels. Here, we compute the distance
    of each channel (from the query image) to the respective channel of the
    Mean Activation Vector. In the paper, we considered the hybrid distance eucos, which
    combines euclidean and cosine distance for bounding open space. Alternatively,
    other distances such as euclidean or cosine can also be used.
    Input:
    --------
    query_channel: particular FC8 channel of the query image
    channel: channel number under consideration
    mean_vec: mean activation vector
    Output:
    --------
    query_distance: distance between the respective channels
    """
    query_distance = None  # stays None for unknown distance types
    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200. + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        print("distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance
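A minimal usage sketch with made-up shapes, assuming scipy.spatial.distance is imported as spd (as the function body implies) and numpy as np.

mean_vec = np.random.rand(10, 5)   # hypothetical: 10 channels x 5 activations each
query_channel = np.random.rand(5)
print(compute_distance(query_channel, 3, mean_vec))            # eucos (default)
print(compute_distance(query_channel, 3, mean_vec, 'cosine'))  # cosine only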
def cmp_vectors(v1, v2):
    # c = cosine(normed(v1), normed(v2))
    # c = cosine(v1, v2)
    c = v1 @ v2
    return c
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        # Whitening should be implemented in wvlib to support use together
        # with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)
    return words, vectors, wv, options