Python pairwise_distances() usage examples (source code)

neighbors.py (project: FreeDiscovery, author: FreeDiscovery)
def decision_function(self, X):
        """Compute the distances to the nearest centroid for
        an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Test vectors.

        Returns
        -------
        C : array, shape = [n_samples]
            Distance of each sample to its nearest centroid.
        """
        from sklearn.metrics.pairwise import pairwise_distances
        from sklearn.utils.validation import check_array, check_is_fitted

        check_is_fitted(self, 'centroids_')

        X = check_array(X, accept_sparse='csr')

        return pairwise_distances(X, self.centroids_,
                                  metric=self.metric).min(axis=1)
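
A minimal standalone sketch of the same computation (the data shapes and metric below are illustrative, not taken from FreeDiscovery):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
X = rng.randn(5, 4)           # 5 test vectors (illustrative)
centroids = rng.randn(3, 4)   # 3 fitted centroids (illustrative)

# Distance from each sample to every centroid; keep the nearest one per row.
dist_to_nearest = pairwise_distances(X, centroids, metric='euclidean').min(axis=1)
print(dist_to_nearest.shape)  # (5,)
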
test_similarity.py (project: sef, author: passalis)
def test_similarity_calculations():
    """
    Tests the implementation of fast similarity calculations with the PyTorch
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    for sigma in [0.01, 0.1, 0.5, 1]:
        A = np.random.randn(10, 23)
        sef_sim = fast_heat_similarity_matrix(A, sigma)

        assert sef_sim.shape[0] == 10
        assert sef_sim.shape[1] == 10

        sim = np.exp(-pairwise_distances(A, A)**2/sigma**2)
        assert np.sum((sef_sim - sim) ** 2) < 1e-3
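
The NumPy reference used in the assertion is simply the heat (Gaussian) kernel applied to pairwise Euclidean distances; a standalone sketch of that reference:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def heat_similarity_matrix(A, sigma):
    """Reference heat-kernel similarity: S[i, j] = exp(-d(A[i], A[j])**2 / sigma**2)."""
    D = pairwise_distances(A, A)
    return np.exp(-D ** 2 / sigma ** 2)
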
test_metrics.py (project: FreeDiscovery, author: FreeDiscovery)
def test_cosine2jaccard():
    from sklearn.metrics.pairwise import pairwise_distances
    from freediscovery.metrics import (cosine2jaccard_similarity,
                                       jaccard2cosine_similarity)

    x = np.array([[0, 0, 1., 1.]])
    y = np.array([[0, 1., 1., 0]])

    S_cos = 1 - pairwise_distances(x, y, metric='cosine')
    S_jac = cosine2jaccard_similarity(S_cos)
    S_jac_ref = 1 - pairwise_distances(x.astype('bool'), y.astype('bool'), metric='jaccard')

    assert_allclose(S_jac, S_jac_ref)

    S_cos2 = jaccard2cosine_similarity(S_jac)
    assert_allclose(S_cos2, S_cos)
utils.py (project: FreeDiscovery, author: FreeDiscovery)
def centroid_similarity(X, internal_ids, nn_metric='cosine'):
    """ Given a list of documents in a cluster, compute the cluster centroid,
    intertia and individual distances

    Parameters
    ----------
    internal_ids : list
      a list of internal ids
    nn_metric : str
      a rescaling of the metric if needed
    """
    from ..metrics import _scale_cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances

    X_sl = X[internal_ids, :]
    centroid = X_sl.mean(axis=0)

    if centroid.ndim == 1:
        centroid = centroid[None, :]

    S_cos = 1 - pairwise_distances(X_sl, centroid, metric='cosine')
    S_sim = _scale_cosine_similarity(S_cos, metric=nn_metric)
    S_sim_mean = np.mean(S_sim)
    return float(S_sim_mean), S_sim[:, 0]
tfidf.py (project: newsgraph, author: exchez)
def query(vec, model, k, max_search_radius):

    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']
    num_vector = random_vectors.shape[1]

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in range(max_search_radius+1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius, initial_candidates=candidate_set)

    # Sort candidates by their true distances from the query
    candidate_ids = list(candidate_set)
    nearest_neighbors = pd.DataFrame({'id': candidate_ids})
    candidates = data[np.array(candidate_ids), :]
    nearest_neighbors['distance'] = pairwise_distances(candidates, vec, metric='cosine').flatten()

    return nearest_neighbors.sort_values(by='distance').head(k), len(candidate_set)
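
The final re-ranking step is independent of the LSH bin search: given candidate rows and a query vector, cosine distances from pairwise_distances order the candidates. A self-contained sketch with made-up shapes and ids:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
data = rng.rand(100, 16)          # candidate corpus (illustrative)
vec = rng.rand(1, 16)             # query vector as a single row
candidate_ids = [3, 17, 42, 64]   # ids returned by the bin search (illustrative)

candidates = data[np.array(candidate_ids), :]
result = pd.DataFrame({'id': candidate_ids})
result['distance'] = pairwise_distances(candidates, vec, metric='cosine').flatten()
print(result.sort_values(by='distance').head(3))
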
FeatureSelect.py (project: CIKM_AnalytiCup_2017, author: zxth93)
def pre_train(train_df, test_df, train_add, test_add):

    train = train_df.values[:,1:-1]
    t = train_add.values[:,1:-1]
    train = np.hstack((train, t))

    dtest = test_df.values[:,1:]
    tA = test_add.values[:,1:]
    dtest = np.hstack((dtest, tA))

    cor_distance = pairwise.pairwise_distances(dtest, train)

    resultset = set()
    for row in cor_distance:
        # keep the 10 nearest training samples for each test sample
        nearest = np.argsort(row)[:10]
        resultset.update(nearest)

    return list(resultset)
classification.py (project: Default-Credit-Card-Prediction, author: AlexPnt)
def predict(self, X):
        """
        Classify the input data assigning the label of the nearest prototype

        Keyword arguments:
        X -- The feature vectors
        """
        classification = np.zeros(len(X))

        if self.distance_metric in ("euclidean", "minkowski", "manhattan", "mahalanobis"):
            # compute distances to the prototypes (template matching)
            distances = pairwise_distances(X, self.M_, metric=self.distance_metric)
        else:
            distances = pairwise_distances(X, self.M_, metric="euclidean")

        # assign each sample the class of its nearest prototype
        for i in range(len(X)):
            classification[i] = self.outcomes[np.argmin(distances[i])]

        return classification
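
The per-sample loop above can also be replaced by a fully vectorized nearest-prototype assignment; a minimal sketch, assuming prototypes corresponds to self.M_ and outcomes to self.outcomes:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def predict_nearest_prototype(X, prototypes, outcomes, metric="euclidean"):
    """Assign each row of X the label of its nearest prototype."""
    distances = pairwise_distances(X, prototypes, metric=metric)
    nearest = distances.argmin(axis=1)      # index of the closest prototype per sample
    return np.asarray(outcomes)[nearest]
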
test_similarity.py (project: sef, author: passalis)
def test_distance_calculations():
    """
    Tests the implementation of fast distance calculations with the PyTorch
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    A = np.random.randn(10, 23)
    B = np.random.randn(5, 23)

    sef_dists = fast_distance_matrix(A, B)

    assert sef_dists.shape[0] == 10
    assert sef_dists.shape[1] == 5

    dists = pairwise_distances(A, B)

    assert np.sum((sef_dists - dists) ** 2) < 1e-3
similarity.py (project: sef, author: passalis)
def mean_data_distance(data):
    """
    Calculates the mean distance between a set of data points
    :param data: data matrix, one sample per row
    :return: the mean pairwise distance
    """
    mean_distance = np.mean(pairwise_distances(data))
    return mean_distance
eval.py (project: image-classifier, author: gustavkkk)
def compare_pic(self, feature1, feature2):
    predicts = pw.pairwise_distances(feature2, feature1, metric='cosine')
    # predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
bof.py (project: cbof, author: passalis)
def initialize_layer(self, data, n_samples=10000):
        """
        Initializes the layer using k-means (sigma is set to the mean pairwise distance)
        :param data: data
        :param n_samples: n_samples to keep for initializing the model
        :return:
        """
        assert self.features_fn is not None, "features_fn must be set before initializing the layer"

        idx = np.arange(data.shape[0])
        np.random.shuffle(idx)

        features = []
        for i in range(idx.shape[0]):
            feats = self.features_fn([data[idx[i]]])
            feats = feats.transpose((0, 2, 3, 1))
            feats = feats.reshape((-1, feats.shape[-1]))
            features.extend(feats)
            if len(features) > n_samples:
                break
        features = np.asarray(features)

        kmeans = KMeans(n_clusters=self.n_codewords, n_jobs=4, n_init=5)
        kmeans.fit(features)
        V = kmeans.cluster_centers_.copy()

        # Initialize gamma
        mean_distance = np.sum(pairwise_distances(V)) / (self.n_codewords * (self.n_codewords - 1))
        self.gamma.set_value(self.gamma.get_value() * np.float32(mean_distance))

        # Initialize codebook
        V = V.reshape((V.shape[0], V.shape[1], 1, 1))
        self.V.set_value(np.float32(V))
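
The gamma initialization above is the mean distance between distinct codewords: dividing the summed distance matrix by n*(n-1) rather than n**2 averages over off-diagonal pairs only. As a standalone helper:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def mean_codeword_distance(V):
    """Mean pairwise distance between distinct rows of V (zero diagonal excluded)."""
    n = V.shape[0]
    return np.sum(pairwise_distances(V)) / (n * (n - 1))
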
query_expansion.py (project: vec4ir, author: lgalke)
def delta(X, Y, n_jobs=-1, a=1, c=0):
    """Pairwise delta function: cosine and sigmoid

    :X: TODO
    :returns: TODO

    """
    D = pairwise_distances(X, Y, metric="cosine", n_jobs=n_jobs)
    if c != 0:
        D -= c
    if a != 1:
        D *= a
    D = expit(D)
    return D
test_metrics.py (project: FreeDiscovery, author: FreeDiscovery)
def test_euclidean2cosine():
    from sklearn.metrics.pairwise import pairwise_distances
    x = normalize([[0, 2, 3, 5]])
    y = normalize([[1, 3, 6, 7]])

    D_cos = pairwise_distances(x, y, metric='cosine')[0, 0]
    S_cos = 1 - D_cos
    D_seuc = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]

    assert_allclose(S_cos, seuclidean_dist2cosine_sim(D_seuc))
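
The conversion under test relies on a standard identity: for L2-normalized vectors, the squared Euclidean distance and the cosine similarity satisfy ||x - y||^2 = 2 * (1 - cos(x, y)), so cos(x, y) = 1 - d^2 / 2. A quick check of the identity itself (independent of seuclidean_dist2cosine_sim):

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances

x = normalize([[0, 2, 3, 5]])
y = normalize([[1, 3, 6, 7]])

d2 = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]
cos_sim = 1 - pairwise_distances(x, y, metric='cosine')[0, 0]
np.testing.assert_allclose(cos_sim, 1 - d2 / 2)
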
ranker.py (project: retrieval-2016-deepvision, author: imatge-upc)
def get_distances(self):

        distances = pairwise_distances(self.query_feats,self.db_feats,self.dist_type, n_jobs=-1)

        return distances
mng.py (project: dyfunconn, author: makism)
def fit(self, data):
        """

        :param data:
        :return:
        """
        [n_samples, n_obs] = data.shape
        self.protos = data[self.rng.choice(n_samples, self.n_protos),] # w
        self.context = np.zeros(self.protos.shape)                     # c

        ct = np.zeros((1, n_obs))
        wr = ct
        cr = wr
        for iteration in range(self.iterations):
            sample = data[self.rng.choice(n_samples, 1),]

            ct = (1 - self.a) * wr + self.b * cr

            t = iteration / float(self.iterations)
            lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
            epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

            d = (1 - self.a) * pairwise_distances(sample, self.protos) + self.a * pairwise_distances(ct, self.context)
            I = np.argsort(np.argsort(d))

            min_id = np.where(I == 0)[0]

            H = np.exp(-I / epsilon).ravel()

            diff_w = sample - self.protos
            diff_c = ct - self.context
            for i in range(self.n_protos):
                self.protos[i, :] += lrate * H[i] * diff_w[i, :]
                self.context[i, :] += lrate * H[i] * diff_c[i, :]

            wr = self.protos[min_id]
            cr = self.context[min_id]

        return self
mng.py (project: dyfunconn, author: makism)
    def encode(self, data, metric='euclidean'):
        """ Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            One of the metric options accepted by ``sklearn.metrics.pairwise.pairwise_distances``
            (see http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html).

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like, shape(n_samples, n_features)
            ``data``, as represented by the prototypes in codebook.
        ts_symbols : list, shape(n_samples, 1)
            A discrete symbolic time series
        """
        nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(self.protos)
        _, self.__symbols = nbrs.kneighbors(data)
        self.__encoding = self.protos[self.__symbols]

        return (self.__encoding, self.__symbols)
ng.py (project: dyfunconn, author: makism)
def fit(self, data):
        """ Learn data, and construct a vector codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        Returns
        -------
        self : object
            The instance itself
        """
        [n_samples, _] = data.shape
        self.protos = data[self.rng.choice(n_samples, self.n_protos), ]

        # avg_p = np.mean(data, 0)
        #dist_from_avg_p = np.sum(pairwise_distances(avg_p, data))
        #ndistortion = []

        for iteration in range(self.iterations):
            sample = data[self.rng.choice(n_samples, 1), ]

            t = iteration / float(self.iterations)
            lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
            epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

            D = pairwise_distances(sample, self.protos, metric='euclidean', n_jobs=self.n_jobs)
            I = np.argsort(np.argsort(D))

            H = np.exp(-I / epsilon).ravel()

            diff = sample - self.protos
            for proto_id in range(self.n_protos):
                self.protos[proto_id, :] += lrate * H[proto_id] * diff[proto_id, :]
                #nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(protos)
                #distances, _ = nbrs.kneighbors(data)
        #ndistortion.append( np.sum(distances) / dist_from_avg_p )

        return self
ng.py (project: dyfunconn, author: makism)
def encode(self, data, metric='euclidean'):
        """ Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            One of the metric options accepted by ``sklearn.metrics.pairwise.pairwise_distances``
            (see http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html).

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like, shape(n_samples, n_features)
            ``data``, as represented by the prototypes in codebook.
        ts_symbols : list, shape(n_samples, 1)
            A discrete symbolic time series
        """
        # Reorder the prototypes along a 1-D MDS embedding, following the procedure described in [Laskaris2004].
        mds = MDS(1, random_state=self.rng)
        protos_1d = mds.fit_transform(self.protos).ravel()
        sorted_protos_1d = np.argsort(protos_1d)

        sprotos = self.protos[sorted_protos_1d]

        nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(sprotos)
        _, self.__symbols = nbrs.kneighbors(data)
        self.__encoding = sprotos[self.__symbols]

        return (self.__encoding, self.__symbols)
tasks.py (project: newsgraph, author: exchez)
def grab_articles(self, ids):
    task_id = self.request.id
    ids = ids[0]
    print("Entering Grab Articles Task: ", len(ids))
    print("Task id from self: ", task_id)

    s = select([articles_db.c.id, articles_db.c.tfidf]).where(articles_db.c.id.in_(ids))
    all_articles = pd.read_sql(s, con=connection, chunksize=350)
    all_articles = pd.concat(all_articles, ignore_index=True)

    stored_data = json.loads(r.get(task_id))
    stored_data['status'] = "creating article matrix"
    r.set(task_id, json.dumps(stored_data))

    tfidf_dict = stored_data['tfidf_dict']
    all_articles = all_articles.append({'id': 1, 'tfidf': tfidf_dict}, ignore_index=True)
    corpus = helpers.generate_sparse_matrix(all_articles)
    query_article_vector = corpus.getrow(-1)
    all_articles['distance'] = pairwise_distances(corpus, query_article_vector, metric='cosine').flatten()

    stored_data['status'] = "computing best matches"
    r.set(task_id, json.dumps(stored_data))

    max_distance_from_query = 0.75  # on a scale of 0 (exact match) to 1.0 (not even close)
    all_articles = all_articles[all_articles['distance'] < max_distance_from_query]
    print("Done computing matrix and distances")
    s = select([articles_db.c.id, articles_db.c.headline, articles_db.c.url, articles_db.c.date]).where(
        articles_db.c.id.in_(all_articles['id'].tolist()))
    all_articles = pd.read_sql(s, connection).set_index('id').join(all_articles.set_index('id')).sort_values(by='date')

    query_article = {'headline': stored_data['headline'], 'date': datetime.strptime(stored_data['date'], "%d-%b-%Y"),
                     'distance': 0, 'url': stored_data['url']}
    articles = helpers.make_article_array(all_articles, query_article)
    return articles, query_article['headline']
space.py (project: semspaces, author: pmandera)
def pairwise_distances(self, X, Y=None, metric='cosine',
                           n_jobs=1, **kwds):

        if self.prenorm:
            if metric == 'cosine':
                return self._cosine_distances_prenorm(X, Y)
            else:
                raise ValueError(
                    'Vectors are pre-normalized; only the cosine metric is supported.')

        return smp.pairwise_distances(X, Y, metric=metric,
                                      n_jobs=n_jobs, **kwds)
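
For pre-normalized rows, the cosine shortcut this wrapper dispatches to reduces to a single matrix product. _cosine_distances_prenorm is not shown in this snippet; a plausible equivalent (an illustrative assumption, not the project's actual code) would be:

import numpy as np

def cosine_distances_prenorm(X, Y=None):
    """Cosine distances for rows that are already L2-normalized: 1 - X @ Y.T."""
    if Y is None:
        Y = X
    return 1.0 - np.asarray(X) @ np.asarray(Y).T
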

