Example source code for Python's euclidean_distances()
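All of the snippets below use scikit-learn's euclidean_distances from sklearn.metrics.pairwise. As a quick reference before the examples, here is a minimal sketch of the basic call (the arrays are illustrative):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
Y = np.array([[0.0, 0.0]])

print(euclidean_distances(X, Y))                # [[0.], [5.]] -- rows of X vs. rows of Y
print(euclidean_distances(X))                   # [[0., 5.], [5., 0.]] -- X vs. itself
print(euclidean_distances(X, Y, squared=True))  # [[0.], [25.]] -- skip the final sqrt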

ranking_svm.py (project: bolero, author: rock-learning)
def predict(self, X):
        """Predict ranking values for new data.

        Parameters
        ----------
        X : array, shape (n_test, n_features)
            Test data

        Returns
        -------
        y : array, shape (n_test,)
            Ranking values
        """
        n_features = X.shape[1]

        if self.n_features != n_features:
            raise ValueError("Expected %d dimensions, got %d"
                             % (self.n_features, n_features))

        K = euclidean_distances(self.X, X, squared=True)
        K /= self.denom
        np.exp(K, K)

        return np.sum(self.alpha[:, np.newaxis] * (K[:-1] - K[1:]), axis=0)
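Reading off predict: with K(x_k, x) the Gaussian kernel value between training sample x_k and test point x (self.denom is set in fit, shown further below), the returned ranking value is

    f(x) = sum_{k=1}^{n-1} alpha_k * (K(x_k, x) - K(x_{k+1}, x))

so each alpha_k weights the kernel difference between two consecutively ranked training samples.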
birch.py (project: FreeDiscovery, author: FreeDiscovery)
def transform(self, X):
        """
        Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix}, shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self, 'subcluster_centers_')
        return euclidean_distances(X, self.subcluster_centers_)
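FreeDiscovery's class mirrors scikit-learn's Birch, so a minimal usage sketch of the same transform with the stock estimator looks like this (data and parameters are illustrative):

import numpy as np
from sklearn.cluster import Birch

X = np.random.RandomState(0).rand(20, 5)
brc = Birch(n_clusters=3).fit(X)
# One column of distances per fitted subcluster centroid
X_trans = brc.transform(X)
print(X_trans.shape)  # (20, n_subclusters)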
lmnn.py (project: pylmnn, author: johny-c)
def _select_target_neighbors(self):
        """Find the target neighbors of each sample, that stay fixed during training.

        Returns
        -------
        array_like
            An array of neighbor indices for each sample with shape (n_samples, n_neighbors).

        """

        self.logger.info('Finding target neighbors...')
        target_neighbors = np.empty((self.X_.shape[0], self.n_neighbors_), dtype=int)
        for class_ in self.classes_:
            class_ind, = np.where(np.equal(self.y_, class_))
            dist = euclidean_distances(self.X_[class_ind], squared=True)
            np.fill_diagonal(dist, np.inf)
            neigh_ind = np.argpartition(dist, self.n_neighbors_ - 1, axis=1)
            neigh_ind = neigh_ind[:, :self.n_neighbors_]
            # argpartition doesn't guarantee sorted order, so we sort again but only the k neighbors
            row_ind = np.arange(len(class_ind))[:, None]
            neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
            target_neighbors[class_ind] = class_ind[neigh_ind]

        return target_neighbors
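The argpartition-then-argsort idiom above is worth isolating: np.argpartition finds the k smallest distances per row in linear time, and the follow-up np.argsort orders only those k columns. A standalone sketch with illustrative data:

import numpy as np

rng = np.random.RandomState(0)
dist = rng.rand(6, 6)
np.fill_diagonal(dist, np.inf)  # exclude self-distances
k = 2

# Indices of the k smallest entries per row, in arbitrary order
neigh_ind = np.argpartition(dist, k - 1, axis=1)[:, :k]
# Sort just those k columns so neighbors come nearest-first
row_ind = np.arange(dist.shape[0])[:, None]
neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
print(neigh_ind)  # row i holds the 2 nearest neighbors of sample i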
MTMKL.py (project: PersonalizedMultitaskLearning, author: mitmedialab)
def eta_L2(self):
        # Note that V should be positive
        return self.V * np.sum(euclidean_distances(self.eta, squared=True))
kmeans.py (project: ref-extract, author: brandonrobertz)
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        # Negate distances so that larger values mean closer, matching the
        # cosine branch below (otherwise the farthest points would rank first)
        sim = -euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # get the indices of the top `top` matches, best first
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
trainer_matches.py (project: Yugioh-bot, author: will7200)
def compare_distances(self, train_img, cluster):
        # The SIFT matcher sometimes latches onto random points on screen, so
        # we filter matches by their euclidean distance to the cluster center
        distances = euclidean_distances([self.kmeans.cluster_centers_[0]], cluster)
        height, width = train_img.shape
        new_cluster = []
        # Points farther from the cluster center than half the image diagonal,
        # np.sqrt((width / 2) ** 2 + (height / 2) ** 2), are assumed to be wrong;
        # this only works when the images share the query image's dimensions
        for index, distance in enumerate(distances[0]):
            if distance <= np.sqrt((width / 2) ** 2 + (height / 2) ** 2):
                new_cluster.append(cluster[index])
        return new_cluster
ranking_svm.py (project: bolero, author: rock-learning)
def fit(self, X):
        """Fit ranking SVM.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Training data, sorted, highest rank first
        """
        self.n_samples, self.n_features = X.shape
        self.n_alpha = self.n_samples - 1
        self.X = X

        if self.n_samples < 2:
            raise ValueError("Expected at least 2 training samples, got %d"
                             % self.n_samples)

        random_state = check_random_state(self.random_state)
        n_iter = self.n_iter
        if n_iter < 0:
            n_iter = int(50000 * np.sqrt(self.n_features))

        K = euclidean_distances(self.X, squared=True)

        # Average distance between training data
        sigma = np.sqrt(K).sum() / ((self.n_samples - 1) * self.n_samples)
        sigma *= self.c_sigma
        self.denom = -np.maximum(2.0 * sigma ** 2, MACHINE_EPSILON)

        K /= self.denom
        np.exp(K, K)

        # Constraint violation cost
        Ci = np.linspace(self.n_alpha, 1, self.n_alpha) ** self.c_pow
        Ci *= 10 ** self.c_base

        # Optimize alpha parameters
        self.alpha = optimize(Ci, K, 1.0, n_iter, random_state)

        return self
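In effect, fit builds a Gaussian kernel matrix with a data-driven bandwidth: sigma is c_sigma times the mean pairwise distance, and the in-place division and exponential above compute

    K_ij = exp(-||x_i - x_j||^2 / (2 * sigma^2))

where the minus sign comes from denom being negative.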
Clustering.py (project: SecuML, author: ANSSI-FR)
def generateClustering(self, assignment_proba, centroids, drop_annotated_instances = False,
                           cluster_labels = None):
        self.clusters = [Cluster() for x in range(self.num_clusters)]
        if cluster_labels is not None:
            for x in range(self.num_clusters):
                self.clusters[x].label = cluster_labels[x]
        ids = self.instances.getIds()
        for i in range(len(ids)):
            instance_id = ids[i]
            annotated   = self.instances.isAnnotated(instance_id)
            c           = self.assigned_clusters[i]
            proba       = None
            if assignment_proba is not None:
                proba = assignment_proba[i, :]
            label  = self.instances.getLabel(instance_id)
            family = self.instances.getFamily(instance_id)
            if centroids is not None:
                # Reshape to 2D arrays; euclidean_distances does not accept 1D input
                centroid = centroids[c].reshape(1, -1)
                features = self.instances.getInstance(instance_id).reshape(1,-1)
                distance = euclidean_distances(centroid, features)[0][0]
            else:
                distance = None
            self.clusters[c].addInstance(instance_id, distance, label, family, annotated)
        unknown_cluster_id = 0
        for c in range(self.num_clusters):
            unknown_cluster_id = self.clusters[c].finalComputation(unknown_cluster_id)
render.py (project: picasso, author: jungmannlab)
def substract_picks(self, path):
        oldpicks = self._picks.copy()
        with open(path, 'r') as f:
            regions = yaml.load(f)
            self._picks = regions['Centers']
            diameter = regions['Diameter']

            x_cord = np.array([_[0] for _ in self._picks])
            y_cord = np.array([_[1] for _ in self._picks])
            x_cord_old = np.array([_[0] for _ in oldpicks])
            y_cord_old = np.array([_[1] for _ in oldpicks])

            # Boolean mask over oldpicks: True where an old pick lies within
            # half a diameter of any new pick
            distances = np.sum(
                (euclidean_distances(oldpicks, self._picks) < diameter / 2) * 1,
                axis=1) >= 1
            filtered_list = [i for (i, v) in zip(oldpicks, distances) if not v]

            x_cord_new = np.array([_[0] for _ in filtered_list])
            y_cord_new = np.array([_[1] for _ in filtered_list])
            output = False

            if output:
                fig1 = plt.figure()
                plt.title('Old picks and new picks')
                plt.scatter(x_cord,-y_cord, c='r', label='Newpicks')
                plt.scatter(x_cord_old,-y_cord_old, c='b', label='Oldpicks')
                plt.scatter(x_cord_new,-y_cord_new, c='g', label='Picks to keep')
                fig1.show()
            self._picks = filtered_list

            self.update_pick_info_short()
            self.window.tools_settings_dialog.pick_diameter.setValue(regions['Diameter'])
            self.update_scene(picks_only=True)
distance.py (project: soft-dtw, author: mblondel)
def compute(self):
        """
        Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix.
        """
        return euclidean_distances(self.X, self.Y, squared=True)
metrics.py (project: tslearn, author: rtavenar)
def compute(self):
        """
        Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix.
        """
        return euclidean_distances(self.X, self.Y, squared=True)
metrics.py (project: operalib, author: operalib)
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    # TODO: Add mathematical form of the kernel in the docstring
    """Compute the first periodic kernel between *X* and *Y*.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float, default None
        If None, defaults to 0.8.

    period : float, default None
        If None, defaults to 2 * pi.

        This parameter should not be left at its default, since a poor
        estimate of the period leads to a poor learning score.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 0.8

    if period is None:
        period = 2. * pi

    a = -log(gamma) / period
    b = 2 * pi / period
    c = sqrt(pi / a) * (exp(- b ** 2 / (4 * a)) + 1)
    K = euclidean_distances(X, Y, squared=True)

    # TODO: Optimize to avoid temporary?
    return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
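The TODO above asks for the mathematical form; reading it off the code (an inference from the implementation, not from operalib's documentation), the kernel is

    k(x, y) = exp(-a * ||x - y||^2) * (1 + cos(b * ||x - y||)) / c

with a = -log(gamma) / period, b = 2 * pi / period, and c = sqrt(pi / a) * (exp(-b^2 / (4a)) + 1) acting as a normalization constant.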
system_1.py (project: semeval2017, author: edilsonacjr)
def fit(self, X, y):
        # Build a k-NN similarity graph, growing k until the graph is
        # connected, then detect communities and map each to its majority class
        eucl = euclidean_distances(X)

        k = self.k
        while True:
            simi_m = 1 / (1 + eucl)
            to_remove = simi_m.shape[0] - (k + 1)

            for vec in simi_m:
                vec[vec.argsort()[:to_remove]] = 0

            g = Graph.Weighted_Adjacency(simi_m.tolist(), mode=ADJ_UNDIRECTED, loops=False)

            if g.is_connected():
                break
            k += 1

        self.k = k
        comm = g.community_multilevel()
        self.y_comm = np.array(comm.membership)
        self.y = y
        self.X = X
        self.mapping = {}
        for c in list(set(comm.membership)):
            com_clas = self.y[self.y_comm==c]
            self.mapping[c] = Counter(com_clas).most_common(1)[0][0]
system_1.py (project: semeval2017, author: edilsonacjr)
def predict(self, X):
        y_pred = []
        for x in X:
            dists = euclidean_distances([x], self.X)[0]
            simi_m = 1 / (1 + dists)
            nearest_com = self.y_comm[simi_m.argsort()[-self.k:]]
            y_pred.append(self.mapping[Counter(nearest_com).most_common(1)[0][0]])

        return np.array(y_pred)
numeric.py (project: clust, author: BaselAbujamous)
def dist_matrices(X1, X2, criterion='euclidean'):
    X1loc = np.array(X1)
    X2loc = np.array(X2)

    if len(X1loc.shape) == 1:
        if len(X2loc.shape) == 1:
            if X1loc.shape[0] == X2loc.shape[0]:
                # As row vectors
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.reshape(1, -1)
            else:
                # As column vectors
                X1loc = X1loc.reshape(-1, 1)
                X2loc = X2loc.reshape(-1, 1)
        else:
            if X1loc.shape[0] == X2loc.shape[1]:
                # Row vector VS. Many rows
                X1loc = X1loc.reshape(1, -1)
            elif X2loc.shape[1] == 1:
                # Column vector VS. Column vector
                X1loc = X1loc.reshape(-1, 1)
            elif X1loc.shape[0] == X2loc.shape[0]:
                # Row vector VS. transposed columns
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.transpose()
            else:
                raise ValueError('Invalid dimensions of X1 and X2')
    elif len(X2loc.shape) == 1:
        if X2loc.shape[0] == X1loc.shape[1]:
            # Many rows VS. row vector
            X2loc = X2loc.reshape(1, -1)
        else:
            raise ValueError('Invalid dimensions of X1 and X2')

    if criterion == 'euclidean':
        return skdists.euclidean_distances(X1loc, X2loc)
    elif criterion == 'hamming':
        raise NotImplementedError('Hamming distance between rows of matrices has not been implemented yet.')
    else:
        raise ValueError('Invalid distance criterion')
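A few illustrative calls showing how the shape handling resolves (assuming dist_matrices and its imports, numpy as np and sklearn.metrics.pairwise as skdists, are in scope):

import numpy as np

a = np.array([1.0, 2.0, 3.0])
M = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])

print(dist_matrices(a, a).shape)  # (1, 1): equal-length 1D inputs become row vectors
print(dist_matrices(a, M).shape)  # (1, 2): row vector vs. each row of M
print(dist_matrices(M, a).shape)  # (2, 1): each row of M vs. row vector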
Concurrent_AP.py (project: ProjectOfDataMining, author: IljaNovo)
def process(self, rows_slice):
        tmp = self.array[rows_slice, ...]
        result = - euclidean_distances(tmp, self.array, squared = True)

        with Worker.hdf5_lock:            
            with tables.open_file(self.hdf5_file, 'r+') as fileh:
                hdf5_array = fileh.get_node(self.path)
                hdf5_array[rows_slice, ...] = result

        del tmp
test_pairwise.py (project: Parallel-SGD, author: angadgill)
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01)


# Paired distances
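The {X,Y}_norm_squared keywords exist because euclidean_distances expands ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, so callers that already have the row norms can pass them in and skip recomputation. A quick sketch verifying the identity on illustrative data:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X, Y = rng.rand(5, 3), rng.rand(4, 3)

D2 = euclidean_distances(X, Y, squared=True)
manual = (X ** 2).sum(1)[:, None] - 2 * X @ Y.T + (Y ** 2).sum(1)[None, :]
print(np.allclose(D2, manual))  # True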
functions.py (project: DEPICT, author: herandy)
def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None, weight_initilization='k-means++', seed=42, n_init=40,
           max_iter=300):
    # weight_initilization = { 'kmeans-pca', 'k-means++', 'random', None }

    if weight_initilization == 'kmeans-pca':

        start_time = timeit.default_timer()
        pca = PCA(n_components=nClusters).fit(encoder_val_clean)
        kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters, n_init=1, max_iter=300, random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    elif weight_initilization == 'k-means++':

        start_time = timeit.default_timer()
        kmeans_model = KMeans(init='k-means++', n_clusters=nClusters, n_init=n_init, max_iter=max_iter, n_jobs=15,
                              random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        D = 1.0 / euclidean_distances(encoder_val_clean, kmeans_model.cluster_centers_, squared=True)
        D **= 2.0 / (2 - 1)
        D /= np.sum(D, axis=1)[:, np.newaxis]

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred), '\t ari =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_), '\t runtime =', end_time - start_time)

    if y_pred_prev is not None:
        print('Different Assignments: ', sum(y_pred == y_pred_prev), '\tbestMap: ', bestMap(y_pred, y_pred_prev),
              '\tdatapoints-bestMap*datapoints: ',
              encoder_val_clean.shape[0] - bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0])

    return centroids, kmeans_model.inertia_, y_pred
birch.py (project: FreeDiscovery, author: FreeDiscovery)
def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.
    """
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    new_node1 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_node2 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
    n_clusters = dist.shape[0]

    farthest_idx = np.unravel_index(
        dist.argmax(), (n_clusters, n_clusters))
    # Rows of distances to the two farthest subclusters
    node1_dist, node2_dist = dist[list(farthest_idx)]

    node1_closer = node1_dist < node2_dist
    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
    return new_subcluster1, new_subcluster2
lmnn.py (project: pylmnn, author: johny-c)
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False, batch_size=500):
        """Find impostor pairs in chunks to avoid large memory usage

        Parameters
        ----------
        x1 : array_like
            An array of transformed data samples with shape (n_samples, n_features).
        x2 : array_like
            An array of transformed data samples with shape (m_samples, n_features) where m_samples < n_samples.
        t1 : array_like
            An array of distances to the margins with shape (n_samples,).
        t2 : array_like
            An array of distances to the margins with shape (m_samples,).
        batch_size : int (Default value = 500)
            The size of each chunk of x1 to compute distances to.
        return_dist : bool (Default value = False)
            Whether to return the distances to the impostors.

        Returns
        -------
        tuple: (array_like, array_like, [array_like])

            imp1 : array_like
                An array of sample indices with shape (n_impostors,).
            imp2 : array_like
                An array of sample indices that violate a margin with shape (n_impostors,).
            dist : array_like, optional
                An array of pairwise distances of (imp1, imp2) with shape (n_impostors,).

        """

        n, m = len(t1), len(t2)
        imp1, imp2, dist = [], [], []
        for chunk in gen_batches(n, batch_size):
            dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
            i1, j1 = np.where(dist_out_in < t1[chunk, None])
            i2, j2 = np.where(dist_out_in < t2[None, :])
            if len(i1):
                imp1.extend(i1 + chunk.start)
                imp2.extend(j1)
                if return_dist:
                    dist.extend(dist_out_in[i1, j1])
            if len(i2):
                imp1.extend(i2 + chunk.start)
                imp2.extend(j2)
                if return_dist:
                    dist.extend(dist_out_in[i2, j2])

        if return_dist:
            return imp1, imp2, dist
        else:
            return imp1, imp2
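The chunking relies on sklearn.utils.gen_batches, which yields consecutive slice objects covering range(n); a minimal sketch:

from sklearn.utils import gen_batches

for chunk in gen_batches(7, 3):
    print(chunk, list(range(7))[chunk])
# slice(0, 3, None) [0, 1, 2]
# slice(3, 6, None) [3, 4, 5]
# slice(6, 7, None) [6]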
postprocess_results.py (project: clust, author: BaselAbujamous)
def reorderClusters(B, X, GDM, returnOrderIndices = False):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters

    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets

    if K == 0:
        return Bloc

    # Find Cmeans and distances between clusters
    Cmeans = np.array([None] * L, dtype=object)
    D = np.zeros([K, K, L])  # KxKxL
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
        D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Set first cluster as first, then find closest by closest
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    I[0] = 0
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1,K):
        relevantD = D[I[k-1], ~clustersDone]
        clustersLeft = np.nonzero(~clustersDone)[0]
        nextCluster = np.argmin(relevantD)
        nextCluster = clustersLeft[nextCluster]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    else:
        return B_ordered
Concurrent_AP.py (project: ProjectOfDataMining, author: IljaNovo)
def set_preference(data, chunk_size):
    """Return the median of the distribution of pairwise L2 Euclidean distances 
        between samples (the rows of 'data') as the default preference parameter
        for Affinity Propagation clustering.

    Parameters
    ----------
    data : array of shape (N_samples, N_features)
        The data-set submitted for Affinity Propagation clustering.

    chunk_size : int
        The size of random subsamples from the data-set whose similarity
        matrix is computed. The resulting median of the distribution of 
        pairwise distances between the data-points selected as part of a
        given subsample is stored into a list of medians. 

    Returns
    -------
    preference : float
        The preference parameter for Affinity Propagation clustering is computed
        as the median of the list of median pairwise distances between the data-points
        selected as part of each of 15 rounds of random subsampling.
    """

    N_samples, N_features = data.shape

    rng = np.arange(0, N_samples, dtype = int)
    medians = []

    for i in xrange(15):
        selected_samples = np.random.choice(N_samples, size = chunk_size, replace = False)
        samples = data[selected_samples, :]

        S = - euclidean_distances(samples, data, squared = True)

        n = chunk_size * N_samples - (chunk_size * (chunk_size + 1) / 2)

        rows = np.zeros(0, dtype = int)
        for j in xrange(chunk_size):  # j, not i: i is the outer loop counter used below
            rows = np.append(rows, np.full(N_samples - j, j, dtype = int))

        cols = np.zeros(0, dtype = int)
        for j in xrange(chunk_size):
            cols = np.append(cols, np.delete(rng, selected_samples[:j+1]))

        triu_indices = tuple((rows, cols))

        preference = np.median(S, overwrite_input = True)
        medians.append(preference)

        del S

        if i % 4 == 3:
            gc.collect()       

    preference = np.median(medians)

    return preference
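The returned preference is typically handed to scikit-learn's AffinityPropagation; a hedged usage sketch (not from this project, with illustrative data):

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import euclidean_distances

data = np.random.RandomState(0).rand(100, 4)
# Same convention as above: similarity = negative squared euclidean distance
preference = np.median(-euclidean_distances(data, squared=True))
ap = AffinityPropagation(preference=preference).fit(data)
print(ap.cluster_centers_indices_.shape[0])  # number of exemplars found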

