python类squareform()的实例源码

features.py 文件源码 项目:cg 作者: michaelhabeck 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def gradient(self, x):
    """Return the parameter-weighted sum of the per-feature gradients at x.

    Feature ``k`` contributes
    ``self.params[k] * self.features[k].gradient(x, d)`` for
    ``k in range(self.K)``; the contributions are summed along axis 0.
    ``d`` is ``self._distances``, expanded with ``squareform`` when it is
    stored as a 1-D array and passed through unchanged otherwise
    (including when it is ``None``).
    """
    dist = self._distances
    # A 1-D array is expanded with squareform before being handed to the
    # features; anything else is forwarded as-is.
    is_condensed = dist is not None and np.ndim(dist) == 1
    if is_condensed:
        dist = squareform(dist)

    weighted = []
    for k in range(self.K):
        weighted.append(self.params[k] * self.features[k].gradient(x, dist))
    return np.sum(weighted, 0)
persistent_homology.py 文件源码 项目:OpenTDA 作者: outlace 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    """Build a graph linking points whose pairwise distance is below epsilon.

    Parameters
    ----------
    data : array-like accepted by ``scipy.spatial.distance.pdist``.
    epsilon : distances greater than or equal to this value are set to 0
        in the distance matrix before the graph is built.
    metric : metric name or callable forwarded to ``pdist``.
    p : exponent for Minkowski-type metrics.

    Returns
    -------
    tuple ``(nodes, edges, weights)`` where ``nodes`` is ``G.nodes()``,
    ``edges`` is a list of sets (one per edge) and ``weights`` lists the
    ``'weight'`` attribute of each edge in the same order.
    """
    # `p` only means something for Minkowski-type metrics, and current SciPy
    # raises on keyword arguments a metric does not accept (the default
    # 'euclidean' among them).  Forward it only where it applies; callables
    # keep receiving it exactly as before.
    minkowski_like = ('minkowski', 'wminkowski', 'mi', 'm', 'pnorm')
    extra = {}
    if callable(metric) or str(metric).lower() in minkowski_like:
        extra['p'] = p

    D = squareform(pdist(data, metric=metric, **extra))
    # Zeroed entries are meant to drop the edge (presumably networkx treats
    # a zero matrix entry as "no edge" -- confirm for the version in use).
    D[D >= epsilon] = 0.
    G = nx.Graph(D)

    edge_list = list(G.edges())
    edges = [set(edge) for edge in edge_list]
    weights = [G.get_edge_data(u, v)['weight'] for u, v in edge_list]
    return G.nodes(), edges, weights
distance_based.py 文件源码 项目:mitre 作者: gerberlab 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def cluster(target_sequence_ids, fasta_filename, method='average'):
    """ Form distance-based hierarchical clustering of sequences.

    Each entry of target_sequence_ids is looked up in the mapping
    returned by fasta_to_dict(fasta_filename) to obtain its sequence.

    The sequences may differ in length, so instead of a Hamming
    distance a pairwise global alignment is scored: 0 for a match,
    -1 for a mismatch and -1.5 for opening or extending a gap. The
    distance between two sequences is -1.0 * (alignment score).

    method is passed to scipy's linkage routine; the default
    'average' corresponds to UPGMA clustering.

    Returns a tuple (distances, linkage): the N x N distance matrix
    (only the entries above the diagonal are filled in, the rest stay
    zero) and the linkage matrix returned by the clustering routine.

    """
    def alignment_distance(first, second):
        # globalms arguments: seq1, seq2, match, mismatch, open, extend
        score = pairwise2.align.globalms(
            first, second, 0, -1, -1.5, -1.5, score_only=True
        )
        return -1.0 * score

    all_sequences = fasta_to_dict(fasta_filename)
    count = len(target_sequence_ids)
    picked = [all_sequences[seq_id] for seq_id in target_sequence_ids]

    distances = np.zeros((count, count))
    # fill in the upper triangle only
    for row in range(count):
        for col in range(row + 1, count):
            distances[row, col] = alignment_distance(picked[row], picked[col])

    # convert to the form expected by the scipy clustering routines
    linkage_input = squareform(distances, checks=False)
    return distances, hierarchy.linkage(linkage_input, method)
transformation_tests_func.py 文件源码 项目:3D_Dense_Transformer_Networks 作者: JohnYC1995 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def makeT(self,cp):
    """Assemble the (K+4) x (K+4) matrix T from K control points.

    cp is expected to be a (K, 3) array of control points (K = k*k*k in
    the original notation).  The layout written below is:

    * ``T[:K, 0]``   = 1
    * ``T[:K, 1:4]`` = cp
    * ``T[K, 4:]``   = 1
    * ``T[K+1:, 4:]`` = cp.T
    * ``T[:K, 4:]``  = R, with R_ij = d_ij^2 * log(d_ij^2) for the
      pairwise euclidean distances d_ij and a zero diagonal.

    All other entries stay zero.
    """
    num_points = cp.shape[0]
    size = num_points + 4
    T = np.zeros((size, size))

    T[:num_points, 0] = 1
    T[:num_points, 1:4] = cp
    T[num_points, 4:] = 1
    T[num_points + 1:, 4:] = cp.T

    sq_dist = np.square(squareform(pdist(cp, metric='euclidean')))
    # Trick: replace zeros by ones so that r * log(r) evaluates to 0 there
    # instead of producing nan from 0 * log(0).
    sq_dist[sq_dist == 0] = 1
    radial = sq_dist * np.log(sq_dist)
    np.fill_diagonal(radial, 0)

    T[:num_points, 4:] = radial
    return T
stats.py 文件源码 项目:gpam_stats 作者: ricoms 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def n1_fraction_borderline(data):
    """Return the fraction of points that touch a class border in the MST.

    ``data`` is a DataFrame whose last column holds the class label and
    whose remaining columns are the features.  A minimum spanning tree is
    built over the pairwise euclidean distances of the feature rows; every
    point that is an endpoint of a tree edge joining two different labels
    counts as a border point.  The result is ``border_points / len(data)``.

    Labels are read by position, so the DataFrame index may be arbitrary
    (the tree's row/column numbers are positions, not index labels).
    """
    feature_cols = data.columns[:-1]
    label_col = data.columns[-1]
    # Plain list => purely positional lookups below.
    labels = data[label_col].tolist()

    dist = squareform(pdist(data[feature_cols].values, 'euclidean'))
    tree = minimum_spanning_tree(csr_matrix(dist))
    rows, cols = tree.nonzero()

    borders = set()
    for u, v in zip(rows, cols):
        if labels[u] != labels[v]:
            borders.add(u)
            borders.add(v)

    # The tree is computed once: repeating the identical computation and
    # averaging over the repetitions yields exactly the same ratio.
    return float(len(borders)) / len(data)
stats.py 文件源码 项目:gpam_stats 作者: ricoms 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):
    """Return the ratio of summed intra-class to inter-class nearest-neighbour distances.

    ``data`` is a DataFrame whose last column holds the class label and
    whose remaining columns are the features.  For every row the euclidean
    distance to its nearest neighbour of the same class (excluding itself)
    and to its nearest neighbour of any other class are accumulated; the
    function returns ``sum(intra) / sum(inter)``.

    A row without a same-class (or other-class) neighbour contributes NaN,
    so the result is NaN in that case.  If the inter-class sum is exactly 0
    it is replaced by 1 to avoid dividing by zero.

    All lookups are positional, so the DataFrame index may be arbitrary.
    """
    feature_cols = data.columns[:-1]
    labels = data.iloc[:, -1].tolist()
    dist = squareform(pdist(data[feature_cols].values, 'euclidean'))
    nan = float('nan')

    # Row positions grouped by class, and for each class the positions of
    # every row that belongs to a different class.
    members = {}
    for pos, label in enumerate(labels):
        members.setdefault(label, []).append(pos)
    outsiders = {
        label: [pos for pos, other in enumerate(labels) if other != label]
        for label in members
    }

    intra_total = 0
    inter_total = 0
    for i, label in enumerate(labels):
        # A point is not its own neighbour.
        intra_idx = [pos for pos in members[label] if pos != i]
        inter_idx = outsiders[label]
        intra_total = intra_total + (dist[intra_idx, i].min() if intra_idx else nan)
        inter_total = inter_total + (dist[inter_idx, i].min() if inter_idx else nan)

    # handle the inter_total == 0 case
    if inter_total == 0:
        inter_total = 1

    return (1.0 * intra_total) / (1.0 * inter_total)
ORFsClustering.py 文件源码 项目:genepred 作者: egorbarsukoff 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def start_clustering(self):
    """Compute the distance matrix and its complete-linkage clustering.

    Stores the result of ``self.create_distance_matrix()`` in
    ``self.distances`` and the linkage of its ``squareform`` conversion
    (``method='complete'``) in ``self.linkage_matrix``.  Progress is
    reported through ``functions.log``.
    """
    orf_count = len(self.orfs)
    functions.log('Calculate {0} distances...'.format(int(orf_count * (orf_count + 1) / 2)))
    self.distances = self.create_distance_matrix()

    functions.log('Start clustering...')
    # presumably self.distances is a square matrix that squareform turns
    # into the condensed vector linkage expects -- confirm against
    # create_distance_matrix.
    converted = ssd.squareform(self.distances)
    self.linkage_matrix = scipy.cluster.hierarchy.linkage(converted, method='complete')
    functions.log('Clustering done.')
candidates.py 文件源码 项目:luna16 作者: gzuidhof 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    """Merge candidate points of one scan that are chained within `distance`.

    Candidates whose pairwise euclidean distance is at most ``distance``
    are linked; every connected component of the resulting graph is
    replaced by the mean of its points.

    Parameters
    ----------
    candidates : array of candidate coordinates, indexed with NumPy fancy
        indexing; the means are written into 3 columns (presumably an
        (n, 3) ndarray of x, y, z -- confirm against callers).
    seriesuid : value repeated in the first column of every output row.
    distance : linking threshold.

    Returns
    -------
    pandas.DataFrame with columns ``CANDIDATES_COLUMNS`` and one row per
    merged candidate: (seriesuid, x, y, z, 0).
    """
    pairwise = squareform(pdist(candidates, metric='euclidean'))

    # Determine nodes within distance, replace by 1 (=adjacency matrix)
    adjacency_matrix = np.where(pairwise <= distance, 1, 0)

    # Determine all connected components in the graph
    n_clusters, cluster_of = connected_components(adjacency_matrix)

    # Take the mean for these connected components
    merged = np.zeros((n_clusters, 3))
    for cluster_i in range(n_clusters):
        merged[cluster_i, :] = np.mean(candidates[cluster_of == cluster_i], axis=0)

    uids = [seriesuid] * n_clusters
    class_name = [0] * n_clusters

    # Materialize the rows: under Python 3 zip() is a lazy iterator, which
    # not every pandas version accepts as DataFrame input.
    rows = list(zip(uids, merged[:, 0], merged[:, 1], merged[:, 2], class_name))
    return pd.DataFrame(rows, columns=CANDIDATES_COLUMNS)
landmarks.py 文件源码 项目:lddmm-ot 作者: jeanfeydy 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def precompute_kernels(self, q):
    """
    Returns the tuple (K, K', K'') of kernel matrices at position q.

    q is reshaped to (self.npoints, self.dimension).  With u the matrix
    of pairwise squared euclidean distances and s = self.kernel_scale:

        K   =  exp(-u / (2 s^2))
        K'  = -K / (2 s^2)
        K'' =  K / (4 s^4)

    i.e. the Gaussian kernel and its first and second derivatives with
    respect to the squared distance.
    """
    points = q.reshape((self.npoints, self.dimension))
    sq_dists = squareform(pdist(points, 'sqeuclidean'))

    kernel = exp(- sq_dists / (2 * self.kernel_scale ** 2))
    d_kernel = - kernel / (2 * self.kernel_scale ** 2)
    d2_kernel = kernel / (4 * self.kernel_scale ** 4)
    return (kernel, d_kernel, d2_kernel)
landmarks.py 文件源码 项目:lddmm-ot 作者: jeanfeydy 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def dq_Kqp_a(self,q,p,a, kernels):
    """
    Useful for the adjoint integration scheme.
    d_q (K_q p) . a, approximated by a central finite difference of
    Landmarks.K along the direction a:

        ( K(q + h a) p  -  K(q - h a) p ) / (2 h),   h = 1e-8

    The kernels are recomputed at both shifted positions, so the
    `kernels` argument is not used.
    """
    # NOTE: an analytic version of this gradient was sketched here as a
    # disabled block after the return statement; it was never executed.
    step = 1e-8
    q_forward = q + step * a
    q_backward = q - step * a
    forward = Landmarks.K(self, q_forward, p,
                          Landmarks.precompute_kernels(self, q_forward))
    backward = Landmarks.K(self, q_backward, p,
                           Landmarks.precompute_kernels(self, q_backward))
    return (forward - backward) / (2 * step)
ppdb_utils.py 文件源码 项目:Learning-sentence-representation-with-guidance-of-human-attention 作者: wangshaonan 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def getPairsFast(d, type):
    """Pick, for both members of every pair in d, a companion item.

    d is a sequence of 2-tuples whose elements expose `.representation`.
    All representations are compared by cosine distance; for each item
    the closest other item is determined, never the item itself nor its
    own pair partner (both are masked with distance 1).

    type selects how the companions are chosen:
      "MAX"  - the closest item found above,
      "RAND" - getPairRand(d, i) for each member,
      "MIX"  - getPairMixScore(d, i, closest item) for each member.
    Any other value yields (None, None) for every pair.

    Returns a list with one (p1, p2) tuple per entry of d.
    """
    reps = []
    items = []
    for first, second in d:
        reps.extend((first.representation, second.representation))
        items.extend((first, second))

    cos = squareform(pdist(reps, 'cosine'))

    # Items 2i and 2i+1 form a pair: exclude self and partner from argmin.
    for row in range(len(cos)):
        partner = row + 1 if row % 2 == 0 else row - 1
        cos[row, row] = 1
        cos[row, partner] = 1

    nearest = np.argmin(cos, axis=1)

    pairs = []
    for i in range(len(d)):
        left = None
        right = None
        if type == "MAX":
            left = items[nearest[2 * i]]
            right = items[nearest[2 * i + 1]]
        elif type == "RAND":
            left = getPairRand(d, i)
            right = getPairRand(d, i)
        elif type == "MIX":
            left = getPairMixScore(d, i, items[nearest[2 * i]])
            right = getPairMixScore(d, i, items[nearest[2 * i + 1]])
        pairs.append((left, right))
    return pairs
lda_tuna.py 文件源码 项目:twitter_LDA_topic_modeling 作者: kenneth-orton 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def cao_juan_2009(topic_term_dists, num_topics):
    """Return the summed pairwise cosine distance between topics, scaled by the pair count.

    The rows of topic_term_dists are compared with the cosine distance;
    the sum of the full square distance matrix is divided by
    num_topics * (num_topics - 1) / 2.
    """
    cosine_matrix = squareform(pdist(topic_term_dists, metric='cosine'))
    total = np.sum(cosine_matrix)
    # NOTE(review): the square matrix holds every pair twice while the
    # divisor counts unordered pairs, so this is twice the mean pairwise
    # distance -- confirm whether that is intended.
    pair_count = num_topics * (num_topics - 1) / 2
    return total / pair_count
lda_tuna.py 文件源码 项目:twitter_LDA_topic_modeling 作者: kenneth-orton 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def deveaud_2014(topic_term_dists, num_topics):
    """Return the summed pairwise `jensen_shannon` value between topics, scaled by num_topics * (num_topics - 1).

    The rows of topic_term_dists are compared with the `jensen_shannon`
    callable; the sum of the full square matrix (every pair appears
    twice) is divided by num_topics * (num_topics - 1).
    """
    divergence_matrix = squareform(pdist(topic_term_dists, metric=jensen_shannon))
    total = np.sum(divergence_matrix)
    ordered_pair_count = num_topics * (num_topics - 1)
    return total / ordered_pair_count
pin.py 文件源码 项目:protein-interaction-network 作者: ericmjl 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def compute_distmat(self, dataframe):
    """
    Computes the pairwise euclidean distances between every row of
    `dataframe`, using its 'x', 'y' and 'z' columns (one atom per row).

    The square distance matrix is stored in `self.eucl_dists` as a
    DataFrame whose index and columns are both `dataframe.index`, and
    is also returned.

    Design choice: a DataFrame is passed in to enable easier testing on
    dummy data.
    """
    coordinates = dataframe[['x', 'y', 'z']]
    condensed = pdist(coordinates, metric='euclidean')

    self.eucl_dists = pd.DataFrame(
        squareform(condensed),
        index=dataframe.index,
        columns=dataframe.index,
    )
    return self.eucl_dists
distanceratio.py 文件源码 项目:eqnet 作者: mast-group 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
    """Compute the ratio of the avg distance between all points vs the avg distance within an equivalence class.

    Every entry of the imported data contributes the encoding of its
    'original' sample followed by the encodings of its 'noise' samples;
    those encodings form one equivalence class.  Distances are cosine
    distances between encodings.

    :param encoder: object whose get_encoding(sample) returns a vector.
    :param data_filename: passed to import_data.
    :param print_stats: when true, print both averages.
    :return: avg distance over all pairs / avg distance over pairs in the same class.
    """
    data = import_data(data_filename)
    collected = []
    equivalence_sets = []

    def encode_and_store(sample):
        vector = encoder.get_encoding(sample)
        assert not np.isnan(np.sum(vector))
        collected.append(vector)

    for code in data.values():
        first_idx = len(collected)
        encode_and_store(code['original'])
        for noisy_sample in code['noise']:
            encode_and_store(noisy_sample)
        # Indices of all encodings that belong to this entry.
        equivalence_sets.append(set(range(first_idx, len(collected))))

    encodings = np.array(collected)

    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))

    # Average the strict lower triangle: every unordered pair exactly once.
    total_pairs = len(encodings) * (len(encodings) - 1) / 2
    avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / total_pairs

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for equiv_class_idxs in equivalence_sets:
        class_size = len(equiv_class_idxs)
        if class_size >= 2:
            members = np.fromiter(equiv_class_idxs, dtype=np.int32)
            within = all_distances[members][:, members]
            sum_distance_within_eq_class += np.sum(np.tril(within, k=-1))
            num_pairs += class_size * (class_size - 1) / 2

    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
    if print_stats:
        print(
            "Within Avg Dist: %s  All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
contours.py 文件源码 项目:CElegansBehaviour 作者: ChristophKirst 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def sort_points_to_line(vertices, start = 0):
    """Sorts points to a line by sequentially connecting nearest points

    Starting at index `start`, the closest not yet visited point is
    appended repeatedly until every point has been used.

    Arguments:
      vertices (nx2 array): vertices of the line
      start (int): start index

    Returns:
      nx2 array: sorted points
    """
    dist = squareform(pdist(vertices))

    n_points = vertices.shape[0]
    unvisited = np.ones(n_points, dtype=bool)
    unvisited[start] = False
    order = [start]
    current = start

    while unvisited.any():
        remaining = np.where(unvisited)[0]
        # argmin is taken among the unvisited points only, then mapped
        # back to an index into `vertices`.
        current = remaining[np.argmin(dist[current][unvisited])]
        order.append(current)
        unvisited[current] = False

    return vertices[order]
corrneighbours.py 文件源码 项目:Building-Machine-Learning-Systems-With-Python-Second-Edition 作者: PacktPublishing 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def predict(otrain):
    """Fill in a ratings matrix from the ratings of correlated rows.

    Rows of `otrain` are compared through the correlation distance of
    their "entry > 0" patterns.  For every row u and column m, the
    neighbours of u (all other rows, nearest first) that have a positive
    entry in column m are collected; the mean of the normalized values of
    the closest half of them (len // 2 + 1) becomes the prediction.
    Cells for which no neighbour has an entry keep their normalized value.

    The matrix is normalized with NormalizePositive(axis=1) first and
    the normalization is undone on the result.
    """
    binary = (otrain > 0)
    norm = NormalizePositive(axis=1)
    train = norm.fit_transform(otrain)

    dists = distance.squareform(distance.pdist(binary, 'correlation'))
    neighbors = dists.argsort(axis=1)

    filled = train.copy()
    for u in range(filled.shape[0]):
        # neighbours of row u; position 0 of the sorted row is skipped
        others = neighbors[u, 1:]
        for m in range(filled.shape[1]):
            # This code could be faster using numpy indexing trickery at the
            # cost of readability (this is left as an exercise to the reader):
            revs = [train[neigh, m] for neigh in others if binary[neigh, m]]
            if not revs:
                continue
            keep = len(revs) // 2 + 1
            filled[u, m] = np.mean(revs[:keep])

    return norm.inverse_transform(filled)
BrownianKernel.py 文件源码 项目:kerpy 作者: oxmlcs 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def kernel(self, X, Y=None):
    """
    Computes k(x,y) = 0.5 * (||x||**alpha + ||y||**alpha - ||x-y||**alpha)
    with alpha = self.alpha and euclidean norms.

    X - 2d numpy.ndarray, first set of samples (one sample per row)
    Y - 2d numpy.ndarray, second set of samples, can be None in which
        case it is replaced by X
    """
    GenericTests.check_type(X,'X',np.ndarray,2)
    # row norms of X as a column vector, so they broadcast against normY
    normX = reshape(np.linalg.norm(X, axis=1), (len(X), 1))

    if Y is not None:
        GenericTests.check_type(Y,'Y',np.ndarray,2)
        assert(shape(X)[1]==shape(Y)[1])
        normY = reshape(np.linalg.norm(Y, axis=1), (1, len(Y)))
        dists = cdist(X, Y, 'euclidean')
    else:
        # X against itself: use the pdist call which exploits symmetry
        dists = squareform(pdist(X, 'euclidean'))
        normY = normX.T

    alpha = self.alpha
    return 0.5 * (normX**alpha + normY**alpha - dists**alpha)
HypercubeKernel.py 文件源码 项目:kerpy 作者: oxmlcs 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def kernel(self, X, Y=None):
    """
    Computes the hypercube kernel k(x,y)=tanh(gamma)^d(x,y), where d is the
    Hamming distance between x and y (number of differing entries) and
    gamma is self.gamma.

    X - 2d boolean numpy array, first set of samples (one per row)
    Y - 2d boolean numpy array, second set of samples.
        Can be None in which case it is replaced by X

    Raises TypeError if X (or a given Y) is not a numpy.ndarray and
    ValueError if it is not 2D, not boolean, or if X and Y differ in
    their number of columns.
    """

    if type(X) is not numpy.ndarray:
        raise TypeError("X must be numpy array")

    if len(X.shape) != 2:
        raise ValueError("X must be 2D numpy array")

    # numpy.bool_ is the boolean scalar type; the old alias numpy.bool8
    # no longer exists in NumPy 2.x.
    if X.dtype != numpy.bool_:
        raise ValueError("X must be boolean numpy array")

    if Y is not None:
        if type(Y) is not numpy.ndarray:
            raise TypeError("Y must be None or numpy array")

        if len(Y.shape) != 2:
            raise ValueError("Y must be None or 2D numpy array")

        if Y.dtype != numpy.bool_:
            raise ValueError("Y must be boolean numpy array")

        if X.shape[1] != Y.shape[1]:
            raise ValueError("X and Y must have same dimension if Y is not None")

    # un-normalise normalised hamming distance in both cases
    if Y is None:
        K = tanh(self.gamma) ** squareform(pdist(X, 'hamming') * X.shape[1])
    else:
        K = tanh(self.gamma) ** (cdist(X, Y, 'hamming') * X.shape[1])

    return K
GaussianKernel.py 文件源码 项目:kerpy 作者: oxmlcs 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def kernel(self, X, Y=None):
    """
    Computes the standard Gaussian kernel k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2)
    with sigma = self.width.

    X - 2d numpy.ndarray, first set of samples:
        number of rows: number of samples
        number of columns: dimensionality
    Y - 2d numpy.ndarray, second set of samples, can be None in which case its replaced by X

    When self.is_sparse is set, X (and Y, if given) are densified with
    their todense() method first.
    """
    if self.is_sparse:
        X = X.todense()
        # Y is optional: only densify it when it was actually passed.
        if Y is not None:
            Y = Y.todense()
    GenericTests.check_type(X, 'X',np.ndarray)
    assert(len(shape(X))==2)

    # if X=Y, use more efficient pdist call which exploits symmetry
    if Y is None:
        sq_dists = squareform(pdist(X, 'sqeuclidean'))
    else:
        GenericTests.check_type(Y, 'Y',np.ndarray)
        assert(len(shape(Y))==2)
        assert(shape(X)[1]==shape(Y)[1])
        sq_dists = cdist(X, Y, 'sqeuclidean')

    K = exp(-0.5 * (sq_dists) / self.width ** 2)
    return K


问题


面经


文章

微信
公众号

扫码关注公众号