python类linkage()的实例源码

precluster.py 文件源码 项目:texta 作者: texta-tk 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def __init__(self,words, vectors, number_of_steps = 21,metric="cosine",linkage="complete"):
        self.words = words
        self.vectors = vectors
        self.number_of_steps = number_of_steps
        self.metric = metric
        self.linkage = linkage
precluster.py 文件源码 项目:texta 作者: texta-tk 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def __call__(self):
        if len(self.words) == 0 or len(self.vectors) == 0:
            return []

        distance_matrix = scidist.pdist(np.array(self.vectors),self.metric)
        linkage_matrix = hier.linkage(distance_matrix,self.linkage)

        dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix,self.words,self.vectors)
        clusterings = self._create_clusterings(dendrogram)

        return [[(node.label,node.vector) for node in _get_cluster_nodes(cluster)] for cluster in self._find_optimal_clustering(clusterings)]
test_cluster.py 文件源码 项目:FreeDiscovery 作者: FreeDiscovery 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def test_denrogram_children():
    # temporary solution for
    # https://stackoverflow.com/questions/40239956/node-indexing-in-hierarachical-clustering-dendrograms
    import numpy as np
    from scipy.cluster.hierarchy import dendrogram, linkage
    from freediscovery.cluster import _DendrogramChildren

    # generate two clusters: a with 10 points, b with 5:
    np.random.seed(1)
    a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]],
                                      size=[10, ])
    b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]],
                                      size=[5, ])
    X = np.concatenate((a, b),)
    Z = linkage(X, 'ward')
    # make distances between pairs of children uniform
    # (re-scales the horizontal (distance) axis when plotting)
    Z[:, 2] = np.arange(Z.shape[0])+1

    ddata = dendrogram(Z, no_plot=True)
    dc = _DendrogramChildren(ddata)
    idx = 0
    # check that we can compute children for all nodes
    for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        node_children = dc.query(idx)
        idx += 1
    # last level node should encompass all samples
    assert len(node_children) == X.shape[0]
    assert_allclose(sorted(node_children), np.arange(X.shape[0]))
test_cluster.py 文件源码 项目:FreeDiscovery 作者: FreeDiscovery 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_binary_linkage2clusters():
    from freediscovery.cluster.utils import _binary_linkage2clusters
    from sklearn.metrics import v_measure_score
    n_samples = 10
    linkage = np.array([[1, 2],
                        [2, 3],
                        [5, 7],
                        [6, 9]])

    cluster_id = _binary_linkage2clusters(linkage, n_samples)

    cluster_id_ref = np.array([0, 1, 1, 1, 2, 3, 4, 3, 5, 4])

    assert cluster_id.shape == cluster_id_ref.shape
    # i.e. same clusters
    assert v_measure_score(cluster_id, cluster_id_ref) == 1.0
analyze_clustering.py 文件源码 项目:indefinite-pronouns 作者: dnrb 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def get_k(clustering, depth = 10):
    """
    (ndarray, int) -> int
    clustering: ndarray -- linkage matrix representing hierarchical clustering
    depth: int -- the maximum depth to traverse clustering

    Returns the number of clusters to extract from the hierarchical clustering
    using the elbow method.
    """
    last = clustering[-depth: , 2]
    acceleration = np.diff(last, 2)
    acceleration_rev = acceleration[::-1]
    k = acceleration_rev.argmax() + 2
    return k
analyze_clustering.py 文件源码 项目:indefinite-pronouns 作者: dnrb 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def get_cluster_assignments(sim_matrix, parameters):
    """
    (np.array, list of int) -> list of int
    sim_matrix: list of list of float -- similarity matrix between exemplars
    parameters: list of parameters in the format ["method:method_name", 
            "algo:algo_name", "k:num_clusters", "damping:damping"]
            where order doesn't matter
            (k and damping only relevant for certain clustering methods)
            the possible values for each parameter are listed in the
            function below.

    Returns a list of integers. The integer at each index of the list corresponds
    to the cluster number of the exemplar at the same index in sim_matrix.
    """

    algorithm = next((re.split(':',f)[1] for f in parameters if f[:4] == 'algo'), 'ap')
    # from { 'hierarchical', 'kmeans', 'ap', 'ward' }
    method = next((re.split(':',f)[1] for f in parameters if f[:6] == 'method'), 'single')
    # from {'single', 'complete', 'average'} (only relevant for hierarchical clustering)
    kMk = next((int(re.split(':',f)[1]) for f in parameters if f[:1] == 'k'), 8)
    # any integer <= the data length
    damping = next((re.split(':',f)[1] for f in parameters if f[:4] == 'damping'), 0.5)
    # only relevant for AP -- in [0.5,1]
    #
    if algorithm == 'hierarchical':
        clustering = hierarchy.linkage(sim_matrix, method)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion = 'maxclust')-1
    elif algorithm == 'kmeans':
        cluster_assignments = KMeans(n_clusters = kMk).fit_predict(sim_matrix)
    elif algorithm == 'ap':
        cluster_assignments = AffinityPropagation().fit_predict(sim_matrix)
    elif algorithm == 'ward':
        clustering = hierarchy.ward(sim_matrix)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion = 'maxclust')-1
    return cluster_assignments
kmedoids.py 文件源码 项目:ECoG-ClusterFlow 作者: sugeerth 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def HierarchicalClustering(self,data):
        distances = nx.to_numpy_matrix(data)
        hierarchy = linkage(distances)
        print hierarchy,"HIERRATCJY"
        Z = dendrogram(hierarchy)
        print Z
        return hierarchy
hcluster.py 文件源码 项目:lazyprogrammer 作者: inhwane 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def main():
    D = 2 # so we can visualize it more easily
    s = 4 # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])

    N = 900 # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3

    Z = linkage(X, 'ward')
    print "Z.shape:", Z.shape
    # Z has the format [idx1, idx2, dist, sample_count]
    # therefore, its size will be (N-1, 4)
    plt.title("Ward")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'single')
    plt.title("Single")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'complete')
    plt.title("Complete")
    dendrogram(Z)
    plt.show()
silhouette.py 文件源码 项目:icing 作者: slipguru 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def single_silhouette_dendrogram(dist_matrix, Z, threshold, mode='clusters',
                                 method='single', sample_names=None):
    """Compute the average silhouette at a given threshold.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed distance matrix between points.
    Z : array-like
        Linkage matrix, results of scipy.cluster.hierarchy.linkage.
    threshold : float
        Specifies where to cut the dendrogram.
    mode : ('clusters', 'thresholds'), optional
        Choose what to visualise on the x-axis.

    Returns
    -------
    x : float
        Based on mode, it can contains the number of clusters or threshold.
    silhouette_avg : float
        The average silhouette.
    """
    cluster_labels = fcluster(Z, threshold, 'distance')
    nclusts = np.unique(cluster_labels).shape[0]

    save_results_clusters("res_{}_{:03d}_clust.csv".format(method, nclusts),
                          sample_names, cluster_labels)

    try:
        silhouette_list = silhouette_samples(dist_matrix, cluster_labels,
                                             metric="precomputed")
        silhouette_avg = np.mean(silhouette_list)
        x = max(cluster_labels) if mode == 'clusters' else threshold
    except ValueError as e:
        if max(cluster_labels) == 1:
            x = 1 if mode == 'clusters' else threshold
            silhouette_avg = 0
        else:
            raise(e)

    return x, silhouette_avg
utils.py 文件源码 项目:lens 作者: ASIDataScience 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    if len(columns) > 2:
        pairwise_dists = distance.pdist(
            np.where(np.isnan(correlation_matrix), 0, correlation_matrix),
            metric='euclidean')
        linkage = hierarchy.linkage(pairwise_dists, method='average')
        dendogram = hierarchy.dendrogram(
            linkage, no_plot=True, color_threshold=-np.inf)
        idx = dendogram['leaves']
    else:
        idx = list(range(len(columns)))

    return idx
visualMovieAnalysis2.py 文件源码 项目:recognizeFitExercise 作者: tyiannak 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def anglesCluster(angles):
    # this function uses hierarchical clustering to find the clusters of a set of angle values
    Dists = dist.pdist(angles, angleDiff)
    linkageMatrix = hier.linkage(Dists, metric = angleDiff)
    C = hier.fcluster(linkageMatrix, 5, 'maxclust')

    return C
LogClustering_HDFS.py 文件源码 项目:loglizer 作者: logpai 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def clustering(partSeqList, partData):
    print('clustering for the seperated dataset')
    #simiMatrix = simiMatrixCal(partData)
    '''Invoke the clustering method in library'''
    data_dist = pdist(partData,metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print ('there are altogether %d clusters in this initial clustering'%(len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus=[[] for i in range(clusNum)]  #initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i]-1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels,instIndexPerClus
LogClustering_BGL.py 文件源码 项目:loglizer 作者: logpai 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def clustering(partSeqList, partData):
    '''Invoke the clustering method in library'''

    print('clustering for the seperated dataset')
    data_dist = pdist(partData,metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering'%(len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus=[[] for i in range(clusNum)]  #initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i]-1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels,instIndexPerClus
hu2004.py 文件源码 项目:opminreplicability 作者: epochx 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure
    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering
    Output:
        transactions: new version of the transactions, where each item has been
                      replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item
        in the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)
    l = [((a, b), 100-Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100-match, "distance")
    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]

    word_clusters = {word: clusters[clusters_index[i]]
                     for i, word in enumerate(words)}
    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set(([word_clusters[word][0]
                                      for word in transaction])))
        new_transactions.append(new_transaction)
    return new_transactions, word_clusters
test_clustering.py 文件源码 项目:HiCembler 作者: lpryszcz 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def get_subtrees0(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    """Recomputing linkage is slower than pruning the tree for large matrices"""
    maxtdist = 0
    i = 0
    subtrees = []
    for i in range(1, nchrom):
        Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
        names = get_names(bin_chr, bin_position)
        t = distance_matrix2tree(Z, names)
        #tree = sch.to_tree(Z, False); t = ete3.Tree(getNewick(tree, "", tree.dist, names))

        # get longest branch
        n, bestdist = get_longest(t)
        tname, tdist = t.get_farthest_leaf()#[1]
        if maxtdist < tdist:
            maxtdist = t.get_farthest_leaf()[1]
        # break if small subtree
        if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist*distfrac:
            break

        pruned = n.get_leaf_names()         
        subtrees.append(pruned)

        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)        
        t = truncate(t, maxd=5); t.render('tree_%s.pdf'%i)
        # prune array        
        indices = np.array([False if name in set(pruned) else True for _i, name in enumerate(names)])
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]

    if i:    
        subtrees.append(t.get_leaf_names())
        pruned = t.get_leaf_names()
        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)        
    return subtrees
array2scaffolds.py 文件源码 项目:HiCembler 作者: lpryszcz 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def array2scaffolds(outbase, infile, infile2, fasta, threads, minWindows, scaffolds=[]):
    """Return scaffolds computed for given matrix"""
    logger("Loading FastA...")
    faidx = FastaIndex(fasta)
    logger(" %s bp in %s contigs"%(faidx.genomeSize, len(faidx)))

    logger("Loading matrix from %s ..."%infile)
    d, bin_chr, bin_position, contig2size = load_matrix(infile, scaffolds=scaffolds, verbose=1)
    logger(" matrix of %s windows for %s contigs summing %s bp"%(d.shape[0], len(contig2size), sum(contig2size.values())))

    # make sure all contigs from matrix are present in FastA
    diff = set(contig2size.keys()).difference(faidx)
    if diff:
        sys.stderr.write("[ERROR] %s / %s contigs are missing from provided FastA!\n"%(len(diff), len(contig2size)))
        sys.exit(1)

    logger("Calculating linkage matrix & tree...")
    names = ["%s %7sk"%(get_name(c), s/1000) for c, (s, e) in zip(bin_chr, bin_position)]
    d = transform(d)
    t = array2tree(d, names, outbase)
    del d

    logger("Assigning contigs to clusters/scaffolds...") 
    clusters = get_clusters(outbase, t, contig2size, bin_chr)

    logger("Constructing scaffolds...")
    if threads > 1: 
        scaffolds = clusters2scaffolds_multi(clusters, infile2, minWindows, scaffolds, threads)
    else:
        scaffolds = clusters2scaffolds(clusters, infile2, minWindows, scaffolds)

    logger("Reporting %s scaffolds..."%len(scaffolds))
    fastafn = report_scaffolds(outbase, infile, scaffolds, faidx)

    return scaffolds, fastafn
clustering.py 文件源码 项目:clust 作者: BaselAbujamous 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def chc(X, K, params=()):
    pnames = ['linkage_method',  'distance']
    dflts  = [          'ward', 'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)


# Other related functions
knobs_into_holes.py 文件源码 项目:isambard 作者: woolfson-group 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def cluster_helices(helices, cluster_distance=12.0):
    """ Clusters helices according to the minimum distance between the line segments representing their backbone.

    Notes
    -----
    Each helix is represented as a line segement joining the CA of its first Residue to the CA if its final Residue.
    The minimal distance between pairwise line segments is calculated and stored in a condensed_distance_matrix.
    This is clustered using the 'single' linkage metric
     (all members of cluster i are at < cluster_distance away from at least one other member of cluster i).
    Helices belonging to the same cluster are grouped together as values of the returned cluster_dict.

    Parameters
    ----------
    helices: Assembly
    cluster_distance: float

    Returns
    -------
    cluster_dict: dict
        Keys: int
            cluster number
        Values: [Polymer]

    """
    condensed_distance_matrix = []
    for h1, h2 in itertools.combinations(helices, 2):
        md = minimal_distance_between_lines(h1[0]['CA']._vector, h1[-1]['CA']._vector,
                                            h2[0]['CA']._vector, h2[-1]['CA']._vector, segments=True)
        condensed_distance_matrix.append(md)
    z = linkage(condensed_distance_matrix, method='single')
    clusters = fcluster(z, t=cluster_distance, criterion='distance')
    cluster_dict = {}
    for h, k in zip(helices, clusters):
        if k not in cluster_dict:
            cluster_dict[k] = [h]
        else:
            cluster_dict[k].append(h)
    return cluster_dict
figs.py 文件源码 项目:rnnlab 作者: phueb 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def make_multi_hierarch_cluster_figs(model, field_input):
    def make_multi_cat_clust_fig(cats, freq_thr=500):  # TODO make into config
        """
        Returns fig showing hierarchical clustering of probes from multiple categories
        """
        start = time.time()
        sns.set_style('white')
        # make cat_acts_mat
        acts_mats = []
        cats_probe_list = []
        for cat in cats:
            bool_index = [True if sum(model.term_doc_freq_dict[probe]) > freq_thr else False
                          for probe in model.probe_store.cat_probe_list_dict[cat]]

            cat_probe_acts_df = model.get_single_cat_acts_df(cat)
            filtered_cat_probes_acts_mat = cat_probe_acts_df[bool_index].values
            acts_mats.append(filtered_cat_probes_acts_mat)
            cats_probe_list += [model.probe_store.probe_set[probe_id]
                                for probe_id in cat_probe_acts_df[bool_index].index.tolist()]
        cat_acts_mat = np.vstack((mat for mat in acts_mats))
        # fig
        rcParams['lines.linewidth'] = 2.0
        fig, ax = plt.subplots(figsize=(FigsConfigs.MAX_FIG_WIDTH, 5 * len(cats)), dpi=FigsConfigs.DPI)
        # dendrogram
        dist_matrix = pdist(cat_acts_mat, 'euclidean')
        linkages = linkage(dist_matrix, method='complete')
        dendrogram(linkages,
                   ax=ax,
                   labels=cats_probe_list,
                   orientation='right',
                   leaf_font_size=10)
        ax.tick_params(axis='both', which='both', top='off', right='off', left='off')
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        print('{} completed in {:.1f} secs'.format(sys._getframe().f_code.co_name, time.time() - start))
        return fig

    figs = [make_multi_cat_clust_fig(field_input)]
    return figs
test_hierarchical.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_height_linkage_tree():
    # Check that the height of the results of linkage tree is sorted.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for linkage_func in _TREE_BUILDERS.values():
        children, n_nodes, n_leaves, parent = linkage_func(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)


问题


面经


文章

微信
公众号

扫码关注公众号