def __init__(self, words, vectors, number_of_steps=21, metric="cosine", linkage="complete"):
    self.words = words
    self.vectors = vectors
    self.number_of_steps = number_of_steps
    self.metric = metric
    self.linkage = linkage
def __call__(self):
    # assumes module-level imports: numpy as np,
    # scipy.spatial.distance as scidist, scipy.cluster.hierarchy as hier
    if len(self.words) == 0 or len(self.vectors) == 0:
        return []
    distance_matrix = scidist.pdist(np.array(self.vectors), self.metric)
    linkage_matrix = hier.linkage(distance_matrix, self.linkage)
    dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix, self.words, self.vectors)
    clusterings = self._create_clusterings(dendrogram)
    return [[(node.label, node.vector) for node in _get_cluster_nodes(cluster)]
            for cluster in self._find_optimal_clustering(clusterings)]
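# A minimal sketch of the pdist -> linkage pipeline that __call__ uses above,
# assuming scidist and hier alias scipy.spatial.distance and
# scipy.cluster.hierarchy; the toy vectors are made up for illustration.
import numpy as np
import scipy.spatial.distance as scidist
import scipy.cluster.hierarchy as hier

toy_vectors = np.random.rand(5, 3)                # five 3-d "word vectors"
condensed = scidist.pdist(toy_vectors, "cosine")  # condensed distance matrix
Z = hier.linkage(condensed, "complete")           # linkage matrix, shape (n-1, 4)
print(Z.shape)                                    # -> (4, 4)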
def test_denrogram_children():
    # temporary solution for
    # https://stackoverflow.com/questions/40239956/node-indexing-in-hierarachical-clustering-dendrograms
    import numpy as np
    from numpy.testing import assert_allclose
    from scipy.cluster.hierarchy import dendrogram, linkage
    from freediscovery.cluster import _DendrogramChildren
    # generate two clusters: a with 10 points, b with 5:
    np.random.seed(1)
    a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]],
                                      size=[10, ])
    b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]],
                                      size=[5, ])
    X = np.concatenate((a, b),)
    Z = linkage(X, 'ward')
    # make distances between pairs of children uniform
    # (re-scales the horizontal (distance) axis when plotting)
    Z[:, 2] = np.arange(Z.shape[0]) + 1
    ddata = dendrogram(Z, no_plot=True)
    dc = _DendrogramChildren(ddata)
    idx = 0
    # check that we can compute children for all nodes
    for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        node_children = dc.query(idx)
        idx += 1
    # the last (top-level) node should encompass all samples
    assert len(node_children) == X.shape[0]
    assert_allclose(sorted(node_children), np.arange(X.shape[0]))
def test_binary_linkage2clusters():
    import numpy as np
    from freediscovery.cluster.utils import _binary_linkage2clusters
    from sklearn.metrics import v_measure_score
    n_samples = 10
    linkage = np.array([[1, 2],
                        [2, 3],
                        [5, 7],
                        [6, 9]])
    cluster_id = _binary_linkage2clusters(linkage, n_samples)
    cluster_id_ref = np.array([0, 1, 1, 1, 2, 3, 4, 3, 5, 4])
    assert cluster_id.shape == cluster_id_ref.shape
    # a v-measure of 1.0 means the two labelings define the same clusters
    assert v_measure_score(cluster_id, cluster_id_ref) == 1.0
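# Illustration only: a tiny union-find that reproduces the grouping the test
# above expects from _binary_linkage2clusters; the actual freediscovery
# implementation may differ internally.
import numpy as np

def binary_pairs_to_clusters(pairs, n_samples):
    parent = list(range(n_samples))
    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path compression
            x = parent[x]
        return x
    for a, b in pairs:
        parent[find(a)] = find(b)          # union the two components
    roots = [find(i) for i in range(n_samples)]
    relabel = {}                           # consecutive ids by first appearance
    return np.array([relabel.setdefault(r, len(relabel)) for r in roots])

print(binary_pairs_to_clusters([[1, 2], [2, 3], [5, 7], [6, 9]], 10))
# -> [0 1 1 1 2 3 4 3 5 4], matching cluster_id_ref above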
def get_k(clustering, depth=10):
    """
    (ndarray, int) -> int
    clustering: ndarray -- linkage matrix representing hierarchical clustering
    depth: int -- the maximum depth to traverse clustering
    Returns the number of clusters to extract from the hierarchical clustering
    using the elbow method.
    """
    last = clustering[-depth:, 2]
    acceleration = np.diff(last, 2)
    acceleration_rev = acceleration[::-1]
    k = acceleration_rev.argmax() + 2
    return k
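# Usage sketch for get_k on three well-separated toy blobs; with clean data
# like this the elbow heuristic typically returns 3.
import numpy as np
from scipy.cluster import hierarchy

np.random.seed(0)
blobs = np.vstack([np.random.randn(20, 2) + c for c in ([0, 0], [6, 6], [0, 6])])
Z = hierarchy.linkage(blobs, 'ward')
print(get_k(Z, depth=10))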
import re
import numpy as np
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans, AffinityPropagation

def get_cluster_assignments(sim_matrix, parameters):
    """
    (np.array, list of str) -> list of int
    sim_matrix: list of list of float -- similarity matrix between exemplars
    parameters: list of parameters in the format ["method:method_name",
                "algo:algo_name", "k:num_clusters", "damping:damping"]
                where order doesn't matter
                (k and damping only relevant for certain clustering methods)
                the possible values for each parameter are listed in the
                function below.
    Returns a list of integers. The integer at each index of the list corresponds
    to the cluster number of the exemplar at the same index in sim_matrix.
    """
    algorithm = next((re.split(':', f)[1] for f in parameters if f[:4] == 'algo'), 'ap')
    # from { 'hierarchical', 'kmeans', 'ap', 'ward' }
    method = next((re.split(':', f)[1] for f in parameters if f[:6] == 'method'), 'single')
    # from {'single', 'complete', 'average'} (only relevant for hierarchical clustering)
    kMk = next((int(re.split(':', f)[1]) for f in parameters if f[:1] == 'k'), 8)
    # any integer <= the data length
    damping = next((float(re.split(':', f)[1]) for f in parameters if f[:7] == 'damping'), 0.5)
    # only relevant for AP -- in [0.5, 1]; note the prefix check must be
    # f[:7] (not f[:4]) so it can actually match 'damping'
    #
    if algorithm == 'hierarchical':
        clustering = hierarchy.linkage(sim_matrix, method)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    elif algorithm == 'kmeans':
        cluster_assignments = KMeans(n_clusters=kMk).fit_predict(sim_matrix)
    elif algorithm == 'ap':
        cluster_assignments = AffinityPropagation(damping=damping).fit_predict(sim_matrix)
    elif algorithm == 'ward':
        clustering = hierarchy.ward(sim_matrix)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    return cluster_assignments
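# Hypothetical invocation of get_cluster_assignments using the "key:value"
# parameter strings the docstring describes; the similarity matrix is random
# filler just to show the call shape.
sim = np.random.rand(12, 12)
sim = (sim + sim.T) / 2                       # symmetrise
print(get_cluster_assignments(sim, ["algo:kmeans", "k:3"]))
print(get_cluster_assignments(sim, ["algo:ap", "damping:0.9"]))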
def HierarchicalClustering(self, data):
    # assumes: import networkx as nx and
    # from scipy.cluster.hierarchy import linkage, dendrogram
    distances = nx.to_numpy_matrix(data)
    hierarchy = linkage(distances)
    print(hierarchy, "HIERARCHY")
    Z = dendrogram(hierarchy)
    print(Z)
    return hierarchy
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

def main():
    D = 2  # so we can visualize it more easily
    s = 4  # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])
    N = 900  # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3
    Z = linkage(X, 'ward')
    print("Z.shape:", Z.shape)
    # Z has the format [idx1, idx2, dist, sample_count]
    # therefore, its size will be (N-1, 4)
    plt.title("Ward")
    dendrogram(Z)
    plt.show()
    Z = linkage(X, 'single')
    plt.title("Single")
    dendrogram(Z)
    plt.show()
    Z = linkage(X, 'complete')
    plt.title("Complete")
    dendrogram(Z)
    plt.show()
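# Optional numeric companion to the visual comparison in main(): the
# cophenetic correlation measures how faithfully each linkage preserves the
# original pairwise distances (closer to 1 is better). This is an added
# illustration, not part of the original walkthrough.
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

def compare_linkages(X):
    d = pdist(X)
    for method in ('ward', 'single', 'complete'):
        c, _ = cophenet(linkage(X, method), d)
        print(method, round(c, 3))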
def single_silhouette_dendrogram(dist_matrix, Z, threshold, mode='clusters',
                                 method='single', sample_names=None):
    """Compute the average silhouette at a given threshold.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed distance matrix between points.
    Z : array-like
        Linkage matrix, result of scipy.cluster.hierarchy.linkage.
    threshold : float
        Specifies where to cut the dendrogram.
    mode : ('clusters', 'thresholds'), optional
        Choose what to visualise on the x-axis.
    method : str, optional
        Linkage method name, used only to label the results file.
    sample_names : array-like, optional
        Names written alongside the cluster labels.

    Returns
    -------
    x : float
        Depending on mode, contains the number of clusters or the threshold.
    silhouette_avg : float
        The average silhouette.
    """
    cluster_labels = fcluster(Z, threshold, 'distance')
    nclusts = np.unique(cluster_labels).shape[0]
    save_results_clusters("res_{}_{:03d}_clust.csv".format(method, nclusts),
                          sample_names, cluster_labels)
    try:
        silhouette_list = silhouette_samples(dist_matrix, cluster_labels,
                                             metric="precomputed")
        silhouette_avg = np.mean(silhouette_list)
        x = max(cluster_labels) if mode == 'clusters' else threshold
    except ValueError:
        if max(cluster_labels) == 1:
            # the silhouette is undefined when everything is one cluster
            x = 1 if mode == 'clusters' else threshold
            silhouette_avg = 0
        else:
            raise
    return x, silhouette_avg
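# Usage sketch for single_silhouette_dendrogram; save_results_clusters is an
# external helper from the original code base, stubbed out here so the call
# below is self-contained.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_samples

def save_results_clusters(filename, sample_names, cluster_labels):
    pass  # hypothetical stub; the original writes a CSV

X = np.random.rand(30, 4)
condensed = pdist(X)
x, avg = single_silhouette_dendrogram(squareform(condensed),
                                      linkage(condensed, 'single'),
                                      threshold=0.5)
print(x, avg)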
import numpy as np
from scipy.spatial import distance
from scipy.cluster import hierarchy

def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    if len(columns) > 2:
        pairwise_dists = distance.pdist(
            np.where(np.isnan(correlation_matrix), 0, correlation_matrix),
            metric='euclidean')
        linkage = hierarchy.linkage(pairwise_dists, method='average')
        dendrogram = hierarchy.dendrogram(
            linkage, no_plot=True, color_threshold=-np.inf)
        idx = dendrogram['leaves']
    else:
        idx = list(range(len(columns)))
    return idx
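# Quick usage sketch: reorder three columns so the most-correlated pair ends
# up adjacent; the correlation values here are invented.
corr = np.array([[1.0, 0.1, 0.9],
                 [0.1, 1.0, 0.2],
                 [0.9, 0.2, 1.0]])
print(hierarchical_ordering_indices(['a', 'b', 'c'], corr))  # e.g. [1, 0, 2]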
def anglesCluster(angles):
    # this function uses hierarchical clustering to find the clusters of a
    # set of angle values; assumes scipy.spatial.distance as dist,
    # scipy.cluster.hierarchy as hier, and a pairwise angleDiff metric
    Dists = dist.pdist(angles, angleDiff)
    # Dists is already a condensed distance matrix, so linkage needs no metric
    # argument (the original metric=angleDiff keyword was silently ignored);
    # the default 'single' method is used
    linkageMatrix = hier.linkage(Dists)
    C = hier.fcluster(linkageMatrix, 5, 'maxclust')
    return C
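# Toy call with a hypothetical angleDiff metric (circular difference in
# degrees); the original helper's definition isn't shown in this snippet.
import numpy as np
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier

def angleDiff(u, v):
    delta = abs(u[0] - v[0]) % 360.0
    return min(delta, 360.0 - delta)

angles = np.array([[1.0], [2.0], [3.0], [90.0], [91.0], [180.0], [270.0], [359.0]])
print(anglesCluster(angles))   # e.g. 1, 2 and 3 degrees fall in one cluster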
def clustering(partSeqList, partData):
    print('clustering for the separated dataset')
    #simiMatrix = simiMatrixCal(partData)
    '''Invoke the clustering method in library'''
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering' % (len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
def clustering(partSeqList, partData):
    '''Invoke the clustering method in library'''
    print('clustering for the separated dataset')
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering' % (len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
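# The two clustering() variants above depend on module-level names (pdist,
# linkage, fcluster, np, para, distCalculate); these stand-ins are assumptions
# made so the call below runs end to end.
import numpy as np
from scipy.spatial.distance import pdist, euclidean
from scipy.cluster.hierarchy import linkage, fcluster

para = {'max_d': 1.0}        # hypothetical distance cut-off
distCalculate = euclidean    # hypothetical stand-in for the real metric

labels, groups = clustering(list(range(6)), np.random.rand(6, 3))
print(labels, groups)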
import Levenshtein
from itertools import combinations
from scipy.cluster.hierarchy import linkage, fcluster

def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure.

    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering

    Output:
        transactions: new version of the transactions, where each item has been
            replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item
            in the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)
    # Levenshtein.ratio returns a similarity in [0, 1], so scale it to the
    # 0-100 range the match threshold uses before converting to a distance
    l = [((a, b), 100 - 100 * Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100 - match, "distance")
    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]
    word_clusters = {word: clusters[clusters_index[i]]
                     for i, word in enumerate(words)}
    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set([word_clusters[word][0]
                                     for word in transaction]))
        new_transactions.append(new_transaction)
    return new_transactions, word_clusters
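# Toy run: near-identical spellings collapse onto one representative item.
# The match threshold of 85 is arbitrary here.
tx = [("apple", "bananna"), ("aple", "banana"), ("cherry",)]
new_tx, clusters = _transactions_fuzzy_matching(tx, match=85)
print(new_tx)   # "aple"/"apple" and "banana"/"bananna" merge; "cherry" stays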
def get_subtrees0(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    """Recomputing linkage is slower than pruning the tree for large matrices"""
    maxtdist = 0
    i = 0
    subtrees = []
    for i in range(1, nchrom):
        Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
        names = get_names(bin_chr, bin_position)
        t = distance_matrix2tree(Z, names)
        #tree = sch.to_tree(Z, False); t = ete3.Tree(getNewick(tree, "", tree.dist, names))
        # get longest branch
        n, bestdist = get_longest(t)
        tname, tdist = t.get_farthest_leaf()
        if maxtdist < tdist:
            maxtdist = tdist
        # break if small subtree
        if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist * distfrac:
            break
        pruned = n.get_leaf_names()
        subtrees.append(pruned)
        c = Counter(get_chr_name(n) for n in pruned)
        print(i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5))
        t = truncate(t, maxd=5)
        t.render('tree_%s.pdf' % i)
        # prune array
        indices = np.array([name not in set(pruned) for name in names])
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]
    if i:
        subtrees.append(t.get_leaf_names())
        pruned = t.get_leaf_names()
        c = Counter(get_chr_name(n) for n in pruned)
        print(i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5))
    return subtrees
def array2scaffolds(outbase, infile, infile2, fasta, threads, minWindows, scaffolds=[]):
    """Return scaffolds computed for given matrix"""
    logger("Loading FastA...")
    faidx = FastaIndex(fasta)
    logger(" %s bp in %s contigs" % (faidx.genomeSize, len(faidx)))
    logger("Loading matrix from %s ..." % infile)
    d, bin_chr, bin_position, contig2size = load_matrix(infile, scaffolds=scaffolds, verbose=1)
    logger(" matrix of %s windows for %s contigs summing %s bp" % (d.shape[0], len(contig2size), sum(contig2size.values())))
    # make sure all contigs from matrix are present in FastA
    diff = set(contig2size.keys()).difference(faidx)
    if diff:
        sys.stderr.write("[ERROR] %s / %s contigs are missing from provided FastA!\n" % (len(diff), len(contig2size)))
        sys.exit(1)
    logger("Calculating linkage matrix & tree...")
    # integer division keeps the 'k' position labels whole under Python 3
    names = ["%s %7sk" % (get_name(c), s // 1000) for c, (s, e) in zip(bin_chr, bin_position)]
    d = transform(d)
    t = array2tree(d, names, outbase)
    del d
    logger("Assigning contigs to clusters/scaffolds...")
    clusters = get_clusters(outbase, t, contig2size, bin_chr)
    logger("Constructing scaffolds...")
    if threads > 1:
        scaffolds = clusters2scaffolds_multi(clusters, infile2, minWindows, scaffolds, threads)
    else:
        scaffolds = clusters2scaffolds(clusters, infile2, minWindows, scaffolds)
    logger("Reporting %s scaffolds..." % len(scaffolds))
    fastafn = report_scaffolds(outbase, infile, scaffolds, faidx)
    return scaffolds, fastafn
def chc(X, K, params=()):
    pnames = ['linkage_method', 'distance']
    dflts = ['ward', 'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)
    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)
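# The core of chc without the project-specific helpers (ds,
# clustVec2partMat): linkage + fcluster, then a one-hot partition matrix like
# the one clustVec2partMat presumably builds. Illustrative assumptions only.
import numpy as np
import scipy.cluster.hierarchy as sphc

X_demo = np.random.rand(20, 5)
C_demo = sphc.fcluster(sphc.linkage(X_demo, method='ward', metric='euclidean'),
                       4, criterion='maxclust')
U_demo = np.eye(4, dtype=bool)[C_demo - 1]   # rows = samples, cols = clusters
print(U_demo.shape)                           # (20, 4)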
# Other related functions
def cluster_helices(helices, cluster_distance=12.0):
    """Clusters helices according to the minimum distance between the line segments representing their backbone.

    Notes
    -----
    Each helix is represented as a line segment joining the CA of its first Residue to the CA of its final Residue.
    The minimal distance between pairwise line segments is calculated and stored in a condensed_distance_matrix.
    This is clustered using the 'single' linkage method
    (all members of cluster i are at < cluster_distance away from at least one other member of cluster i).
    Helices belonging to the same cluster are grouped together as values of the returned cluster_dict.

    Parameters
    ----------
    helices: Assembly
    cluster_distance: float

    Returns
    -------
    cluster_dict: dict
        Keys: int
            cluster number
        Values: [Polymer]
    """
    condensed_distance_matrix = []
    for h1, h2 in itertools.combinations(helices, 2):
        md = minimal_distance_between_lines(h1[0]['CA']._vector, h1[-1]['CA']._vector,
                                            h2[0]['CA']._vector, h2[-1]['CA']._vector, segments=True)
        condensed_distance_matrix.append(md)
    z = linkage(condensed_distance_matrix, method='single')
    clusters = fcluster(z, t=cluster_distance, criterion='distance')
    cluster_dict = {}
    for h, k in zip(helices, clusters):
        if k not in cluster_dict:
            cluster_dict[k] = [h]
        else:
            cluster_dict[k].append(h)
    return cluster_dict
def make_multi_hierarch_cluster_figs(model, field_input):
    def make_multi_cat_clust_fig(cats, freq_thr=500):  # TODO make into config
        """
        Returns fig showing hierarchical clustering of probes from multiple categories
        """
        start = time.time()
        sns.set_style('white')
        # make cat_acts_mat
        acts_mats = []
        cats_probe_list = []
        for cat in cats:
            bool_index = [sum(model.term_doc_freq_dict[probe]) > freq_thr
                          for probe in model.probe_store.cat_probe_list_dict[cat]]
            cat_probe_acts_df = model.get_single_cat_acts_df(cat)
            filtered_cat_probes_acts_mat = cat_probe_acts_df[bool_index].values
            acts_mats.append(filtered_cat_probes_acts_mat)
            cats_probe_list += [model.probe_store.probe_set[probe_id]
                                for probe_id in cat_probe_acts_df[bool_index].index.tolist()]
        cat_acts_mat = np.vstack(acts_mats)  # vstack takes the list directly
        # fig
        rcParams['lines.linewidth'] = 2.0
        fig, ax = plt.subplots(figsize=(FigsConfigs.MAX_FIG_WIDTH, 5 * len(cats)), dpi=FigsConfigs.DPI)
        # dendrogram
        dist_matrix = pdist(cat_acts_mat, 'euclidean')
        linkages = linkage(dist_matrix, method='complete')
        dendrogram(linkages,
                   ax=ax,
                   labels=cats_probe_list,
                   orientation='right',
                   leaf_font_size=10)
        ax.tick_params(axis='both', which='both', top='off', right='off', left='off')
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        print('{} completed in {:.1f} secs'.format(sys._getframe().f_code.co_name, time.time() - start))
        return fig
    figs = [make_multi_cat_clust_fig(field_input)]
    return figs
def test_height_linkage_tree():
    # Check that the height of the results of linkage tree is sorted.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool is deprecated; use the builtin
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for linkage_func in _TREE_BUILDERS.values():
        children, n_nodes, n_leaves, parent = linkage_func(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)