def test_cuttreeHybrid():
    """Check the labels produced by ``cutreeHybrid`` on a fixed 100-point dataset.

    The data are the columns of the 100 x 100 grid holding 1..10000, clustered
    with average linkage on Euclidean distances.  The expected labelling is
    32 points in cluster 2, then 32 in cluster 3, then 36 in cluster 1.
    """
    from dynamicTreeCut import cutreeHybrid

    d = np.transpose(np.arange(1, 10001).reshape(100, 100))
    distances = pdist(d, "euclidean")
    link = linkage(distances, "average")
    test = cutreeHybrid(link, distances)
    true = [2] * 32 + [3] * 32 + [1] * 36
    assert (test['labels'] == true).all()
    # The former trailing ``assert False`` made this test fail unconditionally,
    # regardless of the labels, so it has been removed.
python类linkage()的实例源码
def formClusters(dists, link, distance):
    """Cut a hierarchical clustering of the input distances at a fixed height.

    :param dists: distance data; it is passed through ``squareform`` before
        being handed to ``linkage``
    :param link: linkage method name forwarded to ``linkage``
    :param distance: threshold forwarded to ``fcluster`` with
        ``criterion='distance'``
    :return: flat cluster assignments as returned by ``fcluster``
    """
    # NOTE(review): ``squareform`` converts a square matrix to condensed form
    # and a condensed vector to a square matrix -- confirm which form callers
    # pass, since ``linkage`` interprets a 2-D input as observation vectors.
    tree = linkage(squareform(dists), link)
    return fcluster(tree, distance, criterion='distance')
def generate_graphs(clusters_list, output, size, linkage, cutoff,distances):
    """
    DESCRIPTION
    Produce the cluster plots: bar plot, dendrogram, histogram and, when the
    distance matrix has fewer than 10000 rows, the distance-matrix image.

    Args:
        clusters_list (list): cluster number per frame
        output (string): output name handed to every plotting helper
        size: forwarded to ``plot_barplot``
        linkage: forwarded to ``plot_dendro``
        cutoff: forwarded to ``plot_dendro``
        distances: array-like with a ``shape`` attribute; plotted by
            ``implot`` only when ``shape[0] < 10000``
    Return:
        colors_list: the value returned by ``plot_barplot``, also passed to
        the dendrogram and histogram helpers
    """
    colors_list = plot_barplot(clusters_list, output, size)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)

    if distances.shape[0] >= 10000:
        # Large inputs only get a log message instead of the matrix image.
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    else:
        implot(distances, output)

    return colors_list
def linkage(df, n_groups):
    """Build a Ward linkage matrix from pairwise Frobenius distances.

    Group ``k`` (``k = 1 .. n_groups``) is read as ``df.loc[k].values``; the
    distance between two groups is ``norm(A - B, ord='fro')``.

    Only the strict upper triangle is computed, in row-major order, which is
    exactly the condensed vector the previous implementation extracted with
    ``np.triu_indices(n_groups, 1)``.  This avoids filling the symmetric full
    matrix and no longer relies on ``np.NAN``, which NumPy 2.0 removed.

    :param df: DataFrame indexed so that ``df.loc[k]`` selects group ``k``
        (assumes each selection is 2-D and all have the same shape -- confirm
        against the caller)
    :param n_groups: number of groups, labelled ``1 .. n_groups``
    :return: linkage matrix returned by ``hac.linkage``
    """
    groups = [df.loc[k + 1].values for k in range(n_groups)]

    # condensed distance matrix as vector for linkage (upper triangle only)
    y = np.array(
        [
            norm(groups[i] - groups[j], ord='fro')
            for i in range(n_groups)
            for j in range(i + 1, n_groups)
        ],
        dtype=float,
    )

    # create linkage matrix with Ward's algorithm
    Z = hac.linkage(y, method='ward', metric='euclidean')
    return Z
methods.py 文件源码
项目:South-African-Heart-Disease-data-analysis-using-python
作者: khushi4tiwari
项目源码
文件源码
阅读 26
收藏 0
点赞 0
评论 0
def hierarchicalClustering(X,y,Maxclust, C, Method = 'single', Metric = 'euclidean'):
    """Cluster ``X`` hierarchically, then plot the clusters and the dendrogram.

    :param X: data matrix forwarded to ``linkage``
    :param y: labels forwarded to ``clusterPlot``
    :param Maxclust: maximum number of flat clusters (``fcluster`` with
        ``criterion='maxclust'``)
    :param C: forwarded unchanged to ``clusterPlot``
    :param Method: linkage method name
    :param Metric: distance metric name
    :return: None; two figures are created and ``show()`` is called
    """
    # Perform hierarchical/agglomerative clustering on data matrix
    Z = linkage(X, method=Method, metric=Metric)

    # Compute and display clusters by thresholding the dendrogram
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    figure()
    clusterPlot(X, cls.reshape(cls.shape[0], 1), Maxclust, C, y=y)

    # Display dendrogram, truncated to a fixed number of levels and coloured
    # below half of the largest merge distance
    max_display_levels = 7
    figure()
    dendrogram(Z, truncate_mode='level', p=max_display_levels,
               color_threshold=0.5 * np.max(Z[:, 2]))
    # Fixed the misspelled plot title ("Dendrgram").
    title("Dendrogram of the Hierarchical Clustering")
    show()
def array2tree(d, names, outbase="", method="ward"):
    """Cluster a square array and return the tree built from the linkage.

    The strict upper triangle of ``d`` is clustered with ``sch.linkage`` and
    converted with ``distance_matrix2tree``.  When ``outbase`` is non-empty,
    the tree is also written to ``outbase + ".nw"`` and rendered to
    ``outbase + ".nw.pdf"``.
    """
    upper = np.triu_indices(d.shape[0], 1)
    link_matrix = sch.linkage(d[upper], method=method)
    tree = distance_matrix2tree(link_matrix, names)

    if not outbase:
        return tree

    # save the textual tree, then render it without leaf names
    newick_path = outbase + ".nw"
    pdf_path = outbase + ".nw.pdf"
    with open(newick_path, "w") as out:
        out.write(tree.write())

    style = ete3.TreeStyle()
    style.show_leaf_name = False
    style.layout_fn = mylayout
    tree.render(pdf_path, tree_style=style)
    return tree
def test_linkage_misc():
    """Miscellaneous checks on linkage handling: bad arguments and affinities."""
    X = np.random.RandomState(42).normal(size=(5, 5))

    # Invalid linkage names and a wrongly shaped connectivity must be rejected
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix must give the same tree as
    # affinity="cosine" on the raw data
    precomputed = linkage_tree(cosine_distances(X), affinity="precomputed")
    assert_array_equal(precomputed[0], linkage_tree(X, affinity="cosine")[0])

    # A callable affinity must give the same tree as its string name
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(from_callable[0],
                       linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
    """Check every tree builder on structured (connectivity-constrained) input."""
    rng = np.random.RandomState(0)
    # ``np.bool`` was only an alias of the builtin and was removed in
    # NumPy 1.24, so the builtin ``bool`` is used instead.
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)
        # Check that the builder raises a ValueError with a connectivity
        # matrix of the wrong shape
        assert_raises(ValueError,
                      tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError,
                      tree_builder, X.T[:0], connectivity)
def test_unstructured_linkage_tree():
    """Check tree sizes for unstructured trees from ward_tree and all builders."""
    X = np.random.RandomState(0).randn(50, 100)
    expected_nodes = 2 * X.shape[1] - 1

    def check_builder(builder):
        # n_clusters is passed only to trigger, and thereby test, the
        # UserWarning emitted when no connectivity is given.
        for sample in (X, X[0]):
            with ignore_warnings():
                children, _, n_leaves, _ = assert_warns(
                    UserWarning, builder, sample.T, n_clusters=10)
            assert_equal(len(children) + n_leaves, expected_nodes)

    check_builder(ward_tree)
    for tree_builder in _TREE_BUILDERS.values():
        check_builder(tree_builder)
def test_scikit_vs_scipy():
    """Compare scikit linkage with full connectivity (unstructured) to scipy."""
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for _ in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # ``np.int`` was only an alias of the builtin and was removed in
            # NumPy 1.24, so the builtin ``int`` is used instead.
            children_ = out[:, :2].astype(int)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut (uses the tree from the last iteration)
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def tree(self):
    """Build, store and return the complete-linkage tree of ``self.ccTable``.

    Each non-empty entry of ``self.ccTable`` is read as ``(i, j, value)`` and
    written symmetrically into a ``Dimension x Dimension`` matrix; entries
    that are ``None`` or empty are skipped and missing pairs stay at 0.  The
    strict upper triangle, in row-major order, is the condensed distance
    vector given to ``hierarchy.linkage``.

    :return: the linkage matrix, also stored in ``self.Tree``
    """
    size = self.Dimension
    Matrix = np.zeros((size, size))
    for line in self.ccTable:
        # ``!=`` instead of ``is not``: identity comparison against an int
        # literal only works by accident of small-int caching.
        if line is not None and len(line) != 0:
            Matrix[line[0], line[1]] = line[2]
            Matrix[line[1], line[0]] = line[2]

    Distances = Matrix[np.triu_indices(size, 1)].astype(float)
    self.Tree = hierarchy.linkage(Distances, 'complete')
    return self.Tree
#new function, chose the average linkage
def avgTree(self):
    """Build, store and return the average-linkage tree of ``self.ccTable``.

    Each non-empty entry of ``self.ccTable`` is read as ``(i, j, value)`` and
    written symmetrically into a ``Dimension x Dimension`` matrix; entries
    that are ``None`` or empty are skipped and missing pairs stay at 0.  The
    strict upper triangle, in row-major order, is the condensed distance
    vector given to ``hierarchy.linkage``.

    :return: the linkage matrix, also stored in ``self.Tree``
    """
    size = self.Dimension
    Matrix = np.zeros((size, size))
    for line in self.ccTable:
        # ``!=`` instead of ``is not``: identity comparison against an int
        # literal only works by accident of small-int caching.
        if line is not None and len(line) != 0:
            Matrix[line[0], line[1]] = line[2]
            Matrix[line[1], line[0]] = line[2]

    Distances = Matrix[np.triu_indices(size, 1)].astype(float)
    self.Tree = hierarchy.linkage(Distances, 'average')
    return self.Tree
#Function added to plot the dendrogram in shell mode only.
#Still not functioning.
#Uncomment when needed.
def cluster_words(words, service_name, size):
    """Cluster words by string distance after stripping common tokens.

    Every stopword (HTTP verbs, a few metric-name fragments, ``service_name``
    and the separators ``-`` and ``_``) is removed from each word.  The
    pairwise distance ``1 - jaro_distance(a, b)`` over the cleaned words is
    built as a condensed vector, clustered with ``linkage`` and handed to
    ``cluster_of_size`` together with ``size``.
    """
    stopwords = ["GET", "POST", "total", "http-requests", service_name, "-", "_"]

    def strip_stopwords(word):
        for stopword in stopwords:
            word = word.replace(stopword, "")
        return word

    cleaned_words = [strip_stopwords(word) for word in words]

    def distance(coord):
        i, j = coord
        return 1 - jaro_distance(cleaned_words[i], cleaned_words[j])

    # one (i, j) column per pair in the strict upper triangle
    pair_indices = np.triu_indices(len(words), 1)
    condensed = np.apply_along_axis(distance, 0, pair_indices)
    return cluster_of_size(linkage(condensed), size)
def cluster_sequences(sequences, minsize=5):
    """
    Group similar sequences using average-linkage hierarchical clustering.

    Returns a triple: a pandas.DataFrame of the pairwise distances, the
    linkage matrix, and a list mapping each sequence index to its cluster id.
    An id of zero means the sequence was not assigned to any cluster.
    """
    matrix = distances(sequences)
    link_matrix = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(link_matrix))
    prev = link_matrix[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    next_label = 1
    for node in inner:
        qualifies = (
            node.dist > 0
            and prev / node.dist < 0.8
            and node.left.count >= minsize
            and node.right.count >= minsize
        )
        if qualifies:
            for seq_id in collect_ids(node.left):
                # Keep ids that were assigned by an earlier node
                if clusters[seq_id] == 0:
                    clusters[seq_id] = next_label
            next_label += 1
        prev = node.dist
    # The rightmost subtree is deliberately left unlabelled after the loop;
    # per the original authors it never held true novel sequences in their
    # experiments.
    return pd.DataFrame(matrix), link_matrix, clusters
def get_cell_data(n=50, seed=0):
    """Load the cell data and cluster a seeded random sample of ``n`` rows.

    Seeds NumPy's global RNG, loads ``./data/cells_data.npy``, draws ``n``
    distinct row indices and builds a Ward linkage on the Euclidean distances
    between the sampled rows.

    Returns ``(cells_data, Z, D)``: the full loaded array, the linkage matrix
    and the condensed distance vector of the sample.
    """
    np.random.seed(seed)
    cells_data = np.load('./data/cells_data.npy')
    picked_rows = np.random.choice(cells_data.shape[0], n, replace=False)
    sample = cells_data[picked_rows, :]
    D = pdist(sample, 'euclidean')
    return cells_data, linkage(D, 'ward'), D
def get_random_data(n=50, seed=0):
    """Generate ``n`` distinct random integers below 10000 and cluster them.

    Seeds NumPy's global RNG, draws an ``(n, 1)`` array without replacement
    and builds a Ward linkage on the Euclidean distances.

    Returns ``(data, Z, D)``: the sample, the linkage matrix and the
    condensed distance vector.
    """
    np.random.seed(seed)
    data = np.random.choice(10000, (n, 1), replace=False)
    D = pdist(data, 'euclidean')
    return data, linkage(D, 'ward'), D
def plotDend(esd, filename=None):
    """Summary
    Draw the dendrogram of an electrostatic similarity result.

    Parameters
    ----------
    esd : ElecSimilarity class
        Object providing ``esd.esd`` (the data passed to ``cluster.linkage``)
        and ``esd.ids`` (the leaf labels).
    filename : str, optional
        When given, the figure is saved to this path.  When omitted, the
        figure is built but not written to disk.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(sharey=True)
    link_matrix = cluster.linkage(esd.esd)
    cluster.dendrogram(
        link_matrix,
        labels=esd.ids,
        leaf_rotation=90.,  # vertical x axis labels
        leaf_font_size=8.,  # x axis label font size
        ax=ax)
    plt.xlabel('Variants')
    plt.ylabel('ESD')
    plt.tight_layout()
    if filename is None:
        return
    fig.savefig(filename)
def build_clusters(predicted_scores, method='centroid'):
    """agglomerative clustering using predicted scores as distances

    For every document the pairwise distance between mentions ``i`` and ``j``
    is the mean of ``scores[i, j]`` and ``scores[j, i]``.

    Documents with fewer than two mentions are skipped and contribute no
    entry to the result: they have no pairwise distance, and ``linkage``
    raises on an empty distance vector.  Previously only empty documents
    were skipped, so a single-mention document aborted the whole run.

    Args:
        predicted_scores: predicted scores for all mentions in documents;
            each entry is indexed as ``scores[i, j]``
        method: methods for calculating distance between clusters
                look at scipy.cluster.hierarchy.linkage documentation
    Returns:
        clustering, min_score and max_score in predicted_scores
        (min_score/max_score keep their initial values 1e10 and 0 when no
        document was clustered)
    """
    print('building clusters')
    min_score = 1e10
    max_score = 0
    clustering = []
    for doc_id in tqdm(range(len(predicted_scores))):
        scores = predicted_scores[doc_id]
        n_mentions = len(scores)
        if n_mentions < 2:
            continue

        # condensed (upper-triangle, row-major) symmetrised distances
        distances = [
            (scores[i, j] + scores[j, i]) / 2
            for i in range(n_mentions)
            for j in range(i + 1, n_mentions)
        ]
        c = linkage(distances, method=method)
        clustering.append(c)

        merge_heights = c[:, 2]
        min_score = min(merge_heights.min(), min_score)
        max_score = max(merge_heights.max(), max_score)
    print('clusters are built: min_score: {} max_score: {}'.format(min_score, max_score))
    return clustering, min_score, max_score
def tree_from_linkage_matrix(linkage, leaf_labels):
    """ Form an ete3.Tree from hierarchical linkage matrix.

    Linkage should be the matrix returned by hierarchy.linkage.
    leaf_labels should be a vector of names for the nodes
    corresponding to the clustered items. Internal nodes will be
    named node0, node1, etc, in the order in which the
    clusters they represent were formed.

    returns: new Tree
    """
    # NOTE(review): only the signature and docstring are present here; as
    # written the function returns None.  The body described by the docstring
    # is not visible in this excerpt -- confirm against the original source.
def cluster(target_sequence_ids, fasta_filename, method='average'):
    """ Hierarchically cluster sequences by alignment-based distance.

    Each id in target_sequence_ids is looked up in the dictionary built
    from fasta_filename to obtain its sequence.

    Sequences may differ in length, so plain Hamming distance is not
    applicable.  Instead every pair is globally aligned, scoring 0 for a
    match, -1 for a mismatch and -1.5 for opening or extending a gap;
    the distance is -1.0 times the alignment score.

    method is forwarded to hierarchy.linkage; the default 'average'
    corresponds to UPGMA.

    Returns the N x N distance array (only the upper triangle is filled)
    and the linkage matrix.
    """
    def alignment_distance(seq1, seq2):
        # globalms arguments: seq1, seq2, match, mismatch, open, extend
        score = pairwise2.align.globalms(
            seq1, seq2, 0, -1, -1.5, -1.5, score_only=True)
        return -1.0 * score

    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N, N))

    # fill in the upper triangle
    for i, seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        for j, seqid2 in enumerate(target_sequence_ids[i + 1:], start=i + 1):
            distances[i, j] = alignment_distance(seq1, sequences[seqid2])

    # convert to the condensed form expected by the scipy clustering
    # routines; checks are disabled because the matrix is not symmetric
    condensed = squareform(distances, checks=False)
    return distances, hierarchy.linkage(condensed, method)
def hierarchy(data, axis, method, metric):
    """Return the linkage of ``data`` as a DataFrame, one row per merge.

    When ``axis == 'columns'`` the frame is transposed first, so columns are
    clustered.  Rows of the result are indexed ``n .. 2n - 2`` (``n`` being
    the number of clustered items) and the columns are ``child1``,
    ``child2``, ``distance`` and ``size``; all but ``distance`` are cast to
    int.
    """
    frame = data.transpose() if axis == 'columns' else data
    n_items = len(frame.index)
    merges = linkage(frame, method=method, metric=metric)
    result = pd.DataFrame(
        merges,
        columns=['child1', 'child2', 'distance', 'size'],
        index=range(n_items, 2 * n_items - 1))
    return result.astype({'child1': int, 'child2': int, 'size': int})
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    """Reorder the rows and/or columns of ``df`` by hierarchical clustering.

    For each enabled axis, pairwise distances are computed with ``metric``,
    clustered with ``method``, and the frame is reordered to follow the
    dendrogram's leaf order.  Columns are clustered after rows, on the
    already row-reordered frame.

    Returns ``(df, row_linkmat, col_linkmat)``; a linkage is ``None`` when
    its axis was not clustered.
    """
    def reorder_rows(frame):
        link = hier.linkage(dist.pdist(frame, metric), method)
        return frame.iloc[hier.leaves_list(link), :], link

    row_linkmat = col_linkmat = None
    if row:
        df, row_linkmat = reorder_rows(df)
    if column:
        transposed, col_linkmat = reorder_rows(df.T)
        df = transposed.T
    return df, row_linkmat, col_linkmat
def docv_centroid_order_idx(meta_clusters):
    """Return the dendrogram leaf order of ``meta_clusters`` rows.

    Rows are clustered with average linkage on their pairwise cosine
    distances.

    The previous version passed the square ``cdist`` matrix straight to
    ``hierarchy.linkage``.  SciPy treats any 2-D input as observation
    vectors, so the rows of the distance matrix were clustered by Euclidean
    distance instead of the vectors by cosine distance.  The condensed
    vector from ``pdist`` is what ``linkage`` expects for precomputed
    distances.

    :param meta_clusters: 2-D array-like with one vector per row
    :return: list of row indices in dendrogram leaf order
    """
    # Local import: pdist is not otherwise used by this block.
    from scipy.spatial.distance import pdist

    condensed = pdist(meta_clusters, metric='cosine')
    # Compute the linkage and the order
    linkage = hierarchy.linkage(condensed, method='average')
    d_idx = hierarchy.dendrogram(linkage, no_plot=True)["leaves"]
    return d_idx
def cluster_2d_array_rows(array_2d,
                          linkage_method='average',
                          distance_function='euclidean'):
    """
    Order the rows of array_2d by hierarchical clustering.
    Arguments:
        array_2d (array): (n_rows, n_columns)
        linkage_method (str): method name accepted by
            scipy.cluster.hierarchy.linkage
        distance_function (str | callable): metric accepted by
            scipy.cluster.hierarchy.linkage
    Returns:
        array: (n_rows); row indices in dendrogram leaf order
    """
    link_matrix = linkage(
        array_2d, method=linkage_method, metric=distance_function)
    leaf_order = dendrogram(link_matrix, no_plot=True)['leaves']
    return array(leaf_order)
def __init__(self):
    """
    Start with no linkage, no series and no plotting state.

    The ``linkage`` attribute follows the SciPy convention, see
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html:
    A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are
    combined to form cluster n + i. A cluster with an index less than n corresponds to one of the original
    observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value
    Z[i, 3] represents the number of original observations in the newly formed cluster.
    """
    # Targets are assigned left to right, matching the original order.
    self.linkage = self.series = self._series_y = self.ts_height_factor = None
def maxnode(self):
    """Return the highest node index: series count - 1 plus the number of merges."""
    n_series = len(self.series)
    n_merges = len(self.linkage)
    return n_series - 1 + n_merges
def get_linkage(self, node):
    """Return the linkage entry that created ``node``, or None for a leaf.

    Nodes below ``len(self.series)`` are original series and have no linkage
    entry; node ``len(self.series) + i`` maps to ``self.linkage[i]``.
    """
    n_series = len(self.series)
    if node < n_series:
        return None
    return self.linkage[int(node - n_series)]
def fit(self, series, *args, **kwargs):
    """Fit the wrapped model while recording every merge in ``self.linkage``.

    A temporary ``merge_hook`` is installed on ``self._model``.  For each
    merge it appends ``(from_node, to_node, distance, 0)`` to
    ``self.linkage``, where leaves are numbered ``0 .. len(series) - 1`` and
    the i-th merge creates node ``len(series) + i``.  A hook that was already
    present on the model is still called after the bookkeeping.

    The previous hook is restored in a ``finally`` block, so the model is not
    left with this instance's closure installed when fitting raises.

    :param series: the series to cluster; stored in ``self.series``
    :param args: extra positional arguments forwarded to ``self._model.fit``
    :param kwargs: extra keyword arguments forwarded to ``self._model.fit``
    :return: whatever ``self._model.fit`` returns
    :raises Exception: from the hook, when a series that was already merged
        away is merged again
    """
    self.series = series
    self.linkage = []
    # Maps an original series index to the node currently representing it;
    # None marks a series that has been merged into another one.
    new_nodes = {i: i for i in range(len(series))}
    # A falsy hook is treated as "no hook" and restored as None.
    old_merge_hook = self._model.merge_hook or None

    def merge_hook(from_idx, to_idx, distance):
        new_idx = len(self.series) + len(self.linkage)
        if new_nodes[from_idx] is None:
            raise Exception('Trying to merge series that is already merged')
        self.linkage.append((new_nodes[from_idx], new_nodes[to_idx], distance, 0))
        new_nodes[to_idx] = new_idx
        new_nodes[from_idx] = None
        if old_merge_hook:
            old_merge_hook(from_idx, to_idx, distance)

    self._model.merge_hook = merge_hook
    try:
        return self._model.fit(series, *args, **kwargs)
    finally:
        self._model.merge_hook = old_merge_hook
def __init__(self, dists_fun, dists_options):
    """Hierarchical clustering backed by the Scipy linkage function.

    Per the original authors this is the same algorithm as Hierarchical but
    roughly 10 times faster, with fewer options to steer the clustering
    (e.g. weights cannot be given).  The entire distance matrix is still
    computed first, so it is not ideal for extremely large data sets.

    :param dists_fun: stored as ``self.dists_fun``
    :param dists_options: stored as ``self.dists_options``
    """
    super().__init__()
    self.dists_fun, self.dists_options = dists_fun, dists_options
def example_naivehierarchicalclustering():
    """Example: complete-linkage clustering of five short series using DTW.

    Computes the DTW distance matrix, copies its upper triangle into a
    condensed vector, clusters it with SciPy and plots the (slightly offset)
    series above the resulting dendrogram.

    For a more efficient approach, check:

    Mueen, A and Keogh, E, Extracting Optimal Performance from Dynamic Time Warping,
    Tutorial, KDD 2016
    http://www.cs.unm.edu/~mueen/DTW.pdf

    :return: None
    """
    series = [
        np.array([0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0]),
        np.array([0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0]),
        np.array([2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
        np.array([2.0, 1.0, 1.0, 0.0, 0.0, 2.0, 3.0]),
        np.array([4.0, 2.0, 1.0, 0.0, 0.0, 1.0, 3.0])
    ]
    n_series = len(series)

    dists = dtw.distance_matrix_fast(series)
    print("Distance matrix:\n{}".format(dists))

    # Copy each row's part of the upper triangle into the condensed vector.
    dists_cond = np.zeros(size_cond(n_series))
    start = 0
    for row in range(n_series - 1):
        width = n_series - row - 1
        dists_cond[start:start + width] = dists[row, row + 1:]
        start += width

    z = linkage(dists_cond, method='complete', metric='euclidean')
    print(z)

    fig, axes = plt.subplots(2, 1, figsize=(8, 3))
    series_ax, dendro_ax = axes[0], axes[1]
    for idx, serie in enumerate(series):
        # Shift each series in place so overlapping curves stay visible.
        serie += idx * 0.1
        label_x = 0 + 0.15 * (-1)**idx * idx
        label_y = serie[0] + 0.15 * idx
        series_ax.plot(serie, label=str(idx))
        series_ax.text(label_x, label_y, idx)
        series_ax.add_line(Line2D([0, label_x], [serie[0], label_y],
                                  linewidth=1, color='gray'))
    series_ax.legend(loc=1)
    dendrogram(z, ax=dendro_ax)
    plt.show(block=True)