from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

def formClusters(dists, link, distance):
"""Form clusters based on hierarchical clustering of input distance matrix
with linkage type and cutoff distance
:param dists: numpy matrix of distances
:param link: linkage type for hierarchical clustering
:param distance: distance at which to cut into clusters
:return: list of cluster assignments
"""
    # Convert the square distance matrix to the condensed form linkage expects
    dists = squareform(dists)
# Compute linkage
links = linkage(dists, link)
# import matplotlib.pyplot as plt
# from scipy.cluster import hierarchy
# plt.figure(figsize=(15,5))
# p = hierarchy.dendrogram(links)
# Break into clusters based on cutoff
clusters = fcluster(links, distance, criterion='distance')
return clusters
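A minimal usage sketch (the toy points and cutoff below are invented for illustration): build the square distance matrix the docstring asks for, then cut at distance 1.0.

import numpy as np
from scipy.spatial.distance import pdist, squareform

points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
square_dists = squareform(pdist(points))  # square matrix, as the docstring expects
labels = formClusters(square_dists, 'average', 1.0)
print(labels)  # e.g. [1 1 2 2]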
Python fcluster() example source code
import networkx as nx
import numpy
from collections import defaultdict
from scipy.spatial import distance
from scipy.cluster import hierarchy

def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    # dict() keeps this working on networkx 2.x, where an iterator is returned
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    # assumes the nodes of G are the integers 0..len(G)-1
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
# Create hierarchical cluster
Y=distance.squareform(distances)
Z=hierarchy.complete(Y) # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
membership=list(hierarchy.fcluster(Z,t=1.15))
# Create collection of lists for blockmodel
partition=defaultdict(list)
for n,p in zip(list(range(len(G))),membership):
partition[p].append(n)
return list(partition.values())
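A quick driving sketch; karate_club_graph is just a convenient built-in test graph whose nodes are already the integers 0..33 that the distance matrix is indexed with.

G = nx.karate_club_graph()
blocks = create_hc(G)
print('%d blocks' % len(blocks))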
import numpy as np
import scipy.cluster.hierarchy as hac

def fcluster(df, Z, n_groups, n_clusters):
    """Cut the linkage matrix Z into n_clusters flat clusters and attach each
    group's cluster id to the dataframe's MultiIndex.
    """
    # create flat clusters, i.e. cut the dendrogram into at most n_clusters clusters
    # (the depth argument is ignored when criterion='maxclust')
    T = hac.fcluster(Z, criterion='maxclust', depth=2, t=n_clusters)
    # add cluster id to original dataframe
    df['cluster_id'] = np.nan
# group is either days (1-365) or weeks (1-52)
#for d in df.index.get_level_values('group').unique():
for g in range(1, n_groups+1):
        # T[g-1] because df.index is e.g. 1-365 (g) while T is 0-indexed (0...364)
        df.loc[g, 'cluster_id'] = T[g-1]  # .ix was removed from pandas; .loc replaces it
# add the cluster id to the index
df.set_index(['cluster_id'], append=True, inplace=True)
# set cluster id as first index level for easier looping through cluster_ids
df.index = df.index.swaplevel(0, 'cluster_id')
# just to have datetime at the last level of the multiindex df
df.index = df.index.swaplevel('datetime', 'group')
return df
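A hypothetical invocation sketch: the tiny frame and the one-feature-per-group matrix below are stand-ins for the real load data (np and hac as imported above).

import pandas as pd

idx = pd.MultiIndex.from_arrays(
    [[1, 1, 2, 2, 3, 3, 4, 4],
     pd.date_range('2017-01-01', periods=8, freq='D')],
    names=['group', 'datetime'])
df = pd.DataFrame({'load': np.arange(8.0)}, index=idx)
Z = hac.linkage(np.array([[0.0], [0.1], [5.0], [5.1]]), method='ward')  # one row per group
df = fcluster(df, Z, n_groups=4, n_clusters=2)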
methods.py (project: South-African-Heart-Disease-data-analysis-using-python, author: khushi4tiwari)
def hierarchicalClustering(X,y,Maxclust, C, Method = 'single', Metric = 'euclidean'):
# Perform hierarchical/agglomerative clustering on data matrix
Z = linkage(X, method=Method, metric=Metric)
# Compute and display clusters by thresholding the dendrogram
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
figure()
#clusterplot(X, cls.reshape(cls.shape[0],1), y=y)
clusterPlot(X, cls.reshape(cls.shape[0],1), Maxclust, C, y=y)
# Display dendrogram
max_display_levels=7
figure()
dendrogram(Z, truncate_mode='level', p=max_display_levels, color_threshold=0.5*np.max(Z[:,2]))
title("Dendrgram of the Hierarchical Clustering")
show()
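A call sketch, assuming clusterPlot is the plotting helper this project imports alongside linkage, fcluster, figure and dendrogram; X, y and C form an invented toy dataset.

import numpy as np

X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 5])
y = np.array([0] * 20 + [1] * 20)
hierarchicalClustering(X, y, Maxclust=2, C=2, Method='complete', Metric='euclidean')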
def thrEstimation(self):
    x = 0.00
    dx = 0.05
    countsList = []
    x_list = []
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]  # .iteritems() is Python 2 only
        countsList.append(counter[Best])
        x_list.append(x)  # record x before incrementing so it lines up with countsList
        x += dx
dy = np.diff(countsList)
for a, b in zip (x_list, dy):
if b == max(dy):
return a
def get_cluster_assignments(sim_matrix, parameters):
"""
(np.array, list of int) -> list of int
sim_matrix: list of list of float -- similarity matrix between exemplars
parameters: list of parameters in the format ["method:method_name",
"algo:algo_name", "k:num_clusters", "damping:damping"]
where order doesn't matter
(k and damping only relevant for certain clustering methods)
the possible values for each parameter are listed in the
function below.
Returns a list of integers. The integer at each index of the list corresponds
to the cluster number of the exemplar at the same index in sim_matrix.
"""
    algorithm = next((re.split(':', f)[1] for f in parameters if f[:4] == 'algo'), 'ap')
    # from { 'hierarchical', 'kmeans', 'ap', 'ward' }
    method = next((re.split(':', f)[1] for f in parameters if f[:6] == 'method'), 'single')
    # from {'single', 'complete', 'average'} (only relevant for hierarchical clustering)
    kMk = next((int(re.split(':', f)[1]) for f in parameters if f[:2] == 'k:'), 8)
    # any integer <= the data length
    # f[:7] (not f[:4]) is needed to match the full 'damping' prefix
    damping = next((float(re.split(':', f)[1]) for f in parameters if f[:7] == 'damping'), 0.5)
    # only relevant for AP -- in [0.5, 1]
#
if algorithm == 'hierarchical':
clustering = hierarchy.linkage(sim_matrix, method)
k = get_k(clustering, 20)
cluster_assignments = hierarchy.fcluster(clustering, k, criterion = 'maxclust')-1
elif algorithm == 'kmeans':
cluster_assignments = KMeans(n_clusters = kMk).fit_predict(sim_matrix)
elif algorithm == 'ap':
        # pass the parsed damping through; it was previously parsed but unused
        cluster_assignments = AffinityPropagation(damping=damping).fit_predict(sim_matrix)
elif algorithm == 'ward':
clustering = hierarchy.ward(sim_matrix)
k = get_k(clustering, 20)
cluster_assignments = hierarchy.fcluster(clustering, k, criterion = 'maxclust')-1
return cluster_assignments
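A usage sketch. The hierarchical and ward branches also need the project's own get_k elbow heuristic in scope, so the kmeans branch is the easiest to demonstrate; re and the scikit-learn classes are assumed imported by the host module.

import numpy as np

sim_matrix = np.random.rand(12, 12)
sim_matrix = (sim_matrix + sim_matrix.T) / 2  # symmetrise the toy similarity matrix
labels = get_cluster_assignments(sim_matrix, ['algo:kmeans', 'k:3'])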
def single_silhouette_dendrogram(dist_matrix, Z, threshold, mode='clusters',
method='single', sample_names=None):
"""Compute the average silhouette at a given threshold.
Parameters
----------
dist_matrix : array-like
Precomputed distance matrix between points.
Z : array-like
Linkage matrix, results of scipy.cluster.hierarchy.linkage.
threshold : float
Specifies where to cut the dendrogram.
mode : ('clusters', 'thresholds'), optional
Choose what to visualise on the x-axis.
Returns
-------
    x : float
        Based on mode, it contains either the number of clusters or the threshold.
silhouette_avg : float
The average silhouette.
"""
cluster_labels = fcluster(Z, threshold, 'distance')
nclusts = np.unique(cluster_labels).shape[0]
save_results_clusters("res_{}_{:03d}_clust.csv".format(method, nclusts),
sample_names, cluster_labels)
try:
silhouette_list = silhouette_samples(dist_matrix, cluster_labels,
metric="precomputed")
silhouette_avg = np.mean(silhouette_list)
x = max(cluster_labels) if mode == 'clusters' else threshold
except ValueError as e:
if max(cluster_labels) == 1:
x = 1 if mode == 'clusters' else threshold
silhouette_avg = 0
else:
raise(e)
return x, silhouette_avg
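A hypothetical driver; save_results_clusters (which writes the CSV named inside the function) is assumed importable from the same package.

import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage

X = np.random.rand(20, 3)
condensed = pdist(X)
Z = linkage(condensed, method='single')
x, sil = single_silhouette_dendrogram(squareform(condensed), Z, threshold=0.4,
                                      sample_names=['s%d' % i for i in range(20)])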
def anglesCluster(angles):
    # this function uses hierarchical clustering to find the clusters of a set of angle values
    Dists = dist.pdist(angles, angleDiff)
    # Dists is already a condensed distance vector, so linkage needs no metric argument
    linkageMatrix = hier.linkage(Dists, method='single')
    C = hier.fcluster(linkageMatrix, 5, 'maxclust')
    return C
def clustering(partSeqList, partData):
    print('clustering for the separated dataset')
#simiMatrix = simiMatrixCal(partData)
'''Invoke the clustering method in library'''
data_dist = pdist(partData,metric=distCalculate)
Z = linkage(data_dist, 'complete')
clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
print ('there are altogether %d clusters in this initial clustering'%(len(np.unique(clusterLabels))))
clusNum = len(set(clusterLabels))
instIndexPerClus=[[] for i in range(clusNum)] #initialization
for i in range(len(clusterLabels)):
lab = clusterLabels[i]-1
instIndexPerClus[lab].append(partSeqList[i])
return clusterLabels,instIndexPerClus
def clustering(partSeqList, partData):
'''Invoke the clustering method in library'''
    print('clustering for the separated dataset')
data_dist = pdist(partData,metric=distCalculate)
Z = linkage(data_dist, 'complete')
clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
print('there are altogether %d clusters in this initial clustering'%(len(np.unique(clusterLabels))))
clusNum = len(set(clusterLabels))
instIndexPerClus=[[] for i in range(clusNum)] #initialization
for i in range(len(clusterLabels)):
lab = clusterLabels[i]-1
instIndexPerClus[lab].append(partSeqList[i])
return clusterLabels,instIndexPerClus
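Both variants above lean on module globals; a sketch of the setup they expect, with stand-in values (distCalculate may be any metric accepted by pdist).

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

para = {'max_d': 0.5}        # stand-in distance cutoff
distCalculate = 'euclidean'  # stand-in metric
partData = np.random.rand(10, 4)
labels, index_per_cluster = clustering(list(range(10)), partData)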
def _transactions_fuzzy_matching(transactions, match):
"""
Runs fuzzy matching on the transactions, by applying a complete linkage
hierarchical clustering algorithm to the set of different itemsets in the
transactions. For clustering, the similarity ratio as given by
fuzzywuzzy.ratio is used as the distance measure
Input:
transactions: list of tuples representing items on each transaction
match: minimum similarity ratio (0 to 100) for clustering
Output:
transactions: new version of the transactions, where each item has been
replaced by the first item on its corresponding cluster
word_clusters: dictionary that maps the cluster for each item
in the transactions
"""
words = set([])
for transaction in transactions:
words |= set(transaction)
words = sorted(words)
    # Levenshtein.ratio is in [0, 1]; scale by 100 so the distance matches the
    # 0-100 similarity ratio described in the docstring
    l = [((a, b), 100 - 100 * Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
d = [value for pair, value in l]
r = linkage(d, 'complete')
clusters_index = fcluster(r, 100-match, "distance")
clusters = {}
for obs_i, cluster_i in enumerate(clusters_index):
if cluster_i in clusters:
clusters[cluster_i].append(words[obs_i])
else:
clusters[cluster_i] = [words[obs_i]]
word_clusters = {word: clusters[clusters_index[i]]
for i, word in enumerate(words)}
new_transactions = []
for transaction in transactions:
new_transaction = tuple(set(([word_clusters[word][0]
for word in transaction])))
new_transactions.append(new_transaction)
return new_transactions, word_clusters
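A small round-trip sketch with invented item names; Levenshtein, combinations, linkage and fcluster are assumed imported by the host module.

transactions = [('apple', 'bananna'), ('aple', 'banana'), ('cherry',)]
new_tx, word_clusters = _transactions_fuzzy_matching(transactions, match=80)
print(new_tx)  # near-duplicate spellings collapse onto one representative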
def chc(X, K, params=()):
pnames = ['linkage_method', 'distance']
dflts = [ 'ward', 'euclidean']
if isinstance(params, np.ndarray):
paramsloc = params.tolist()
else:
paramsloc = params
(linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)
Z = sphc.linkage(X, method=linkage_method, metric=distance)
C = sphc.fcluster(Z, K, criterion='maxclust')
return clustVec2partMat(C, K)
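A call sketch; ds.resolveargumentpairs and clustVec2partMat are helpers from the surrounding package, so this only runs inside it.

import numpy as np

X = np.random.rand(30, 4)  # 30 samples, 4 features
U = chc(X, K=3)            # partition matrix with one column per cluster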
# Other related functions
def cluster_helices(helices, cluster_distance=12.0):
""" Clusters helices according to the minimum distance between the line segments representing their backbone.
Notes
-----
    Each helix is represented as a line segment joining the CA of its first Residue to the CA of its final Residue.
    The minimal distance between pairwise line segments is calculated and stored in a condensed distance matrix.
    This is clustered using the 'single' linkage method
    (all members of cluster i are < cluster_distance away from at least one other member of cluster i).
Helices belonging to the same cluster are grouped together as values of the returned cluster_dict.
Parameters
----------
helices: Assembly
cluster_distance: float
Returns
-------
cluster_dict: dict
Keys: int
cluster number
Values: [Polymer]
"""
condensed_distance_matrix = []
for h1, h2 in itertools.combinations(helices, 2):
md = minimal_distance_between_lines(h1[0]['CA']._vector, h1[-1]['CA']._vector,
h2[0]['CA']._vector, h2[-1]['CA']._vector, segments=True)
condensed_distance_matrix.append(md)
z = linkage(condensed_distance_matrix, method='single')
clusters = fcluster(z, t=cluster_distance, criterion='distance')
cluster_dict = {}
for h, k in zip(helices, clusters):
if k not in cluster_dict:
cluster_dict[k] = [h]
else:
cluster_dict[k].append(h)
return cluster_dict
def checkMultiplicity(self, thr):
    FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
    counter = collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]  # .iteritems() is Python 2 only
print('You are clustering with a threshold of %s'%(thr))
print('The biggest cluster contains %s datasets from a total of %s'%(counter[Best], len(self.labelList)))
def completenessEstimation(self):
    x = 0.00
    dx = 0.05
    # the original condition (x > 1) could never hold with x starting at 0,
    # so the loop body never ran; x < 1 plus an increment appears to be the intent
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        x += dx
def minimalForCompleteness(self):
print("Running estimator for minimal threshold for completeness")
labels=self.createLabels()
x = 0.00
dx = 0.05
countsList = {}
x_list = []
while x < 1:
Arrays= {}
FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
counter=collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
toProcess=[Best]
y=0
for cluster, filename in zip(FlatC,labels):
if cluster in toProcess:
hklFile = any_reflection_file(filename)
b= hklFile.as_miller_arrays()
for column in b:
if column.is_xray_intensity_array():
Arrays[y]=column
break
y+=1
        try:
            Arr = Arrays[0]
        except KeyError:
            # no intensity array was collected at this threshold: record zero
            # completeness and move on (countsList is a dict keyed by x)
            countsList[x] = 0
            x += dx
            continue
        for label in range(1, y):
            try:
                Arr = Arr.concatenate(Arrays[label])
            except Exception:
                pass
        countsList[x] = Arr.completeness()
x+= dx
    # return the smallest threshold whose completeness exceeds 0.98
    L = [key for key in countsList if countsList[key] > 0.98]
    L.sort()
    return L[0] if L else None  # guard: None when no threshold reaches 0.98
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
method='ap'):
"""Define clusters given the similarity matrix and the threshold."""
n, labels = connected_components(similarity_matrix, directed=False)
prev_max_clust = 0
print("connected components: %d" % n)
clusters = labels.copy()
if method == 'dbscan':
ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference=None)  # scikit-learn expects a numeric preference; None defaults to the median similarity
for i in range(n):
idxs = np.where(labels == i)[0]
if idxs.shape[0] > 1:
sm = similarity_matrix[idxs][:, idxs]
sm += sm.T + scipy.sparse.eye(sm.shape[0])
# Hierarchical clustering
if method == 'hc':
dists = squareform(1 - sm.toarray())
links = fastcluster.linkage(dists, method='ward')
try:
clusters_ = fcluster(links, threshold, 'distance')
except ValueError as err:
logging.critical(err)
clusters_ = np.zeros(1, dtype=int)
# DBSCAN
elif method == 'dbscan':
db = ap.fit(1. - sm.toarray())
# Number of clusters in labels, ignoring noise if present.
clusters_ = db.labels_
# n_clusters_ = len(set(clusters_)) - int(0 in clusters_)
# AffinityPropagation
# ap = AffinityPropagation(affinity='precomputed')
elif method == 'ap':
db = ap.fit(sm)
clusters_ = db.labels_
else:
raise ValueError("clustering method %s unknown" % method)
if np.min(clusters_) == 0:
clusters_ += 1
clusters_ += prev_max_clust
clusters[idxs] = clusters_
prev_max_clust = max(clusters_)
else: # connected component contains just 1 element
prev_max_clust += 1
clusters[idxs] = prev_max_clust
return np.array(extra.flatten(clusters))
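A sketch for the 'hc' branch on a toy sparse similarity matrix; fastcluster, scipy's connected_components/squareform/fcluster and the package's extra.flatten helper are assumed in scope.

import numpy as np
import scipy.sparse

sim = scipy.sparse.csr_matrix(np.array([[0, 1, 0, 0],
                                        [0, 0, 0, 0],
                                        [0, 0, 0, 1],
                                        [0, 0, 0, 0]], dtype=float))
labels = define_clusts(sim, threshold=0.5, method='hc')  # two 2-element clusters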
def community_detection(self, cutoff_threshold=0.55):
    '''
    Find communities of features using pairwise differences of the solutions acquired in the main LP step.
    '''
svm_solution = self._svm_coef
abs_svm_sol = np.abs(svm_solution)
om = self._omegas
mins = om[:,0,:]
maxs = om[:,1,:]
abs_mins = np.abs(mins)
abs_maxs = np.abs(maxs)
# Aggregate min and max solution values to obtain the absolute variation
lower_variation = abs_svm_sol - abs_mins
upper_variation = abs_maxs - abs_svm_sol
variation = np.abs(lower_variation) + np.abs(upper_variation)
# add up lower triangular matrix to upper one
collapsed_variation = np.triu(variation)+np.tril(variation).T
np.fill_diagonal(collapsed_variation, 0)
#collapsed_variation = pd.DataFrame(collapsed_variation)
# Create distance matrix
dist_mat = np.triu(collapsed_variation).T + collapsed_variation
# normalize
dist_mat = 1- dist_mat/np.max(dist_mat)
# get numpy array
#dist_mat = dist_mat.values[:]
# feature with itself has no distance
np.fill_diagonal(dist_mat,0)
        # squareform here converts the square matrix to the condensed form scipy expects
        dist_mat_condensed = squareform(dist_mat)
        # Execute clustering
        link = linkage(dist_mat_condensed, method="ward")
# Set cutoff at which threshold the linkage gets flattened (clustering)
RATIO = cutoff_threshold
threshold = RATIO * np.max(link[:,2]) # max of branch lengths (distances)
feature_clustering = fcluster(link,threshold,criterion="distance")
return feature_clustering, link
def cluster_contigs(outbase, d, bin_chr, bin_position, threads=4, frac=0.66,
iterations=30, maxnumchr=500, seed=0, dpi=100, minchr=3, nchr=0):
"""Estimate number of chromosomes and assign contigs to chromosomes"""
logger(" Estimating number of chromosomes...")
    prng = np.random  # note: the seed argument goes unused; np.random.RandomState(seed) was commented out
Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method="ward")
# estimate chromosome number if not specified
if not nchr:
dZ = []
n = d.shape[0]
subset = int(round(n * frac))
if subset < maxnumchr:
maxnumchr = subset-1
        ''' 24 vs 31s on 4 cores - not worth it, as the large matrix would have to be pickled and passed to each worker
args = (d[prng.permutation(n)[:subset], :][:, prng.permutation(n)[:subset]] for i in range(iterations))
p = Pool(threads)
for i, _dZ in enumerate(p.imap_unordered(worker, args), 1):
sys.stderr.write(' %s / %s \r'%(i, iterations))
dZ.append(_dZ)
'''
for i in range(iterations):
sys.stderr.write(' %s / %s \r'%(i+1, iterations))
selection = prng.permutation(n)[:subset]
iZ = fastcluster.linkage(d[selection, :][:, selection][np.triu_indices(subset, 1)], method="ward")
dZ.append(np.diff(iZ[:, 2]))
#'''
# get number of chromosomes
mean_step_len = np.mean(np.array(dZ), 0)
nchr = np.argmax(mean_step_len[-minchr+1::-1][:maxnumchr]) + minchr
logger(" number of chromosomes: %s"%nchr)
plt.figure(figsize = (15, 5))
plt.title("Number of clusters estimation")
plt.plot(np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr+1:], 'b')
plt.gca().invert_xaxis()
plt.xlabel('number of clusters')
plt.savefig(outbase+'.avg_step_len.svg', dpi=dpi)
plt.figure()
plt.title("Number of clusters estimation")
plt.plot(np.arange(50, 1, -1), mean_step_len[-50+1:], 'b')
plt.gca().invert_xaxis()
plt.xlabel('number of clusters')
plt.savefig(outbase+'.avg_step_len_50.svg', dpi=dpi)
logger(" Assigning contigs to chromosomes...")
clusters = [[] for n in range(nchr)]
assignments = sch.fcluster(Z, t=nchr, criterion='maxclust')
for i, c in zip(assignments-1, bin_chr):
clusters[i].append(c)
return clusters
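A driving sketch with nchr given explicitly, which skips the estimation loop; logger is the host module's logging helper and the distance matrix here is synthetic.

import numpy as np

n = 60
d = np.random.rand(n, n)
d = (d + d.T) / 2
np.fill_diagonal(d, 0)
bin_chr = np.array(['contig_%d' % i for i in range(n)])
groups = cluster_contigs('out', d, bin_chr, bin_position=None, nchr=5)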
def merge(self, anomFlag, thr):
FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
Log = open(self.CurrentDir+'/.cc_cluster.log', 'a')
counter=collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]  # .iteritems() is Python 2 only
    Process = True
    # change checkboxes to standard variables
    if Process:
        ToProcess = [Best]
    else:
        # note: Clusters is undefined in this snippet, but the branch is unreachable while Process is True
        ToProcess = set(Clusters)
for key in ToProcess:
if counter[key]==1:
ToProcess = [x for x in ToProcess if x != key]
#Processing pipeline,
#Does all the XSCALE run
for x in ToProcess:
if [thr,x, anomFlag] not in self.alreadyDone:
os.mkdir(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s'%(float(thr),x, anomFlag))
Xscale=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/XSCALE.INP'%(float(thr),x, anomFlag), 'a')
Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),x,anomFlag ), 'a')
print('OUTPUT_FILE=scaled.hkl',file=Xscale)
print('MERGE= TRUE', file=Xscale)
print('pointless hklout clustered.mtz << eof', file=Pointless)
if anomFlag=='ano':
print('FRIEDEL\'S_LAW= FALSE', file=Xscale)
elif anomFlag=='no_ano':
print('FRIEDEL\'S_LAW= TRUE', file=Xscale)
Xscale.close()
Pointless.close()
for cluster, filename in zip(FlatC,self.labelList):
if cluster in ToProcess:
OUT = open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/XSCALE.INP'%(float(thr),cluster,anomFlag), 'a')
Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),cluster,anomFlag), 'a')
print ('INPUT_FILE= ../%s'%(filename), file=OUT)
#print ('INCLUDE_RESOLUTION_RANGE=20, 1.8', file=OUT)
print ('MINIMUM_I/SIGMA= 0', file=OUT)
print ('XDSIN ../%s'%(filename), file= Pointless)
OUT.close()
Pointless.close()
#optional run of XSCALE
newProcesses=[]
for x in ToProcess:
if [thr,x, anomFlag] not in self.alreadyDone:
plt.savefig(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/Dendrogram.png'%(float(thr),x,anomFlag))
P= subprocess.Popen('/opt/pxsoft/xds/vdefault/linux-x86_64/xscale_par',cwd=self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/'%(float(thr), x, anomFlag))
P.wait()
print('Cluster, %s , %s , %s'%(float(thr),x, anomFlag), file=Log)
Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),x,anomFlag), 'a')
print('COPY \n bg\n TOLERANCE 4 \n eof', file= Pointless)
Pointless.close()
            path = self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/launch_pointless.sh' % (float(thr), x, anomFlag)
            st = os.stat(path)  # st was undefined before; stat the script (named as above), then add execute bits
            os.chmod(path, st.st_mode | 0o111)
newProcesses.append([thr,x, anomFlag])