def debugplot(self, u = None):
    print('creating plot...')
    n = len(self.region['members'][0]) // 2
plt.figure(figsize=(6, n/2*4+1))
m = self.region['members']
d = self.region['maxdistance']
for i in range(n):
        plt.subplot(int(numpy.ceil(n / 2.)), 2, 1 + i)
j = i * 2
k = i * 2 + 1
plt.plot(m[:,j], m[:,k], 'x', color='b', ms=1)
plt.gca().add_artist(plt.Circle((m[0,j], m[0,k]), d, color='g', alpha=0.3))
if u is not None:
plt.plot(u[j], u[k], 's', color='r')
plt.gca().add_artist(plt.Circle((u[j], u[k]), d, color='r', alpha=0.3))
prefix='friends%s-%s_' % ('1' if self.jackknife else '', self.metric)
plt.savefig(prefix + 'cluster.pdf')
plt.close()
    print('creating plot... done')
def find_a_dominant_color(image):
# K-mean clustering to find the k most dominant color, from:
# http://stackoverflow.com/questions/3241929/python-find-dominant-most-common-color-in-an-image
n_clusters = 5
# Get image into a workable form
im = image.copy()
im = im.resize((150, 150)) # optional, to reduce time
    ar = np.asarray(im)  # scipy.misc.fromimage was removed in recent SciPy releases; np.asarray is the drop-in replacement
im_shape = ar.shape
    ar = ar.reshape(np.prod(im_shape[:2]), im_shape[2])  # scipy.product is a deprecated alias of np.prod
ar = np.float_(ar)
# Compute clusters
codes, dist = scipy.cluster.vq.kmeans(ar, n_clusters)
vecs, dist = scipy.cluster.vq.vq(ar, codes) # assign codes
    counts, bins = np.histogram(vecs, len(codes))  # count occurrences (scipy.histogram is a deprecated alias of np.histogram)
    # Get the indexes sorted from most frequent to least frequent
    sorted_idxs = np.argsort(counts)[::-1]
    # Get the color
    peak = codes[sorted_idxs[1]]  # get second most frequent color
return [int(i) for i in peak.tolist()] # list comprehension to quickly cast everything to int
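# Illustrative usage sketch for find_a_dominant_color (not part of the original
# snippet); it assumes Pillow is installed and 'photo.jpg' is a placeholder path.
def _demo_dominant_color(path='photo.jpg'):
    from PIL import Image
    img = Image.open(path).convert('RGB')   # the function expects a PIL image
    r, g, b = find_a_dominant_color(img)    # returns [R, G, B] as ints
    return '#%02x%02x%02x' % (r, g, b)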
def process_options(args):
options = argparser().parse_args(args)
if options.max_rank is not None and options.max_rank < 1:
raise ValueError('max-rank must be >= 1')
if options.eps <= 0.0:
raise ValueError('eps must be > 0')
wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
if options.normalize:
logging.info('normalize vectors to unit length')
wv.normalize()
words, vectors = wv.words(), wv.vectors()
if options.whiten:
logging.info('normalize features to unit variance')
vectors = scipy.cluster.vq.whiten(vectors)
return words, vectors, options
def write_fasta(file_list):
    '''
    Writes a fasta file for the cluster sequences from each bedfile.
    '''
    for infile in file_list:
        out_fa_file = infile + ".fa"
        # Create the output file and write one header/sequence pair per input line
        with open(infile, "r") as ifi, open(out_fa_file, "w") as of:
            for line in ifi:
                f_header, f_seq = fasta_header(line)
                of.write("{}\n{}\n".format(f_header, f_seq))
        print("done {}".format(out_fa_file))
def compute_group_overlap_score(ref_labels, pred_labels,
threshold_overlap_pred=0.5,
threshold_overlap_ref=0.5):
"""How well do the pred_labels explain the ref_labels?
A predicted cluster explains a reference cluster if it is contained within the reference
cluster with at least 50% (threshold_overlap_pred) of its points and these correspond
to at least 50% (threshold_overlap_ref) of the reference cluster.
"""
ref_unique, ref_counts = np.unique(ref_labels, return_counts=True)
ref_dict = dict(zip(ref_unique, ref_counts))
pred_unique, pred_counts = np.unique(pred_labels, return_counts=True)
pred_dict = dict(zip(pred_unique, pred_counts))
summary = []
for true in ref_unique:
sub_pred_unique, sub_pred_counts = np.unique(pred_labels[true == ref_labels], return_counts=True)
relative_overlaps_pred = [sub_pred_counts[i] / pred_dict[n] for i, n in enumerate(sub_pred_unique)]
relative_overlaps_ref = [sub_pred_counts[i] / ref_dict[true] for i, n in enumerate(sub_pred_unique)]
pred_best_index = np.argmax(relative_overlaps_pred)
summary.append(1 if (relative_overlaps_pred[pred_best_index] >= threshold_overlap_pred and
relative_overlaps_ref[pred_best_index] >= threshold_overlap_ref)
else 0)
# print(true, sub_pred_unique[pred_best_index], relative_overlaps_pred[pred_best_index],
# relative_overlaps_ref[pred_best_index], summary[-1])
return sum(summary)/len(summary)
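# Minimal sketch of compute_group_overlap_score on toy labels (illustrative only;
# the values below are made up). With the default thresholds, a reference cluster
# counts as explained if one predicted cluster covers at least 50% of it and that
# predicted cluster lies at least 50% inside the reference cluster.
def _demo_overlap_score():
    import numpy as np
    ref = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    pred = np.array([5, 5, 5, 5, 7, 7, 7, 8])
    return compute_group_overlap_score(ref, pred)   # -> 1.0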
def hierarch_cluster(M):
"""Cluster matrix using hierarchical clustering.
Parameters
----------
M : np.ndarray
Matrix, for example, distance matrix.
Returns
-------
Mclus : np.ndarray
Clustered matrix.
indices : np.ndarray
Indices used to cluster the matrix.
"""
import scipy as sp
import scipy.cluster
link = sp.cluster.hierarchy.linkage(M)
indices = sp.cluster.hierarchy.leaves_list(link)
Mclus = np.array(M[:, indices])
Mclus = Mclus[indices, :]
if False:
pl.matshow(Mclus)
pl.colorbar()
return Mclus, indices
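# Small usage sketch for hierarch_cluster (illustrative only): reorder a toy
# distance matrix so that similar rows/columns end up next to each other.
def _demo_hierarch_cluster():
    import numpy as np
    M = np.array([[0.0, 0.9, 0.1],
                  [0.9, 0.0, 0.8],
                  [0.1, 0.8, 0.0]])
    Mclus, indices = hierarch_cluster(M)
    return Mclus, indices   # indices is a permutation of [0, 1, 2]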
def get_leafs(self, use_labels=False):
""" Use to retrieve labels.
Parameters
----------
use_labels : bool, optional
If True, self.labels will be returned. If False, will
use either columns (if matrix is pandas DataFrame)
or indices (if matrix is np.ndarray)
"""
if isinstance(self.mat, pd.DataFrame):
if use_labels:
return [self.labels[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
else:
return [self.mat.columns.tolist()[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
else:
return scipy.cluster.hierarchy.leaves_list(self.linkage)
def calc_agg_coeff(self):
""" Returns the agglomerative coefficient, measuring the clustering
structure of the linkage matrix. Because it grows with the number of
observations, this measure should not be used to compare datasets of
very different sizes.
For each observation i, denote by m(i) its dissimilarity to the first
cluster it is merged with, divided by the dissimilarity of the merger
in the final step of the algorithm. The agglomerative coefficient is
the average of all 1 - m(i).
"""
# Turn into pandas DataFrame for fancy indexing
Z = pd.DataFrame(self.linkage, columns = ['obs1','obs2','dist','n_org'] )
# Get all distances at which an original observation is merged
all_dist = Z[ ( Z.obs1.isin(self.leafs) ) | (Z.obs2.isin(self.leafs) ) ].dist.values
# Divide all distances by last merger
all_dist /= self.linkage[-1][2]
# Calc final coefficient
coeff = np.mean( 1 - all_dist )
return coeff
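# Stand-alone sketch of the agglomerative coefficient described above (added for
# illustration; it mirrors calc_agg_coeff without the class or pandas). Z is a
# SciPy linkage matrix with columns [obs1, obs2, dist, n_obs].
def agg_coeff_from_linkage(Z):
    import numpy as np
    Z = np.asarray(Z)
    n_obs = Z.shape[0] + 1                               # a linkage over n points has n-1 rows
    leaf_ids = np.arange(n_obs)
    merges_a_leaf = np.isin(Z[:, 0], leaf_ids) | np.isin(Z[:, 1], leaf_ids)
    first_merge_dists = Z[merges_a_leaf, 2]              # distances where an original point is merged
    return np.mean(1.0 - first_merge_dists / Z[-1, 2])   # normalize by the final merger distance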
def main():
from optparse import OptionParser
    parser = OptionParser(usage="%prog --XSCALEfile=<LP filename> --outname=<output dendrogram>")
    parser.add_option("-o", "--outname", dest="outname", default='Dendrogram', help="output dendrogram file name")
parser.add_option("-t", "--threshold", dest="threshold", default='0.4', help="Distance threshold for clustering")
parser.add_option("-c", "--count",action="store_true", dest="count", default=False, help="Counts datasets in the biggest cluster and exit")
(options, args) = parser.parse_args()
thr = float(options.threshold)
CC = Clustering('Cluster_log.txt')
link = CC.tree()
if options.count:
CC.checkMultiplicity(thr)
print(CC.thrEstimation())
else:
CC.checkMultiplicity(thr)
CC.merge('ano', thr)
def plot_feature_overlap(df, cmap='binary', method='cluster'):
"""Plot feature-feature presence overlap of a pandas dataframe.
Args:
df: A pandas dataframe.
cmap: A matplotlib colormap.
method: Method of clustering, one of 'cluster' or 'tree'.
"""
V = len(df.columns)
    present = (df == df).values.astype(np.float32)  # .as_matrix() was removed in pandas 1.0
overlap = np.dot(present.T, present)
assert overlap.shape == (V, V)
# Sort features to make blocks contiguous.
if method == 'tree':
# TODO(fritzo) Fix this to not look awful.
grid = make_complete_graph(V)
weights = np.empty(grid.shape[1], dtype=np.float32)
for k, v1, v2 in grid.T:
weights[k] = overlap[v1, v2]
edges = estimate_tree(grid, weights)
order, order_inv = order_vertices(edges)
elif method == 'cluster':
distance = scipy.spatial.distance.pdist(overlap)
clustering = scipy.cluster.hierarchy.complete(distance)
order_inv = scipy.cluster.hierarchy.leaves_list(clustering)
else:
raise ValueError(method)
overlap = overlap[order_inv, :]
overlap = overlap[:, order_inv]
assert overlap.shape == (V, V)
pyplot.imshow(overlap**0.5, cmap=cmap)
pyplot.axis('off')
def cluster(self, u, ndim, keepRadius=False):
"""
"""
if self.verbose: print 'building region ...'
if len(u) > 10:
if keepRadius and self.region is not None and 'maxdistance' in self.region:
maxdistance = self.region['maxdistance']
else:
if self.radial:
if self.jackknife:
#maxdistance = initial_rdistance_guess(u, k=1, metric=self.metric)
maxdistance = nearest_rdistance_guess(u, metric=self.metric)
else:
maxdistance = find_rdistance(u, nbootstraps=20, metric=self.metric, verbose=self.verbose)
else:
maxdistance = find_maxdistance(u)
if self.force_shrink and self.region is not None and 'maxdistance' in self.region:
maxdistance = min(maxdistance, self.region['maxdistance'])
if self.keep_phantom_points and len(self.phantom_points) > 0:
# add phantoms to u now
            print('including phantom points in cluster members', self.phantom_points)
u = numpy.vstack((u, self.phantom_points))
ulow = numpy.max([u.min(axis=0) - maxdistance, numpy.zeros(ndim)], axis=0)
uhigh = numpy.min([u.max(axis=0) + maxdistance, numpy.ones(ndim)], axis=0)
else:
maxdistance = None
ulow = numpy.zeros(ndim)
uhigh = numpy.ones(ndim)
    if self.verbose: print('setting sampling region:', (ulow, uhigh), maxdistance)
self.region = dict(members=u, maxdistance=maxdistance, ulow=ulow, uhigh=uhigh)
self.generator = None
def rebuild(self, u, ndim, keepRadius=False):
if self.last_cluster_points is None or \
len(self.last_cluster_points) != len(u) or \
numpy.any(self.last_cluster_points != u):
self.cluster(u=self.transform_new_points(u), ndim=ndim, keepRadius=keepRadius)
self.last_cluster_points = u
# reset generator
self.generator = self.generate(ndim=ndim)
def rebuild(self, u, ndim, keepMetric=False):
if self.last_cluster_points is not None and \
len(self.last_cluster_points) == len(u) and \
numpy.all(self.last_cluster_points == u):
# do nothing if everything stayed the same
return
self.cluster(u=u, ndim=ndim, keepMetric=keepMetric)
self.last_cluster_points = u
    print('maxdistance:', self.region.maxdistance)
self.generator = self.generate(ndim)
def start_clustering(self):
functions.log('Calculate {0} distances...'.format(int(len(self.orfs) * (len(self.orfs) + 1) / 2)))
self.distances = self.create_distance_matrix()
functions.log('Start clustering...')
self.linkage_matrix = scipy.cluster.hierarchy.linkage(ssd.squareform(self.distances), method='complete')
functions.log('Clustering done.')
def clustering(self, clustering_distance):
"""
Create clusters
:param clustering_distance: max distance between ORFs in one clusters
:return:
"""
functions.log('Create clusters...')
for ind, o in enumerate(scipy.cluster.hierarchy.fcluster(self.linkage_matrix, clustering_distance, 'distance')):
if self.clusters.get(o) is None:
self.clusters[o] = set()
self.clusters[o].add(ind)
self.orfs_clusters[ind] = o
def get_args():
parser = argparse.ArgumentParser(description='Clustering ORFs by position in the graph')
parser.add_argument('sequences', type=str, help='Path to ORFs sequences file')
parser.add_argument('paths', type=str, help='Path to ORFs paths file from ORFFinderInGraph.py')
parser.add_argument('graph', type=str, help='Path to graph in GFA format')
    parser.add_argument('threshold', type=float, help='Max level of dissimilarity of ORFs in one cluster (between 0 and 1)')
parser.add_argument('-o', '--output', type=str, default='', help='Output folder')
    parser.add_argument('-sd', '--savedistances', default=False, action='store_true',
                        help='Save pairwise distances matrix (in .npy format)')
return parser.parse_args()
async def _auto_color(self, url: str, ranks):
phrases = ["Calculating colors..."] # in case I want more
#try:
await self.bot.say("**{}**".format(random.choice(phrases)))
clusters = 10
async with self.session.get(url) as r:
image = await r.content.read()
with open('data/leveler/temp_auto.png','wb') as f:
f.write(image)
im = Image.open('data/leveler/temp_auto.png').convert('RGBA')
im = im.resize((290, 290)) # resized to reduce time
ar = scipy.misc.fromimage(im)
shape = ar.shape
ar = ar.reshape(scipy.product(shape[:2]), shape[2])
codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), clusters)
vecs, dist = scipy.cluster.vq.vq(ar, codes) # assign codes
counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
# sort counts
freq_index = []
index = 0
for count in counts:
freq_index.append((index, count))
index += 1
sorted_list = sorted(freq_index, key=operator.itemgetter(1), reverse=True)
colors = []
for rank in ranks:
        color_index = min(rank, len(codes) - 1)  # clamp to a valid index into sorted_list
peak = codes[sorted_list[color_index][0]] # gets the original index
peak = peak.astype(int)
colors.append(''.join(format(c, '02x') for c in peak))
return colors # returns array
#except:
#await self.bot.say("```Error or no scipy. Install scipy doing 'pip3 install numpy' and 'pip3 install scipy' or read here: https://github.com/AznStevy/Maybe-Useful-Cogs/blob/master/README.md```")
# converts hex to rgb
def write_cluster_ids(words, cluster_ids, out=None):
"""Write given list of words and their corresponding cluster ids to out."""
assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'
if out is None:
out = sys.stdout
    for word, cid in zip(words, cluster_ids):
        print('%s\t%d' % (word, cid), file=out)
def main(argv=None):
if argv is None:
argv = sys.argv
try:
words, vectors, options = process_options(argv[1:])
    except Exception as e:
        if str(e):
            print('Error: %s' % str(e), file=sys.stderr)
return 1
else:
raise
dbscan = sklearn.cluster.DBSCAN(eps=options.eps, metric=options.metric)
dbscan.fit(numpy.array(vectors))
noisy = sum(1 for l in dbscan.labels_ if l == -1)
unique = len(set(dbscan.labels_))
logging.info('%d clusters, %d noisy, %d vectors' % (unique, noisy,
len(vectors)))
if noisy >= len(vectors) / 4:
logging.warning('%d/%d noisy (-1) labels (try higher eps?)' % \
(noisy, len(vectors)))
elif unique < (len(vectors)/2)**0.5:
logging.warning('only %d clusters (try lower eps?)' % unique)
write_cluster_ids(words, dbscan.labels_)
return 0
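# Small stand-alone DBSCAN sketch matching the flow of main() above (synthetic
# vectors; the eps and metric values are placeholders, not tuned).
def _demo_dbscan(eps=0.5, metric='euclidean'):
    import numpy
    import sklearn.cluster
    vectors = numpy.random.rand(100, 5)
    dbscan = sklearn.cluster.DBSCAN(eps=eps, metric=metric)
    dbscan.fit(vectors)
    noisy = sum(1 for l in dbscan.labels_ if l == -1)
    return len(set(dbscan.labels_)), noisy   # (number of distinct labels, noise points)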
def process_options(args):
options = argparser().parse_args(args)
if options.max_rank is not None and options.max_rank < 1:
raise ValueError('max-rank must be >= 1')
if options.k is not None and options.k < 2:
raise ValueError('cluster number must be >= 2')
if options.method == MINIBATCH_KMEANS and not with_sklearn:
logging.warning('minibatch kmeans not available, using kmeans (slow)')
options.method = KMEANS
if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 is only supported with scikit-learn %s' % KMEANS)
options.jobs = 1
wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
if options.k is None:
options.k = int(math.ceil((len(wv.words())/2)**0.5))
logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))
if options.normalize:
logging.info('normalize vectors to unit length')
wv.normalize()
words, vectors = wv.words(), wv.vectors()
if options.whiten:
logging.info('normalize features to unit variance')
vectors = scipy.cluster.vq.whiten(vectors)
return words, vectors, options
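# Hedged sketch of what the --whiten option above does: scipy.cluster.vq.whiten
# rescales each feature (column) to unit variance before clustering (toy data).
def _demo_whiten():
    import numpy
    import scipy.cluster.vq
    vectors = numpy.array([[1.0, 100.0],
                           [2.0, 300.0],
                           [3.0, 500.0]])
    whitened = scipy.cluster.vq.whiten(vectors)
    return whitened.std(axis=0)   # roughly [1.0, 1.0]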
def minibatch_kmeans(vectors, k):
if not with_sklearn:
raise NotImplementedError
# Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
# uses batch size 1000. sklearn KMeans defaults to n_init 10
kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
kmeans.fit(vectors)
return kmeans.labels_
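# Illustrative call of minibatch_kmeans on random vectors (synthetic data only;
# assumes scikit-learn is available, i.e. with_sklearn is True).
def _demo_minibatch_kmeans():
    import numpy
    vectors = numpy.random.rand(200, 8)
    labels = minibatch_kmeans(vectors, k=4)
    return len(set(labels))   # at most 4 distinct cluster ids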
def write_cluster_ids(words, cluster_ids, out=None):
"""Write given list of words and their corresponding cluster ids to out."""
assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'
if out is None:
out = sys.stdout
    for word, cid in zip(words, cluster_ids):
        print('%s\t%d' % (word, cid), file=out)
def __getattr__(self, key):
if key == 'linkage':
self.cluster()
return self.linkage
elif key in ['leafs', 'leaves']:
return self.get_leafs()
elif key == 'cophenet':
return self.calc_cophenet()
    elif key == 'agg_coeff':
        return self.calc_agg_coeff()
    raise AttributeError(key)
def calc_cophenet(self):
""" Returns Cophenetic Correlation coefficient of your clustering.
This (very very briefly) compares (correlates) the actual pairwise
distances of all your samples to those implied by the hierarchical
clustering. The closer the value is to 1, the better the clustering
    preserves the original distances.
"""
return scipy.cluster.hierarchy.cophenet( self.linkage, self.condensed_dist_mat )
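# Stand-alone illustration of the cophenetic correlation used above (synthetic
# data, not from the original source): correlate the original pairwise distances
# with the distances implied by the dendrogram.
def _demo_cophenet():
    import numpy as np
    import scipy.cluster.hierarchy
    import scipy.spatial.distance
    points = np.random.rand(20, 3)
    condensed = scipy.spatial.distance.pdist(points)
    linkage = scipy.cluster.hierarchy.linkage(condensed, method='average')
    c, coph_dists = scipy.cluster.hierarchy.cophenet(linkage, condensed)
    return c   # the closer to 1.0, the better the dendrogram preserves the distances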
def cluster(self, method='ward'):
""" Cluster distance matrix. This will automatically be called when
attribute linkage is requested for the first time.
Parameters
----------
method : str, optional
Clustering method (see scipy.cluster.hierarchy.linkage
for reference)
"""
# First, convert similarity matrix to distance matrix
if self.mat_type != 'distance':
if isinstance( self.mat, pd.DataFrame ):
            self.dist_mat = (self.mat.values - self.mat.max().max()) * -1  # .as_matrix() was removed in pandas 1.0
else:
self.dist_mat = ( self.mat - self.mat.max() ) * -1
else:
if isinstance( self.mat, pd.DataFrame ):
            self.dist_mat = self.mat.values
else:
self.dist_mat = self.mat
# Second, convert into condensed distance matrix - otherwise clustering
# thinks we are passing observations instead of final scores
self.condensed_dist_mat = scipy.spatial.distance.squareform( self.dist_mat, checks=False )
self.linkage = scipy.cluster.hierarchy.linkage(self.condensed_dist_mat, method=method)
# Save method in case we want to look it up later
self.method = method
module_logger.info('Clustering done using method "{0}"'.format(method) )
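# Minimal sketch (toy matrix) of the similarity-to-distance conversion and the
# condensing step performed by cluster() above.
def _demo_cluster_from_similarity():
    import numpy as np
    import scipy.spatial.distance
    import scipy.cluster.hierarchy
    sim = np.array([[1.0, 0.9, 0.1],
                    [0.9, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])
    dist = (sim - sim.max()) * -1                                   # high similarity -> small distance
    condensed = scipy.spatial.distance.squareform(dist, checks=False)
    return scipy.cluster.hierarchy.linkage(condensed, method='ward')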
def plot3d(self, k=5, criterion='maxclust', **kwargs):
"""Plot neuron using :func:`pymaid.plot.plot3d`. Will only work if
instance has neurons attached to it.
Parameters
----------
k : {int, float}
criterion : {'maxclust','distance'}, optional
If `maxclust`, `k` clusters will be formed. If `distance`,
clusters will be created at threshold `k`.
**kwargs
will be passed to plot.plot3d()
see help(plot.plot3d) for a list of keywords
See Also
--------
:func:`pymaid.plot.plot3d`
Function called to generate 3d plot
"""
if 'neurons' not in self.__dict__:
module_logger.error(
'This works only with cluster results from neurons')
return None
cmap = self.get_colormap(k=k, criterion=criterion)
kwargs.update({'color': cmap})
return plotting.plot3d(self.neurons, **kwargs)
def to_json(self, fname='cluster.json', k=5, criterion='maxclust'):
""" Convert clustered neurons into json file that can be loaded into
CATMAID selection table.
Parameters
----------
fname : str, optional
Filename to save selection to
k : {int, float}
criterion : {'maxclust','distance'}, optional
If `maxclust`, `k` clusters will be formed. If `distance`,
clusters will be created at threshold `k`.
See Also
--------
:func:`pymaid.plot.plot3d`
Function called to generate 3d plot
"""
cmap = self.get_colormap(k=k, criterion=criterion)
# Convert to 0-255
cmap = { n : [ int(v*255) for v in cmap[n] ] for n in cmap }
data = [ dict(skeleton_id=int(n),
color="#{:02x}{:02x}{:02x}".format( cmap[n][0],cmap[n][1],cmap[n][2] ),
opacity=1
) for n in cmap ]
with open(fname, 'w') as outfile:
json.dump(data, outfile)
module_logger.info('Selection saved as %s in %s' % (fname, os.getcwd()))
return
def get_clusters(self, k, criterion='maxclust', return_type='labels'):
""" Wrapper for cluster.hierarchy.fcluster to get clusters.
Parameters
----------
k : {int, float}
criterion : {'maxclust','distance'}, optional
If `maxclust`, `k` clusters will be formed. If
`distance`, clusters will be created at threshold `k`.
return_type : {'labels','indices','columns','rows'}
Determines what to construct the clusters of. 'labels'
only works if labels are provided. 'indices' refers
to index in distance matrix. 'columns'/'rows' works
if distance matrix is pandas DataFrame
Returns
-------
list
list of clusters [ [leaf1, leaf5], [leaf2, ...], ... ]
"""
cl = scipy.cluster.hierarchy.fcluster(self.linkage, k, criterion=criterion)
if self.labels and return_type.lower()=='labels':
return [[self.labels[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
elif return_type.lower() == 'rows':
return [[self.mat.columns.tolist()[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
elif return_type.lower() == 'columns':
return [[self.mat.index.tolist()[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
else:
return [[j for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
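# Illustrative use of scipy.cluster.hierarchy.fcluster with the 'maxclust'
# criterion, mirroring get_clusters() above (synthetic points).
def _demo_fcluster_maxclust(k=3):
    import numpy as np
    import scipy.cluster.hierarchy
    import scipy.spatial.distance
    points = np.random.rand(30, 2)
    condensed = scipy.spatial.distance.pdist(points)
    linkage = scipy.cluster.hierarchy.linkage(condensed, method='ward')
    cl = scipy.cluster.hierarchy.fcluster(linkage, k, criterion='maxclust')
    # group point indices by cluster id, as get_clusters() does for 'indices'
    return [[j for j in range(len(cl)) if cl[j] == i] for i in range(cl.min(), cl.max() + 1)]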
def checkMultiplicity(self, thr):
FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
counter=collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]
print('You are clustering with a threshold of %s'%(thr))
print('The biggest cluster contains %s datasets from a total of %s'%(counter[Best], len(self.labelList)))
def minimalForCompleteness(self):
print("Running estimator for minimal threshold for completeness")
labels=self.createLabels()
x = 0.00
dx = 0.05
countsList = {}
x_list = []
while x < 1:
Arrays= {}
FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
counter=collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
toProcess=[Best]
y=0
for cluster, filename in zip(FlatC,labels):
if cluster in toProcess:
hklFile = any_reflection_file(filename)
b= hklFile.as_miller_arrays()
for column in b:
if column.is_xray_intensity_array():
Arrays[y]=column
break
y+=1
        try:
            Arr = Arrays[0]
        except KeyError:
            # no intensity arrays found for this cluster: record zero completeness and move on
            countsList[x] = 0
            x += dx
            continue
for label in range(1, y):
try:
Arr = Arr.concatenate(Arrays[label])
except:
pass
countsList[x]=(Arr.completeness())
x+= dx
# return minimal for max
L = []
for key in countsList:
if countsList[key]>0.98:
L.append(key)
L.sort()
return L[0]