def compare_clusters(X, Y, method='spectral', s=10000):
    """Score how similarly two data sets cluster the same samples.

    The columns of ``X`` and ``Y`` are treated as samples.  Each column is
    scaled to unit L2 norm, a random subset of at most ``s`` columns is drawn
    (the same subset from both inputs), and each subset is converted into the
    affinity matrix ``exp(-(1 - A.dot(A.T))**2 / 2)``.  Both affinity matrices
    are clustered with the selected algorithm and the two labelings are
    compared.

    Parameters
    ----------
    X, Y : ndarray
        2-D arrays with one sample per column.  They must have the same
        number of columns because a single boolean mask selects the sampled
        columns of both.
    method : {'spectral', 'ap'}
        ``'spectral'`` clusters with ``SpectralClustering`` using
        ``X.shape[1] // 50`` clusters clamped to the range [5, 30];
        ``'ap'`` clusters with ``AffinityPropagation``.
    s : int
        Maximum number of columns to sample.

    Returns
    -------
    The value of ``adjusted_mutual_info_score`` for the two labelings.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'spectral'`` nor ``'ap'``.
    """
    # Fail early with a clear message instead of reaching the final return
    # with the label arrays never assigned.
    if method not in ('spectral', 'ap'):
        raise ValueError(
            "method must be 'spectral' or 'ap', got %r" % (method,))

    # All-zero columns have zero norm and divide to NaN; reset those to 0.
    A = (X / np.linalg.norm(X, axis=0)).T
    A[np.isnan(A)] = 0
    B = (Y / np.linalg.norm(Y, axis=0)).T
    B[np.isnan(B)] = 0

    # Shuffled boolean mask holding exactly min(s, n_samples) True entries.
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # from NumPy.
    n_samples = A.shape[0]
    random_samples = np.zeros(n_samples, dtype=bool)
    random_samples[:min(s, n_samples)] = True
    np.random.shuffle(random_samples)
    A = A[random_samples]
    B = B[random_samples]

    dA = 1 - A.dot(A.T)
    dA = np.exp(-dA**2 / 2.)
    dB = 1 - B.dot(B.T)
    dB = np.exp(-dB**2 / 2.)
    # Release the sampled matrices before clustering to lower peak memory.
    del A, B

    if method == 'spectral':
        # Floor division keeps the cluster count an integer on Python 3.
        n = max(5, min(30, X.shape[1] // 50))
        lA = SpectralClustering(
            n_clusters=n, affinity='precomputed').fit_predict(dA)
        lB = SpectralClustering(
            n_clusters=n, affinity='precomputed').fit_predict(dB)
    else:
        lA = AffinityPropagation(affinity='precomputed').fit_predict(dA)
        lB = AffinityPropagation(affinity='precomputed').fit_predict(dB)
    return adjusted_mutual_info_score(lA, lB)
# Example source code for Python's adjusted_mutual_info_score()
def evaluate_groups(true_groups, predicted):
    """Score predicted groupings against the ground truth, item by item.

    For every batch item the predicted group of a position is the argmax
    over axis 1 of ``predicted`` and its confidence is the corresponding
    maximum.  Positions whose true group equals 0 are left out of both the
    score and the confidence.

    :param true_groups: (B, 1, W, H, 1)
    :param predicted: (B, K, W, H, 1)
    :return: scores, confidences -- two lists with one entry per batch item:
        the AMI score and the mean confidence over the kept positions
    """
    assert true_groups.ndim == predicted.ndim == 5, true_groups.shape
    n_items, n_groups = predicted.shape[:2]

    # Flatten everything after the batch (and group) axis.
    flat_truth = true_groups.reshape(n_items, -1)
    flat_pred = predicted.reshape(n_items, n_groups, -1)
    hard_labels = flat_pred.argmax(1)
    label_conf = flat_pred.max(1)

    scores = []
    confidences = []
    for truth_row, label_row, conf_row in zip(
            flat_truth, hard_labels, label_conf):
        keep = truth_row != 0.0
        scores.append(
            adjusted_mutual_info_score(truth_row[keep], label_row[keep]))
        confidences.append(np.mean(conf_row[keep]))
    return scores, confidences
def score_simulation(h5_file):
    """Print the best AMI between the simulated truth and stored clusterings.

    The ground truth is six equally sized groups of consecutive items; the
    group size is derived from the length of the ``timeseries/indptr``
    table.  Every column of ``genes/clusters`` is scored against that truth
    and the largest score is printed.  The reported maximum never drops
    below 0 because the running maximum starts at 0.

    :param h5_file: passed unchanged to ``TimeSeriesData``
    """
    print("Opening/creating database file")
    tsdatabase = TimeSeriesData(h5_file)

    n_series = tsdatabase.h5_table["timeseries/indptr"].shape[0] - 1
    nreps = int(n_series / 6)
    # Items belonging to the same cluster are next to one another.
    # Order is: drop, rise, normal, noisy, conditionally rare, seasonal
    true_labels = [group for group in range(6) for _ in range(nreps)]

    n_clusterings = tsdatabase.h5_table["genes/clusters"].shape[1]
    amis = [
        metrics.adjusted_mutual_info_score(
            true_labels, tsdatabase.get_cluster_labels(column))
        for column in range(n_clusterings)
    ]
    # The leading 0 is the floor the running maximum started from.
    max_ami = max([0] + amis)
    print("Maximum AMI of clusters is: %f" % (max_ami,))
def calc(gr_truth, predicted):
    """Return the normalized mutual information of two labelings.

    :param gr_truth: ground-truth labels
    :param predicted: predicted labels
    :return: result of ``normalized_mutual_info_score(gr_truth, predicted)``
    """
    # Other metrics (precision/recall/F-score, Jaccard, plain and adjusted
    # mutual information) used to be printed here for debugging.
    nmi = normalized_mutual_info_score(gr_truth, predicted)
    return nmi
def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one line of benchmark results.

    The line contains ``name``, the fit time in seconds, the estimator's
    inertia and six clustering scores.  The label-based scores compare the
    fitted labels with the module-level ``labels``; the silhouette score is
    computed with the module-level ``sample_size``.
    """
    started = time()
    estimator.fit(data)
    elapsed = time() - started

    found = estimator.labels_
    row = (
        name,
        elapsed,
        estimator.inertia_,
        metrics.homogeneity_score(labels, found),
        metrics.completeness_score(labels, found),
        metrics.v_measure_score(labels, found),
        metrics.adjusted_rand_score(labels, found),
        metrics.adjusted_mutual_info_score(labels, found),
        metrics.silhouette_score(data, found,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % row)
def bench_k_means(labels, labels_, name, data):
    """Print clustering scores for ``labels_`` and a histogram summary.

    The first printed line holds ``name`` followed by the homogeneity,
    completeness, V-measure, adjusted Rand and adjusted mutual information
    scores of ``labels_`` against ``labels``.  The second line reports the
    smallest and largest bin count of a histogram of ``labels_`` that uses
    one bin per distinct label.

    :param labels: reference labels
    :param labels_: labels to evaluate
    :param name: text shown in the first column
    :param data: accepted for signature compatibility; not used here
    """
    print('%20s %.3f %.3f %.3f %.3f %.3f'
          % (name,
             metrics.homogeneity_score(labels, labels_),
             metrics.completeness_score(labels, labels_),
             metrics.v_measure_score(labels, labels_),
             metrics.adjusted_rand_score(labels, labels_),
             metrics.adjusted_mutual_info_score(labels, labels_)))
    nbins = len(set(labels_))
    vals, _ = np.histogram(labels_, bins=nbins)
    # One pre-formatted string prints the same text under Python 2 and
    # Python 3; the former print statement was a syntax error on Python 3.
    print('%s %s %s %s'
          % (20 * ' ', 'hist-min,max', np.min(vals), np.max(vals)))
def computeAdjustedEvaluations(self, labels_families, predicted_clusters):
    """Store the chance-adjusted scores of ``predicted_clusters`` on ``self``.

    Sets ``self.adjusted_rand_score`` and ``self.adjusted_mutual_info_score``.
    When ``labels_families`` is None there is nothing to compare against and
    both attributes are set to 0.
    """
    if labels_families is not None:
        self.adjusted_rand_score = metrics.adjusted_rand_score(
            labels_families, predicted_clusters)
        self.adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(
            labels_families, predicted_clusters)
    else:
        self.adjusted_rand_score = 0
        self.adjusted_mutual_info_score = 0
def toJson(self):
    """Return the stored evaluation scores as a dict keyed by score name."""
    return {
        'homogeneity': self.homogeneity,
        'completeness': self.completeness,
        'v_measure': self.v_measure,
        'adjusted_rand_score': self.adjusted_rand_score,
        'adjusted_mutual_info_score': self.adjusted_mutual_info_score,
    }
def score(self, truth=None):
    """Score the fitted model against a reference.

    ``truth`` is stored on ``self.truth`` only when no reference has been
    stored yet.  A stored reference of length 4 is passed to
    ``objective_f`` together with ``[self.H_, self.a_, self.Y_, self.b_]``
    and the negated result is returned; any other reference is compared
    with ``self.labels_`` using adjusted mutual information.

    :param truth: reference used when ``self.truth`` is still None
    :return: ``-objective_f(...)`` or the adjusted mutual information score
    """
    # Identity test on purpose: ``== None`` on a NumPy array compares
    # elementwise and the result cannot be used as an ``if`` condition.
    if self.truth is None:
        self.truth = truth
    if len(self.truth) == 4:
        # NOTE(review): presumably a 4-item reference is a parameter set
        # matching (H, a, Y, b) rather than labels -- confirm; a labeling of
        # exactly four samples would take this branch as well.
        return -objective_f(self.truth, [self.H_, self.a_, self.Y_, self.b_])
    return metrics.adjusted_mutual_info_score(self.labels_, self.truth)
def analyze_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one line of quality figures.

    The line contains ``name``, the fit time in seconds, the estimator's
    inertia, five label-based scores against the module-level ``labels`` and
    the silhouette score computed with the module-level ``samples``.
    """
    start = time()
    estimator.fit(data)
    fit_seconds = time() - start

    assigned = estimator.labels_
    label_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    fields = [name, fit_seconds, estimator.inertia_]
    fields.extend(scorer(labels, assigned) for scorer in label_scorers)
    fields.append(metrics.silhouette_score(
        data, assigned, metric='euclidean', sample_size=samples))
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f" % tuple(fields))
def performance(self, group_labels=None):
    """
    Computes performance metrics for clustering algorithm

    The labels found by ``self.algorithm`` are compared with integer-coded
    reference labels and the homogeneity, completeness, V-measure, adjusted
    Rand and adjusted mutual information scores are written to
    ``self.results``.  The reference labels are stored in ``self.clusters``.

    Parameters
    ----------
    group_labels : (optional) ndarray(shape=nsubjects)
        Labels for subject groups.  When omitted, every subject is assigned
        to group 0.
    """
    lab = self.algorithm.labels_
    n_samples = len(lab)

    # Start with everyone in group 0; this is the final result when no
    # reference labels are supplied.
    truelab = np.zeros(n_samples)
    if group_labels is not None:
        # Replace each distinct label by its rank among the sorted unique
        # labels.
        for i, label_i in enumerate(np.unique(group_labels)):
            truelab[group_labels == label_i] = i
        self.clusters["true"] = group_labels
    self.clusters["true_int"] = truelab

    self.results["homogeneity"] = homogeneity_score(truelab, lab)
    self.results["completeness"] = completeness_score(truelab, lab)
    self.results["v_measure"] = v_measure_score(truelab, lab)
    self.results["adj_rand"] = adjusted_rand_score(truelab, lab)
    self.results["adj_MI"] = adjusted_mutual_info_score(truelab, lab)
def __init__(self):
    """Register evaluation metrics and clustering tasks by display name.

    ``ami``, ``hom``, ``com`` and ``vm`` are looked up in the enclosing
    module; the task entries are bound methods of this instance.
    """
    metric_entries = (
        ("Adjusted Mutual Information", ami),
        ("Homogeneity", hom),
        ("Completeness", com),
        ("V-measure", vm),
    )
    self.eval_metrics = dict(metric_entries)

    task_entries = (
        ("k-means Task", self.kmeans_test),
        ("Mixture of Gaussians Task", self.mog_test),
    )
    self.clustering_tests = dict(task_entries)
def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one line of benchmark results.

    The line contains ``name``, the fit time in seconds, the estimator's
    inertia and six clustering scores.  The label-based scores compare the
    fitted labels with the module-level ``labels``; the silhouette score is
    computed with the module-level ``sample_size``.
    """
    started = time()
    estimator.fit(data)
    elapsed = time() - started

    found = estimator.labels_
    row = (
        name,
        elapsed,
        estimator.inertia_,
        metrics.homogeneity_score(labels, found),
        metrics.completeness_score(labels, found),
        metrics.v_measure_score(labels, found),
        metrics.adjusted_rand_score(labels, found),
        metrics.adjusted_mutual_info_score(labels, found),
        metrics.silhouette_score(data, found,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % row)
def assort(self, model):
    """Compare a homophily indicator of the data with one from ``model``.

    Every off-diagonal entry of ``self.data`` and of a matrix generated by
    ``model`` is placed in one of four categories, depending on whether the
    entry is 1 or 0 and whether the matching cosine similarity is positive.
    From the category counts a homophily indicator is computed for both
    matrices, and the two category assignments are compared with AMI and
    NMI.  All three figures are printed.

    :param model: object providing ``generate(N)`` and
        ``similarity_matrix(sim='cos')``
    :return: dict with the keys ``'NMI'``, ``'homo_ind1_source'`` and
        ``'homo_ind1_learn'``
    """
    def _categorize(entries, sim):
        # Category codes: 0 = entry 1 / sim > 0, 1 = entry 1 / sim <= 0,
        # 2 = entry 0 / sim > 0, 3 = entry 0 / sim <= 0.  The diagonal and
        # anything left at the -1 placeholder end up masked.
        indic = ma.array(np.ones(sim.shape) * -1, mask=ma.masked)
        indic[(entries == 1) & (sim > 0)] = 0
        indic[(entries == 1) & (sim <= 0)] = 1
        indic[(entries == 0) & (sim > 0)] = 2
        indic[(entries == 0) & (sim <= 0)] = 3
        np.fill_diagonal(indic, ma.masked)
        indic[indic == -1] = ma.masked
        return indic

    def _homophily(indic, size):
        # (categories 0 and 3) minus (categories 1 and 2), divided by the
        # number of ordered off-diagonal pairs.
        agree = (indic == 0).sum() + (indic == 3).sum()
        disagree = (indic == 1).sum() + (indic == 2).sum()
        return 1.0 * (agree - disagree) / (size * (size - 1))

    data = self.data
    N = self.data.shape[0]
    sim_source = self.similarity_matrix(sim='cos')
    y = model.generate(N)
    sim_learn = model.similarity_matrix(sim='cos')
    assert(N == y.shape[0])

    # The indicator arrays are built before their diagonals are masked; an
    # earlier fill_diagonal call touched indic_source before it existed.
    indic_source = _categorize(data, sim_source)
    indic_learn = _categorize(y, sim_learn)

    # Homophily indicator (Christine)
    homo_ind1_source = _homophily(indic_source, N)
    homo_ind1_learn = _homophily(indic_learn, N)

    # AMI / NMI
    from sklearn import metrics
    AMI = metrics.adjusted_mutual_info_score(
        indic_source.compressed(), indic_learn.compressed())
    NMI = metrics.normalized_mutual_info_score(
        indic_source.compressed(), indic_learn.compressed())

    print('homo_ind1 source: %f' % (homo_ind1_source))
    print('homo_ind1 learn: %f' % (homo_ind1_learn))
    print('AMI: %f, NMI: %f' % (AMI, NMI))

    d = {'NMI': NMI,
         'homo_ind1_source': homo_ind1_source,
         'homo_ind1_learn': homo_ind1_learn}
    return d
def compute_affinity_propagation(preference_, X):
    """Cluster ``X`` with affinity propagation, print scores, plot clusters.

    When ``X`` is None, 300 blob samples around three fixed centers are
    generated and clustered instead.  The clustering is compared with
    reference labels through homogeneity, completeness, V-measure, adjusted
    Rand index, adjusted mutual information and the Fowlkes-Mallows score.
    Finally each cluster is drawn with its exemplar and the figure is shown.

    :param preference_: forwarded as ``preference`` to
        ``AffinityPropagation``
    :param X: samples to cluster, or None to use generated blobs; the plot
        uses the first two columns of ``X``
    """
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300

    # ``is None`` rather than ``== None``: comparing a NumPy array with None
    # is elementwise and cannot be used as an ``if`` condition.
    if X is None:
        X, labels_true = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1, random_state=0)
        print("Data is none!!!")
        print("Generating " + str(n_samples) + " samples")
    else:
        # NOTE(review): the reference labels come from freshly generated
        # blobs, not from X, so the scores below are only meaningful if X
        # was produced by the same make_blobs call -- confirm with callers.
        _, labels_true = make_blobs(n_samples=len(X), centers=centers,
                                    cluster_std=1, random_state=0)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f"
          % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f"
          % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f"
          % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Fowlkes Mallows Score: %0.3f"
          % metrics.fowlkes_mallows_score(labels_true, labels))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
        # Connect every member of the cluster to its exemplar.
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]],
                     [cluster_center[1], x[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def compare_with_children(
        self, idea_id, post_ids, post_clusters, remainder, labels):
    """Compare a clustering of posts with the child ideas of ``idea_id``.

    Each post is assigned to the child ideas that contain it; posts found
    in no child are assigned to ``idea_id`` itself.  When the idea has
    children, a single idea is chosen per post and that classification is
    scored against ``labels``.

    :param idea_id: key into ``self.idea_children``
    :param post_ids: post ids, in the order that ``labels`` refers to
    :param post_clusters: iterable of clusters, each an iterable of post ids
    :param remainder: post ids handled as one additional, final cluster
    :param labels: cluster label per entry of ``post_ids``
    :return: tuple ``(compare_with_ideas, all_idea_scores, ideas_of_post,
        children_remainder)``.  ``compare_with_ideas`` is a dict of scores,
        or None when the idea has no children; ``all_idea_scores`` holds one
        ``{idea: post count}`` mapping per cluster; ``ideas_of_post`` maps a
        post id to its ideas, best match first; ``children_remainder`` is
        the set of posts found in no child.
    """
    # Compare to children classification
    compare_with_ideas = None
    all_idea_scores = []
    ideas_of_post = defaultdict(list)
    children_remainder = set(post_ids)
    children_ids = self.idea_children[idea_id]
    if len(children_ids):
        posts_of_children = {
            child_id: self.get_posts_of_idea(child_id)
            for child_id in children_ids}
        # Loop variables are named differently from the ``idea_id``
        # parameter so the parent id is still intact below.
        for child_id, c_post_ids in posts_of_children.items():
            for post_id in c_post_ids:
                ideas_of_post[post_id].append(child_id)
            children_remainder -= set(c_post_ids)
        # Posts that belong to no child stay with the parent idea.
        for post_id in children_remainder:
            ideas_of_post[post_id] = [idea_id]
        # if many ideas to a post, choose one with the most ideas in same
        # cluster. A bit arbitrary but I need a single idea.
        for cluster in chain(post_clusters, (remainder,)):
            idea_score = defaultdict(int)
            all_idea_scores.append(idea_score)
            for post_id in cluster:
                for candidate_id in ideas_of_post[post_id]:
                    idea_score[candidate_id] += 1
            for post_id in cluster:
                if len(ideas_of_post[post_id]) > 1:
                    # Most frequent idea of this cluster comes first.
                    scores = [(idea_score[candidate_id], candidate_id)
                              for candidate_id in ideas_of_post[post_id]]
                    scores.sort(reverse=True)
                    ideas_of_post[post_id] = [score[1] for score in scores]
        idea_of_index = [ideas_of_post[post_id][0] for post_id in post_ids]
        compare_with_ideas = {
            "Homogeneity": metrics.homogeneity_score(idea_of_index, labels),
            "Completeness": metrics.completeness_score(idea_of_index, labels),
            "V-measure": metrics.v_measure_score(idea_of_index, labels),
            "Adjusted Rand Index": metrics.adjusted_rand_score(
                idea_of_index, labels),
            "Adjusted Mutual Information": metrics.adjusted_mutual_info_score(
                idea_of_index, labels)}
    else:
        for post_id in children_remainder:
            ideas_of_post[post_id] = [idea_id]
        for cluster in chain(post_clusters, (remainder,)):
            all_idea_scores.append({idea_id: len(cluster)})
    return (compare_with_ideas, all_idea_scores, ideas_of_post,
            children_remainder)