def perform_clustering(X, connectivity, title, num_clusters=3, linkage='ward'):
plt.figure()
model = AgglomerativeClustering(linkage=linkage,
connectivity=connectivity, n_clusters=num_clusters)
model.fit(X)
# extract labels
labels = model.labels_
# specify marker shapes for different clusters
markers = '.vx'
for i, marker in zip(range(num_clusters), markers):
        # plot the points belonging to the current cluster
plt.scatter(X[labels==i, 0], X[labels==i, 1], s=50,
marker=marker, color='k', facecolors='none')
plt.title(title)
# The snippet above is from agglomerative.py in Python-Machine-Learning-Cookbook (author: PacktPublishing).
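A minimal usage sketch for perform_clustering (hypothetical toy data; assumes numpy, matplotlib, and AgglomerativeClustering are imported as in the original file):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import kneighbors_graph

# Toy data: 100 random 2-D points.
X = np.random.RandomState(0).rand(100, 2)
# k-nearest-neighbors graph as a connectivity constraint for structured clustering.
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
perform_clustering(X, connectivity, 'Structured (k-NN connectivity)')
perform_clustering(X, None, 'Unstructured')
plt.show()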
# The snippet below is from clustering.py in oss-github-analysis-project (author: itu-oss-project-team).
def agglomerative_clustering(self, out_path, pd_data, number_of_clusters):
headers, repos, features = self.__fetch_data(pd_data)
agglomerative_clustering = AgglomerativeClustering(n_clusters=number_of_clusters, linkage="complete")
agglomerative_clustering.fit(features)
# form clusters
clusters = []
for i in range(0, number_of_clusters): # k cluster
repo_list = []
for j in range(0, len(agglomerative_clustering.labels_)): # a label for each repo.
if i == agglomerative_clustering.labels_[j]: # if repo label is equal to Cluster number
repo_list.append(repos[j]) # add repo to cluster i's list.
clusters.append(repo_list)
out_file_path = os.path.join(out_path, "agglomerative_noOfClusters" + str(number_of_clusters))
self.__export_agglomerative_results(agglomerative_clustering, clusters, out_file_path)
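For reference, the nested loop above is O(k*n); a single pass over the labels does the same grouping (a sketch using only names already in scope):

from collections import defaultdict

# Group repos by their cluster label in one pass.
grouped = defaultdict(list)
for repo, label in zip(repos, agglomerative_clustering.labels_):
    grouped[label].append(repo)
clusters = [grouped[i] for i in range(number_of_clusters)]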
def __init__(self, edges, branching_factor=50, threshold=0.1):
# Make features list.
features = []
for i in range(len(edges)):
edge = edges[i]
features.append([edge['perimeter'], edge['area'],
edge['shape_factor'], edge['radius_deviation']])
features = np.array(features)
# Normalize features
normed_features = features.copy()
for i in range(features.shape[1]):
        avg = np.median(features[:, i])
        std = np.std(features[:, i])
        normed_features[:, i] -= avg
        normed_features[:, i] /= std
self.features = features
self.normed_features = normed_features
self.branching_factor = branching_factor
self.threshold = threshold
#self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
self.run(KMeans, n_clusters=2)
#self.run(AgglomerativeClustering, n_clusters=2)
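For comparison, sklearn's StandardScaler performs the same per-column scaling but centers on the mean rather than the median used above; a one-line sketch, assuming mean-centering is acceptable here:

from sklearn.preprocessing import StandardScaler

# Mean-centered, std-scaled equivalent of the manual normalization loop.
normed_features = StandardScaler().fit_transform(features)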
def cluster_ward(self, image_cols):
# Connectivity
# TODO optional connectivity
connectivity = grid_to_graph(*self.image.shape[:2])
ward = AgglomerativeClustering(
n_clusters=self.params.num_clusters,
linkage='ward',
connectivity=connectivity
)
ward.fit(image_cols)
self.number_of_clusters = len(np.unique(ward.labels_))
    print('number of clusters', self.number_of_clusters)
centers = np.zeros((self.number_of_clusters, 3))
for i in range(0, self.number_of_clusters):
cluster_points = image_cols[ward.labels_ == i]
cluster_mean = np.mean(cluster_points, axis=0)
centers[i, :] = cluster_mean
return centers, ward.labels_
def agglomerative_clustering(X, k=10):
""" Run an agglomerative clustering on X.
Args:
X: the TF-IDF matrix where each line represents a document and each
column represents a word, typically obtained by running
transform_text() from the TP2.
k: the number of clusters we want (default: 10).
Returns:
An AgglomerativeClustering model trained on X.
"""
model = AgglomerativeClustering(n_clusters=k)
model.fit(X)
# Note all the other functions are the same except we use
# 'AgglomerativeClustering' instead of 'KMeans'.
return model
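A hypothetical call site following the docstring (corpus is a placeholder, and transform_text() is the TP2 helper the docstring references, assumed to return a sparse TF-IDF matrix):

X = transform_text(corpus).toarray()  # densify: AgglomerativeClustering rejects sparse input
model = agglomerative_clustering(X, k=10)
print(model.labels_[:20])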
# Ex4.1
def cluster_agglomerative(X_train, model_args=None, gridsearch=True, connectivity_graph=True, connectivity_graph_neighbors=10):
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
    print('AgglomerativeClustering')
    model_args = dict(model_args or {})  # guard: the default is None, and it is mutated below
if connectivity_graph:
print('Creating k-neighbors graph for connectivity restraint')
connectivity = kneighbors_graph(X_train, n_neighbors=connectivity_graph_neighbors)
model_args['connectivity'] = connectivity
if gridsearch is True:
        ## TODO:
        # add hyperparameter searching. No scoring method is available for this
        # model, so we can't easily use grid searching.
raise NotImplementedError('No hyperparameter optimization available yet for this model. Set gridsearch to False')
# prune(param_grid, model_args)
else:
if 'n_clusters' not in model_args:
raise KeyError('Need to define n_clusters for AgglomerativeClustering')
param_grid = None
return ModelWrapper(AgglomerativeClustering, X=X_train, model_args=model_args, param_grid=param_grid, unsupervised=True)
def test_AgglomerativeClustering_nclusters(*data):
'''
test the performance with different n_clusters
:param data: data, target
:return: None
'''
X,labels_true=data
nums=range(1,50)
ARIs=[]
for num in nums:
clst=cluster.AgglomerativeClustering(n_clusters=num)
predicted_labels=clst.fit_predict(X)
ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(nums,ARIs,marker="+")
ax.set_xlabel("n_clusters")
ax.set_ylabel("ARI")
fig.suptitle("AgglomerativeClustering")
plt.show()
def test_linkage_misc():
# Misc tests on linkage
rng = np.random.RandomState(42)
X = rng.normal(size=(5, 5))
assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
assert_raises(ValueError, linkage_tree, X, linkage='foo')
assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))
# Smoke test FeatureAgglomeration
FeatureAgglomeration().fit(X)
# test hierarchical clustering on a precomputed distances matrix
dis = cosine_distances(X)
res = linkage_tree(dis, affinity="precomputed")
assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
# test hierarchical clustering on a precomputed distances matrix
res = linkage_tree(X, affinity=manhattan_distances)
assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def agglomerativeClustering(sourceFiles, fileExtension):
""" Performs agglomerative hierarchical clustering using files with <fileExtension> in the <sourceFiles> directory and return accuracy measure"""
try:
accuracy = 0
# Step 1 - Check the required algorithm to specify the data type to load
        dataFiles = glob.glob("%s/*.%s" % (sourceFiles, fileExtension)) # Get the paths of files to load
dataSamples, dataLabels, loadedClusters = [], [], []
for dataPoint in dataFiles:
dataSamples.append([float(x) for x in open(dataPoint).read()[1:-1].split(",")])
# Also load its cluster
            clusterName, paramNames = loadLabelFromFile(dataPoint.replace(".%s" % fileExtension, ".metadata"))
            if clusterName not in loadedClusters:
loadedClusters.append(clusterName)
dataLabels.append(loadedClusters.index(clusterName))
prettyPrint("Successfully retrieved %s instances for clustering" % len(dataSamples))
# Step 2 - Perform clustering
clusterer = AgglomerativeClustering(n_clusters=len(loadedClusters))
predicted = clusterer.fit_predict(numpy.array(dataSamples), dataLabels)
accuracy = round(metrics.accuracy_score(dataLabels, predicted), 2)
except Exception as e:
prettyPrint("Error encountered: %s" % e, "error")
return accuracy
def agglom(reduced_data, n_clusters):
#----Do Agglomerative clustering and return relevant performance data
clustering = cluster.AgglomerativeClustering(n_clusters = n_clusters)
clustering = clustering.fit(reduced_data)
sil_score = metrics.silhouette_score(reduced_data, clustering.labels_, metric='euclidean')
return {
"labels":clustering.labels_,
"silhouette_score": sil_score
}
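Because agglom() returns the silhouette score alongside the labels, a small sweep can be used to choose n_clusters; a sketch, assuming reduced_data (e.g. dimensionality-reduced features) is already defined:

# Try several cluster counts and keep the one with the best silhouette score.
scores = {k: agglom(reduced_data, k)["silhouette_score"] for k in range(2, 11)}
best_k = max(scores, key=scores.get)
print("best n_clusters:", best_k, "silhouette:", scores[best_k])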
def _cluster_documents(self):
method = self.params['cluster_method']
n_clusters = int(self.params['cluster_n_clusters'])
n_samples = len(self.document_vectors)
if n_clusters > n_samples:
n_clusters = n_samples
if method == 'kmeans':
clusterer = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
else:
clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete', affinity='cosine')
clustering = clusterer.fit(self.document_vectors)
cluster_labels = clustering.labels_
    # KMeans exposes cluster_centers_; AgglomerativeClustering does not,
    # so fall back to per-cluster means of the document vectors (numpy assumed imported as np).
    if hasattr(clustering, 'cluster_centers_'):
        cluster_centers = clustering.cluster_centers_
    else:
        vectors = np.asarray(self.document_vectors)
        cluster_centers = np.array([vectors[cluster_labels == label].mean(axis=0)
                                    for label in np.unique(cluster_labels)])
clusters = {}
for document_id,cluster_label in enumerate(cluster_labels):
if cluster_label not in clusters:
clusters[cluster_label] = []
clusters[cluster_label].append(document_id)
return clusters,cluster_centers
def test_compute_centers(self, data_labels):
data, _ = data_labels
ac = cluster.AgglomerativeClustering()
fit = ac.fit(data)
result = compute_centers(fit, data)
assert result.shape == (data.shape[1], len(set(fit.labels_)))
def _get_htree(self, X=None, metric='cosine'):
km = self.km
method_name = type(km).__name__
if method_name == 'AgglomerativeClustering':
htree = {'n_leaves': km.n_leaves_,
'n_components': km.n_components_,
'children': km.children_.tolist()}
elif method_name in ['Birch', '_BirchDummy']\
and self._pars['n_clusters'] is None:
hmod = _BirchHierarchy(km, metric=metric)
hmod.fit(X)
htree = hmod.htree
else:
htree = {}
return htree
def ward_hc(self, n_clusters, n_neighbors=10):
"""
Perform Ward hierarchical clustering
Parameters
----------
n_clusters : int
number of clusters
n_neighbors : int
N nearest neighbors used for computing the connectivity matrix
"""
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
pars = {'n_neighbors': n_neighbors, 'is_hierarchical': True,
"metric": self.metric}
    if 'lsi' not in self.pipeline:
        raise ValueError("you must use lsi with ward clustering "
                         "for scaling reasons.")
# This is really not efficient as
# it's done a second time in _cluster_func
X = self.pipeline.data
connectivity = kneighbors_graph(X, n_neighbors=n_neighbors,
include_self=False)
km = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
connectivity=connectivity)
return self._cluster_func(n_clusters, km, pars)
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
# collect embeddings for mfi:
X = np.asarray([self.w2v_model[w] for w in self.mfi \
if w in self.w2v_model], dtype='float32')
# dimension reduction:
tsne = TSNE(n_components=2)
coor = tsne.fit_transform(X) # unsparsify
plt.clf()
sns.set_style('dark')
    plt.rcParams['axes.linewidth'] = 0.4
    fig, ax1 = plt.subplots()
labels = self.mfi
# first plot slices:
x1, x2 = coor[:,0], coor[:,1]
ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
# clustering on top (add some colouring):
clustering = AgglomerativeClustering(linkage='ward',
affinity='euclidean', n_clusters=nb_clusters)
clustering.fit(coor)
# add names:
for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
ax1.text(x, y, name, ha='center', va="center",
color=plt.cm.spectral(cluster_label / 10.),
fontdict={'family': 'Arial', 'size': 8})
# control aesthetics:
ax1.set_xlabel('')
ax1.set_ylabel('')
ax1.set_xticklabels([])
ax1.set_xticks([])
ax1.set_yticklabels([])
ax1.set_yticks([])
    plt.savefig(outputfile, bbox_inches=0)
def clustering(docs, n_clusters):  # cluster the documents into n_clusters groups
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(docs)  # KMeans fit
    labels = kmeans_model.labels_
    # hmodel = AgglomerativeClustering(n_clusters=n_clusters).fit(docs)  # hierarchical alternative
    # labels = hmodel.labels_
    score = metrics.silhouette_score(np.array(docs), labels, metric='euclidean')  # euclidean metric
    return labels, score
def make_ward_clustering(self, short_filenames, input_texts):
output_dir = self.output_dir + 'WARD/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if self.need_tf_idf:
self.signals.PrintInfo.emit("?????? TF-IDF...")
idf_filename = output_dir + 'tf_idf.csv'
msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
self.signals.PrintInfo.emit(msg)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(input_texts)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
predict_result = ward.fit_predict(X)
    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clusters_output = ''
    for cluster_index in range(max(predict_result) + 1):
        clusters_output += 'Cluster ' + str(cluster_index) + ':\n'
        for predict, document in zip(predict_result, short_filenames):
            if predict == cluster_index:
                clusters_output += '    ' + str(document) + '\n'
        clusters_output += '\n'
    self.signals.PrintInfo.emit(clusters_output)
    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clusters_output, output_dir + 'clusters.txt')
self.draw_clusters_plot(X, predict_result, short_filenames)
def ex4_agglomerative_clustering(X, y):
""" This does the same thing as ex2_kmeans but with an agglomerative
clustering and K=2.
"""
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()
    k = 2
    model = AgglomerativeClustering(k).fit(X, y)
    print("Silhouette score: %f" % metrics.silhouette_score(X, model.labels_))
# Ex 5
def Learning(X):
from sklearn.cluster import AgglomerativeClustering
learner = AgglomerativeClustering(n_clusters=3)
y = learner.fit_predict(X)
yield 'Agglomerative clusters(n=3)', y
#=================================================
def test_AgglomerativeClustering(*data):
'''
    test the AgglomerativeClustering method
:param data: data, target
:return: None
'''
X,labels_true=data
clst=cluster.AgglomerativeClustering()
predicted_labels=clst.fit_predict(X)
print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels)))
def test_AgglomerativeClustering_linkage(*data):
'''
test the performance with different linkages
:param data: data, target
:return: None
'''
X,labels_true=data
nums=range(1,50)
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
linkages=['ward','complete','average']
markers="+o*"
for i, linkage in enumerate(linkages):
ARIs=[]
for num in nums:
clst=cluster.AgglomerativeClustering(n_clusters=num,linkage=linkage)
predicted_labels=clst.fit_predict(X)
ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
ax.plot(nums,ARIs,marker=markers[i],label="linkage:{0}".format(linkage))
ax.set_xlabel("n_clusters")
ax.set_ylabel("ARI")
ax.legend(loc="best")
fig.suptitle("AgglomerativeClustering")
plt.show()
def test_connectivity_propagation():
# Check that connectivity in the ward tree is propagated correctly during
# merging.
X = np.array([(.014, .120), (.014, .099), (.014, .097),
(.017, .153), (.017, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .152), (.018, .149), (.018, .144)])
connectivity = kneighbors_graph(X, 10, include_self=False)
ward = AgglomerativeClustering(
n_clusters=4, connectivity=connectivity, linkage='ward')
# If changes are not propagated correctly, fit crashes with an
# IndexError
ward.fit(X)
def test_connectivity_fixing_non_lil():
# Check non regression of a bug if a non item assignable connectivity is
# provided with more than one component.
# create dummy data
x = np.array([[0, 0], [1, 1]])
# create a mask with several components to force connectivity fixing
m = np.array([[True, False], [False, True]])
c = grid_to_graph(n_x=2, n_y=2, mask=m)
w = AgglomerativeClustering(connectivity=c, linkage='ward')
assert_warns(UserWarning, w.fit, x)
def test_connectivity_callable():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(
connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_agg_n_clusters():
# Test that an error is raised when n_clusters <= 0
rng = np.random.RandomState(0)
X = rng.rand(20, 10)
for n_clus in [-1, 0]:
agc = AgglomerativeClustering(n_clusters=n_clus)
msg = ("n_clusters should be an integer greater than 0."
" %s was provided." % str(agc.n_clusters))
assert_raise_message(ValueError, msg, agc.fit, X)
def makeWard(X, k=2):
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
return cluster.AgglomerativeClustering(n_clusters=k,
linkage='ward', connectivity=connectivity)
def makeAvgLinkage(X=None, k=2):
connectivity = kneighbors_graph(X, n_neighbors=10)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
return cluster.AgglomerativeClustering(linkage="average",
affinity="cityblock", n_clusters=k,
connectivity=connectivity)
def makeMaxLinkage(X=None, k=2):
connectivity = kneighbors_graph(X, n_neighbors=10)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
return cluster.AgglomerativeClustering(linkage="complete",
affinity="cityblock", n_clusters=k,
connectivity=connectivity)
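The three factories share the same connectivity setup and differ only in linkage and metric; a toy comparison sketch (random data; assumes numpy plus the cluster and kneighbors_graph imports the factories already rely on):

import numpy as np

X = np.random.RandomState(0).rand(50, 3)
for make in (makeWard, makeAvgLinkage, makeMaxLinkage):
    model = make(X, k=2)
    labels = model.fit_predict(X)
    print(make.__name__, np.bincount(labels))  # cluster sizes per method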