def _cluster_documents(self):
    """Cluster the document vectors and group document indices by cluster.

    Reads ``self.params['cluster_method']`` and
    ``self.params['cluster_n_clusters']`` and fits the chosen estimator on
    ``self.document_vectors``. ``'kmeans'`` selects k-means; any other
    method value selects complete-linkage agglomerative clustering with
    cosine distance.

    Returns:
        tuple: ``(clusters, cluster_centers)``. ``clusters`` is a dict
        mapping each cluster label to the list of document indices
        (positions in ``self.document_vectors``) assigned to it.
        ``cluster_centers`` is the estimator's ``cluster_centers_`` when it
        provides one (k-means); otherwise it is an array holding the mean
        vector of each cluster's members, one row per label in ascending
        label order.
    """
    method = self.params['cluster_method']
    # An estimator cannot be asked for more clusters than there are samples.
    n_clusters = min(
        int(self.params['cluster_n_clusters']),
        len(self.document_vectors),
    )

    if method == 'kmeans':
        clusterer = KMeans(
            n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1
        )
    else:
        try:
            # scikit-learn >= 1.2 names the distance parameter ``metric``;
            # ``affinity`` was removed in 1.4.
            clusterer = AgglomerativeClustering(
                n_clusters=n_clusters, linkage='complete', metric='cosine'
            )
        except TypeError:
            # Older scikit-learn releases only know ``affinity``.
            clusterer = AgglomerativeClustering(
                n_clusters=n_clusters, linkage='complete', affinity='cosine'
            )

    clustering = clusterer.fit(self.document_vectors)

    clusters = {}
    for document_id, cluster_label in enumerate(clustering.labels_):
        clusters.setdefault(cluster_label, []).append(document_id)

    # AgglomerativeClustering does not expose ``cluster_centers_``; the
    # original dictionary lookup raised KeyError for that method.
    cluster_centers = getattr(clustering, 'cluster_centers_', None)
    if cluster_centers is None:
        import numpy as np

        vectors = np.asarray(self.document_vectors)
        cluster_centers = np.array(
            [vectors[clusters[label]].mean(axis=0) for label in sorted(clusters)]
        )
    return clusters, cluster_centers
评论列表
文章目录