def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.

    Fits DBSCAN (cosine metric, brute-force neighbor search) on
    ``ssearch.index.index``, then prints the fit time, the number of
    clusters found, and one summary line per label with its document
    count and top words.

    Args:
        ssearch: Search object. This function reads ``ssearch.index.index``
            (the matrix handed to ``DBSCAN.fit``) and calls
            ``ssearch.getTopWordsInCluster(doc_ids)``, whose result is
            joined with spaces -- presumably an iterable of strings.
        eps: DBSCAN neighborhood radius.
        min_samples: DBSCAN ``min_samples`` (MinPts) value.

    Returns:
        None. All results are written to stdout.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))

    # The metric must be cosine; brute-force search is requested explicitly
    # alongside it.
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')

    # Time only the fit itself.
    t0 = time.time()
    # Cluster the LSI vectors.
    db.fit(ssearch.index.index)
    elapsed = time.time() - t0
    print(" done in %.3fsec" % elapsed)

    # Group document IDs by label in a single pass over the labels rather
    # than rescanning every label once per cluster. Document IDs stay in
    # ascending order within each group.
    docs_by_cluster = {}
    for doc_id, label in enumerate(db.labels_):
        docs_by_cluster.setdefault(label, []).append(doc_id)

    # DBSCAN marks noise points with the label -1. Only discount it when it
    # is actually present; otherwise the cluster count would be off by one
    # whenever every document lands in a cluster.
    noise_labels = 1 if -1 in docs_by_cluster else 0
    print('Number of clusters (excluding "noise"): %d' % (len(docs_by_cluster) - noise_labels))

    # Report labels in sorted order (noise first, if any) so the output is
    # deterministic from run to run.
    for cluster_id in sorted(docs_by_cluster):
        cluster_doc_ids = docs_by_cluster[cluster_id]
        # Get the top words in this cluster.
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)
        print(' Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
评论列表
文章目录