runDBSCAN.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:simsearch 作者: chrisjmccormick 项目源码 文件源码
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.

    Fits DBSCAN (cosine metric, brute-force neighbour search) on
    ``ssearch.index.index``, then prints the fit time, the number of
    clusters found, and a one-line summary for every label.

    Args:
        ssearch: Object exposing ``index.index`` (the vectors to cluster;
            presumably the LSI document vectors -- confirm against the
            caller) and ``getTopWordsInCluster(doc_ids)``, whose result is
            joined with spaces for display.
        eps: Neighbourhood radius passed to DBSCAN; printed with ``%0.2f``.
        min_samples: MinPts value passed to DBSCAN; printed with ``%d``.

    Returns:
        None. All results are reported via ``print``.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))

    # The cosine metric has to be requested explicitly (it was originally
    # omitted by mistake).
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')

    # Time only the fit step.
    t0 = time.time()
    db.fit(ssearch.index.index)
    elapsed = (time.time() - t0)
    print("  done in %.3fsec" % elapsed)

    # Group the document IDs by cluster label in a single pass over the
    # labels instead of rescanning every label once per cluster.
    docs_by_cluster = {}
    for doc_id, label in enumerate(db.labels_):
        docs_by_cluster.setdefault(label, []).append(doc_id)

    # Noise points carry the label -1 (assuming scikit-learn's DBSCAN
    # convention). Only discount that label when it actually occurs;
    # otherwise a run without noise would be reported one cluster short.
    num_clusters = len(docs_by_cluster) - (1 if -1 in docs_by_cluster else 0)
    print('Number of clusters (excluding "noise"): %d' % num_clusters)

    # Report labels in sorted order so the output is deterministic; the
    # noise group (-1), when present, is listed first.
    for cluster_id in sorted(docs_by_cluster):
        cluster_doc_ids = docs_by_cluster[cluster_id]

        # Get the top words in this cluster.
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)

        print('  Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号