runDBSCAN.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:simsearch 作者: chrisjmccormick 项目源码 文件源码
def findEps(ssearch):
    """
    Print nearest-neighbor distance statistics to help choose an epsilon.

    For every row of `ssearch.index.index`, the cosine distance to its
    closest other row is computed. These distances are histogrammed into 16
    bins, and for each bin the cumulative percentage of points whose
    nearest-neighbor distance falls below the bin's upper edge is printed.
    The histogram is drawn onto the current matplotlib figure but is not
    shown or saved here.

    Parameters
    ----------
    ssearch : object
        Must expose `ssearch.index.index`, which is handed to scikit-learn's
        `NearestNeighbors` and must support `len()`.
        NOTE(review): presumably a (num_points x num_features) matrix of
        document vectors -- confirm against the caller.

    Returns
    -------
    None
        Results are reported on stdout only.
    """
    ###########################################################################
    # Calculate nearest neighbors
    ###########################################################################

    # Create a nearest neighbors model--we need 2 nearest neighbors since the
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(ssearch.index.index)

    t0 = time.time()

    # Find nearest neighbors. The neighbor indices are not needed here.
    distances, _ = nbrs_model.kneighbors(ssearch.index.index)

    elapsed = time.time() - t0

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Took %.2f seconds' % elapsed)

    # Column 0 is the query point itself (see above); column 1 holds the
    # distance to the closest *other* point.
    distances = distances[:, 1]

    ###########################################################################
    # Histogram the nearest neighbor distances.
    ###########################################################################

    # Imported lazily so matplotlib is only required when this function runs.
    import matplotlib.pyplot as plt

    counts, bins, _ = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")

    print('\n%d bins:' % len(counts))

    num_points = len(ssearch.index.index)
    count_acc = 0

    # `bins` holds len(counts) + 1 edges; bins[1:] are the upper edges, so
    # each count is paired with the upper limit of its own bin.
    for count, upper_edge in zip(counts, bins[1:]):
        count_acc += count

        # Calculate the percentage of values which fall below the upper limit
        # of this bin.
        prcnt = float(count_acc) / float(num_points) * 100.0

        print('  %.2f%% < %.2f' % (prcnt, upper_edge))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号