def findEps(ssearch):
    """
    Report the nearest-neighbor distance distribution to help pick epsilon.

    For every entry of ``ssearch.index.index`` a brute-force cosine
    nearest-neighbor search is run against the same data, and the distance to
    the second-nearest result is kept (the nearest result is expected to be
    the point itself). Those distances are histogrammed into 16 bins, and for
    each bin the cumulative percentage of points whose distance falls below
    the bin's upper edge is printed.

    The function only prints its findings; it does not return a value. The
    histogram is drawn with matplotlib but is not displayed by this function.

    :param ssearch: Object exposing the vectors to analyse as
        ``ssearch.index.index``; that value is handed unchanged to
        ``NearestNeighbors.fit`` and ``NearestNeighbors.kneighbors``.
    """
    ###########################################################################
    # Calculate nearest neighbors
    ###########################################################################
    vectors = ssearch.index.index

    # Create a nearest neighbors model--we need 2 nearest neighbors since the
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute',
                                  metric='cosine').fit(vectors)

    # Only the neighbor query is timed, not the model construction.
    t0 = time.time()
    distances, _ = nbrs_model.kneighbors(vectors)
    elapsed = time.time() - t0

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Took %.2f seconds' % elapsed)

    # Column 0 is the query point itself; column 1 is its closest neighbor.
    distances = [d[1] for d in distances]

    ###########################################################################
    # Histogram the nearest neighbor distances.
    ###########################################################################
    import matplotlib.pyplot as plt

    counts, bins, patches = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")

    print('\n%d bins:' % len(counts))

    # One distance was collected per query row, so this is the number of
    # points; unlike len() on the raw index it also works for containers that
    # do not define len() (e.g. sparse matrices).
    num_points = len(distances)

    countAcc = 0
    # bins holds len(counts) + 1 edges, so bins[1:] lines up each count with
    # the upper edge of its bin.
    for count, upper_edge in zip(counts, bins[1:]):
        countAcc += count
        # Calculate the percentage of values which fall below the upper limit
        # of this bin.
        prcnt = float(countAcc) / float(num_points) * 100.0
        print(' %.2f%% < %.2f' % (prcnt, upper_edge))
评论列表
文章目录