def dbscan_partition(iterable, params):
    """
    Perform a DBSCAN on a single partition of the data.

    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector). The
        partition id is taken from the first record only, so every record
        is assumed to belong to the same partition.
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters, passed
        through as keyword arguments
    :rtype: iter
    :return: generator of (key, label) pairs, where label is the string
        'partition:cluster_id', with a trailing '*' appended for samples
        that are not core samples. Nothing is yielded for an empty
        partition.
    """
    # read iterable into local memory (DBSCAN needs the full data set)
    data = list(iterable)
    if not data:
        # An empty partition has nothing to cluster; bail out before
        # data[0] below can raise IndexError.
        return
    # only the partition id of the first record is needed for labelling
    (_, part), _ = data[0]
    x = np.array([v for _, v in data])
    y = np.array([k for (k, _), _ in data])
    # perform DBSCAN
    model = skc.DBSCAN(**params)
    c = model.fit_predict(x)
    # set gives O(1) membership checks in the loop below
    cores = set(model.core_sample_indices_)
    # yield (key, label), non-core samples labeled with *
    for i, (k, cluster_id) in enumerate(zip(y, c)):
        flag = '' if i in cores else '*'
        yield (k, '%i:%i%s' % (part, cluster_id, flag))
# Comment list
# Table of contents