def dbscan_partition(iterable, params, sample_weight=None):
    """
    Perform a DBSCAN on a given partition of the data.

    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector)
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters
    :param sample_weight: optional container of per-sample weights,
        indexed by the first component of each vector; when None the
        clustering is run unweighted
    :rtype: iter
    :return: (key, label) pairs where label is the string
        'partition:cluster_id', suffixed with '*' for non-core samples
    """
    # read iterable into local memory
    data = list(iterable)
    if not data:
        # nothing to cluster; indexing data[0] below would raise IndexError
        return

    # the partition id is read from the first record only
    part = data[0][0][1]
    x = np.array([v for (_, __), v in data])
    y = np.array([k for (k, _), __ in data])

    weights = None
    if sample_weight is not None:
        # NOTE(review): the lookup uses the first component of each vector,
        # not the record key -- confirm this is the intended index.
        weights = [sample_weight[v[0]] for v in x]

    # perform DBSCAN
    model = skc.DBSCAN(**params)
    labels = model.fit_predict(x, sample_weight=weights)
    cores = set(model.core_sample_indices_)

    # yield (key, cluster_id), non-core samples labeled with *
    for i, label in enumerate(labels):
        flag = '' if i in cores else '*'
        yield (y[i], '%i:%i%s' % (part, label, flag))
评论列表
文章目录