def adhoc_clustering(messages, dist_func=combined, bias=600, worth_weight=100):
    """Cluster messages in one pass using a nearest-predecessor heuristic.

    Each message is compared against every earlier message with
    ``dist_func``. If the smallest distance exceeds the message's
    threshold, ``bias - worth_weight * worth(message)``, the message
    starts a new cluster; otherwise it joins the cluster of its nearest
    predecessor. With a positive ``worth_weight``, a larger ``worth``
    lowers the threshold and makes a new cluster more likely.

    Args:
        messages: Iterable of messages. Each item is either a dict with
            a ``'text'`` key or a raw value that is used as the text.
            Dict items are updated in place with the extracted features.
            Non-dict items are wrapped in a new dict and the caller's
            container is left untouched.
        dist_func: Callable taking two message dicts and returning a
            distance that can be compared with ``<``.
        bias: Base threshold for opening a new cluster.
        worth_weight: Multiplier applied to ``worth(message)`` before it
            is subtracted from ``bias``.

    Returns:
        A list of integer cluster labels, one per message, in input
        order. Labels start at 0 and increase by one per new cluster.
    """
    # Extract message features. The prepared dicts are collected in a
    # separate list so that raw-text messages, once wrapped, are the
    # objects actually handed to dist_func and worth below.
    prepared = []
    for message in messages:
        if not isinstance(message, dict):
            message = {'text': message}
        message.update(extract_all(parse_body(message['text'])))
        prepared.append(message)

    # Run clustering (ad hoc).
    labels = []
    next_label = 0
    for mi, message in enumerate(prepared):
        nearest = -1
        min_dist = float('inf')
        # Scan predecessors from most recent to oldest; the strict '<'
        # means the most recent one wins when distances tie.
        for mj in range(mi - 1, -1, -1):
            dist = dist_func(message, prepared[mj])
            if dist < min_dist:
                min_dist = dist
                nearest = mj
        threshold = bias - worth_weight * worth(message)
        # 'nearest < 0' covers the first message and any case where no
        # predecessor produced a usable distance; without this guard the
        # else-branch would index labels[-1].
        if nearest < 0 or threshold < min_dist:
            # Create a new cluster.
            labels.append(next_label)
            next_label += 1
        else:
            # Assign to the nearest predecessor's cluster.
            labels.append(labels[nearest])
    return labels
评论列表
文章目录