def cluster(self, data_set, threshold, verbose=True):
    """Group samples in a single pass by similarity to existing groups.

    Samples are visited in order. For each sample, a score is computed
    against every group built so far; the score for a group is the
    *lowest* ``Util.simscore`` between the sample and any member of that
    group. The sample joins the highest-scoring group, unless there are
    no groups yet or that best score is below ``threshold``, in which
    case it starts a new group.

    Args:
        data_set: Iterable of samples. Each sample is indexed with the
            keys ``"id"`` and ``"encoded"``.
        threshold: Minimum best-group score required to join an existing
            group. Presumably a higher ``Util.simscore`` means "more
            similar" -- confirm against ``Util``.
        verbose: When true, print a progress line per sample and a line
            whenever a sample is attached to an existing group.

    Returns:
        dict mapping group id (integers starting at 1, in creation
        order) to the list of ``"id"`` values of the samples in that
        group, in insertion order.
    """
    grp_map = {}
    grp_id = 0
    for sample in data_set:
        if verbose:
            # Single pre-formatted argument keeps the output identical
            # under both the Python 2 print statement and Python 3 print().
            print("[+] Processing Sample: %s" % (sample["id"],))

        # Score against a group = similarity to its least similar member.
        scores = {}
        for prev_grp_id, prev_grp_data in grp_map.items():
            scores[prev_grp_id] = min(
                Util.simscore(sample["encoded"], prev["encoded"])
                for prev in prev_grp_data
            )

        # First group reaching the maximum score wins ties.
        best_grp_id = max(scores, key=scores.get) if scores else None

        if best_grp_id is None or scores[best_grp_id] < threshold:
            # Nothing is similar enough: open a new group for this sample.
            grp_id += 1
            cur_grp_id = grp_id
            grp_map[cur_grp_id] = []
        else:
            cur_grp_id = best_grp_id
            if verbose:
                print("[+] Found Closet Cluster: %s" % (cur_grp_id,))
        grp_map[cur_grp_id].append(sample)

    # Callers only need the sample ids, not the full sample records.
    return {
        gid: [member["id"] for member in members]
        for gid, members in grp_map.items()
    }
评论列表
文章目录