def clusterMalwareNames(malwareNames):
    """Cluster malware names lexically and count the members of each cluster.

    A symmetric pairwise matrix is filled with ``computeSimilarity`` and
    passed to scikit-learn's DBSCAN as a precomputed metric
    (``eps=0.1``, ``min_samples=5``). Samples with a negative label
    (DBSCAN's marker for noise) are left out of the result.

    Args:
        malwareNames: Indexable, sized sequence of malware names.

    Returns:
        A dict with one entry per cluster. The key is
        ``", ".join(uniqueList(members))`` and the value is the number of
        names assigned to that cluster (duplicates included). If two
        clusters yield the same key, the later one overwrites the earlier.
        An empty input yields an empty dict.
    """
    n = len(malwareNames)
    # Nothing to cluster; DBSCAN.fit rejects an input with zero samples.
    # len() is used instead of truthiness so array-like inputs also work.
    if n == 0:
        return {}

    # NOTE(review): metric="precomputed" makes DBSCAN treat this matrix as
    # pairwise *distances*, while the helper is named computeSimilarity --
    # confirm it returns a distance (0 == identical).
    matrix = np.zeros((n, n))
    for i in range(n):
        # Only the upper triangle (diagonal included) is computed and then
        # mirrored, so each pair is evaluated exactly once. This relies on
        # computeSimilarity being symmetric, which the mirroring already
        # presumes.
        for j in range(i, n):
            value = computeSimilarity(malwareNames[i], malwareNames[j])
            matrix[i, j] = value
            matrix[j, i] = value

    # Scikit-Learn's DBSCAN implementation to cluster the malware names.
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)
    preds = clust.labels_

    # Build the word-count map, one entry per non-noise cluster label.
    wordCount = {}
    for label in np.unique(preds):
        if label < 0:
            continue
        cmem_ids = np.where(preds == label)[0]
        cmembers = [malwareNames[cmem_id] for cmem_id in cmem_ids]
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
评论列表
文章目录