def cluster_keyterms(keyterms, word2vec_model):
    '''
    Cluster keyterms with Affinity Propagation over embedding similarities.

    The keyterms are first reduced by ``filter_keyterms_byVocab`` to those that
    can be used with the embedding model. A square similarity matrix is then
    built with ``word2vec_model.n_similarity`` and clustered with
    ``sklearn.cluster.AffinityPropagation`` (precomputed affinity, damping 0.5,
    preference set to the median of the similarity matrix for every sample).

    :param keyterms: list of keyterms in dictionary format. They contain the
        following details: lemma_string, pos, len, cvalue, words, tf,
        lemma_list
    :param word2vec_model: embedding model; it must provide a ``vocab``
        attribute and an ``n_similarity(a, b)`` method
    :return: list of dictionaries, one per cluster, with the keys
        ``idx`` (cluster number), ``center`` (the filtered keyterm chosen as
        exemplar), ``members`` (filtered keyterms assigned to the cluster) and
        ``len`` (number of members). An empty list is returned when no keyterm
        passes the vocabulary filter.
    '''
    from sklearn import cluster

    # filter keyterms to work with the embedding model
    # NOTE(review): the filtered items are passed straight to n_similarity, so
    # they are presumably word lists rather than the input dicts -- confirm
    # against filter_keyterms_byVocab.
    filtered_keyterms = filter_keyterms_byVocab(keyterms, word2vec_model.vocab)

    # Nothing to cluster: the median of an empty matrix is NaN and fitting an
    # empty matrix fails, so stop here.
    if len(filtered_keyterms) == 0:
        return []

    # Pairwise similarity matrix; row i holds the similarity of keyterm i to
    # every filtered keyterm (including itself).
    similarity = [
        [word2vec_model.n_similarity(kt1, kt2) for kt2 in filtered_keyterms]
        for kt1 in filtered_keyterms
    ]

    # preference = [np.amin(similarity)] * len(filtered_keyterms)
    preference = [np.median(similarity)] * len(filtered_keyterms)

    print("Start Affinity Propagation ...")
    af = cluster.AffinityPropagation(affinity="precomputed", damping=0.5,
                                     preference=preference)
    af.fit(similarity)
    print("Finished affinity propagation")

    center_indices = af.cluster_centers_indices_
    n_clusters = len(center_indices)

    # Group the keyterms by label in a single pass. A label outside
    # 0..n_clusters-1 names no cluster and is skipped; the explicit range check
    # also keeps a negative label from indexing the list from the end.
    members_by_cluster = [[] for _ in range(n_clusters)]
    for keyterm, label in zip(filtered_keyterms, af.labels_):
        if 0 <= label < n_clusters:
            members_by_cluster[label].append(keyterm)

    clusters = []
    for idx, members in enumerate(members_by_cluster):
        clusters.append({
            "idx": idx,
            "center": filtered_keyterms[center_indices[idx]],
            "members": members,
            "len": len(members),
        })
    return clusters
# if __name__ == "__main__":
# process_keyterm_clusters(GENERATION_NT_CANDIDATES)
keyterm_clustering.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录