def sem_clust(self, w2p, simsdict):
    ''' Baseline SEMCLUST method (dynamic thresholding), based on:

    Marianna Apidianaki, Emilia Verzeni, and Diana McCarthy. Semantic
    Clustering of Pivot Paraphrases. In LREC 2014.

    Builds a graph where nodes are words, and edges connect words that
    have a connection in <w2p>. Weights edges by the values given in
    <simsdict>. The graph itself is built by ``sem_clust.toGraph``; every
    connected component of that graph is recorded as one sense cluster.

    Words from ``self.pp_dict`` that are missing from either <w2p> or
    <simsdict> are reported on stderr and left out of the clustering.

    :param w2p: word -> {paraphrase: score} dictionary, used to decide
        which nodes to connect with edges
    :param simsdict: word -> {paraphrase: score} OR word -> vector, used
        for edge weights. The representation is detected from the first
        value in the dictionary, so all values are assumed to share it.
    :return: None. Results are stored through ``self.add_sense_cluster``
        after ``self.reset_sense_clustering()`` has been called. No
        cluster is added when every word is out of vocabulary.
    '''
    self.reset_sense_clustering()

    # Split the paraphrase set into usable and out-of-vocabulary words in a
    # single pass. Building real lists (instead of keeping the keys() view
    # or round-tripping through a set) keeps the word order stable and
    # makes the result indexable on Python 3 as well as Python 2.
    wordlist = []
    oov = []
    for w in self.pp_dict.keys():
        if w in w2p and w in simsdict:
            wordlist.append(w)
        else:
            oov.append(w)
    if oov:
        sys.stderr.write('WARNING: Paraphrases %s are OOV. '
                         'Removing from ppset.\n' % str(oov))

    # Nothing left to cluster once the OOV words are gone.
    if not wordlist:
        return

    # A single word is trivially its own cluster; no graph is needed.
    if len(wordlist) == 1:
        self.add_sense_cluster([wordlist[0]])
        return

    # Using cosine similarity of word-paraphrase vectors:
    # pick the distance helper matching the representation in simsdict.
    # next(iter(...)) replaces the Python-2-only ``.values()[0]``; simsdict
    # cannot be empty here because every word in wordlist is one of its keys.
    sample = next(iter(simsdict.values()))
    if isinstance(sample, dict):
        # Sparse {paraphrase: score} representation.
        distance = dict_cosine_dist
    else:
        # Dense vector representation.
        distance = cosine

    # Full pairwise similarity matrix (1 - distance), with rows and columns
    # in wordlist order.
    similarities = np.array([[1 - distance(simsdict[i], simsdict[j])
                              for j in wordlist] for i in wordlist])

    # ``sem_clust`` here is the module-level name, not this method.
    gr = sem_clust.toGraph(similarities, wordlist, self.target_word, w2p)
    for c in nx.connected_components(gr):
        self.add_sense_cluster(c)
# Web-page navigation residue ("comment list" / "article table of contents"); not part of the code.