def test_lee(self):
    """Check that LSI similarities correlate with human data at > 0.6.

    (0.6 is the value which was achieved in the original paper.)

    The module-level ``bg_corpus`` and ``corpus`` are only read here, never
    rebound, so the tokenized corpora stay intact after the test has run.
    """
    # create a dictionary and bag-of-words corpora; these live in locals so
    # the shared module-level corpora are not overwritten
    dictionary = corpora.Dictionary(bg_corpus)
    bg_bow = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus_bow = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_bow)
    bg_corpus_ent = log_ent[bg_bow]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)

    # transform small corpus to lsi bow->log_ent->fold-in-lsi; materialize it
    # once so the pairwise loop reuses the same vectors instead of
    # re-iterating the transformed corpus for every row
    corpus_lsi = list(lsi[log_ent[corpus_bow]])

    # compute pairwise similarities; only the strict upper triangle is read
    # below, so only those cells are filled
    n_docs = len(corpus_bow)
    res = np.zeros((n_docs, n_docs))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi[i + 1:], i + 1):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(n_docs, 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s", cor)
    # assertGreater reports the actual coefficient when the check fails
    self.assertGreater(cor, 0.6)
# def test_lee_mallet(self):
# global bg_corpus, corpus, bg_corpus2, corpus2
# # create a dictionary and corpus (bag of words)
# dictionary = corpora.Dictionary(bg_corpus2)
# bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
# corpus = [dictionary.doc2bow(text) for text in corpus2]
# # initialize an LDA transformation from background corpus
# lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
# corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
# corpus_lda = lda[corpus]
# # compute pairwise similarity matrix and extract upper triangular
# res = np.zeros((len(corpus), len(corpus)))
# for i, par1 in enumerate(corpus_lda):
# for j, par2 in enumerate(corpus_lda):
# res[i, j] = matutils.cossim(par1, par2)
# flat = res[matutils.triu_indices(len(corpus), 1)]
# cor = np.corrcoef(flat, human_sim_vector)[0, 1]
# logging.info("LDA correlation coefficient is %s" % cor)
# self.assertTrue(cor > 0.35)
# (web-page residue, not part of the test module: "Comment list" / "Table of contents")