def __init__(self, filename):
    """Load a JSON document collection and build/restore the derived models.

    Each heavyweight artifact (dictionary, corpus, similarity index, Doc2Vec
    model) is cached on disk next to the working directory and reloaded on
    subsequent runs instead of being rebuilt.

    :param filename: path to a JSON file containing the document list
                     (each doc presumably has "id" and "summary" keys —
                     TODO confirm against the producer of this file).
    """
    # Parse the raw document list and index it by document id.
    with open(filename, "r") as fh:  # context manager: no leaked handle
        self.docs = loads(fh.read())
    self.docmap = hoist_dict(self.docs, "id")

    # Bag-of-words dictionary over the document summaries, cached on disk.
    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    # Matrix-Market corpus of BoW vectors; serialize once, then mmap-load.
    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        bow_stream = (self.dictionary.doc2bow(text)
                      for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", bow_stream)
        # Re-open through MmCorpus so self.corpus is the same type either way.
        self.corpus = MmCorpus("data.mm")

    # LSI topic model (rebuilt every run — only the similarity index is cached).
    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    # Similarity index over the LSI-projected corpus, cached on disk.
    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # Sentiment: logistic regression over pretrained IMDB Doc2Vec vectors.
    # NOTE(review): positive examples are labeled 0 and negative 1 — verify
    # that downstream consumers interpret the classes this way.
    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    pos_vecs = [self.sentiment_model.docvecs["TEST_POS_" + str(i)]
                for i in range(12500)]
    neg_vecs = [self.sentiment_model.docvecs["TEST_NEG_" + str(i)]
                for i in range(12500)]
    self.sentiment.fit(pos_vecs + neg_vecs,
                       asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    # Doc2Vec model over this corpus's summaries, trained once and cached.
    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")])
                  for doc in self.docs]
        # `size=` / `.iter` are the pre-4.0 gensim spellings — TODO confirm
        # the installed gensim version before modernizing these names.
        doc_model = Doc2Vec(min_count=1, window=10, size=100,
                            sample=1e-4, negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)  # one up-front shuffle of the training order
        for epoch in range(10):
            doc_model.train(tagged,
                            total_examples=doc_model.corpus_count,
                            epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")