def run_model(name):
    if name == 'lsi':
        lsi = models.LsiModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics)
        print('Saving lsi_model...')
        lsi.save('exports/lsi.model')
        print('lsi_model saved!')
        # lsi_matrix = gensim.matutils.corpus2dense(lsi[corpus_gensim], len(lsi.projection.s)).T / lsi.projection.s
        # print('Saving lsi_matrix...')
        # pickle.dump(lsi_matrix, open('exports/lsi_matrix.p', 'wb'))
        # print('lsi_matrix saved!')
    elif name == 'lda':
        # lda = models.LdaModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
        lda = models.ldamulticore.LdaMulticore(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)  # alpha='auto' omitted: it requires the non-multicore LdaModel
        print('Saving lda_model...')
        lda.save('exports/lda.model')
        print('lda_model saved!')
        # lda_matrix = gensim.matutils.corpus2dense(lda[corpus_gensim], lda.num_topics)
        # print('Saving lda_matrix...')
        # pickle.dump(lda_matrix, open('exports/lda_matrix.p', 'wb'))
        # print('lda_matrix saved!')
    gc.collect()
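A minimal, hypothetical driver for run_model(): the function assumes module-level corpus_gensim, vocab_gensim and num_topics, gensim's models plus gc imported, and an existing exports/ directory. The toy documents below are illustrative only.

# Hypothetical setup for run_model() (not part of the original project).
import gc
from gensim import corpora, models

docs = [['topic', 'model', 'example'], ['another', 'small', 'document']]
vocab_gensim = corpora.Dictionary(docs)                      # id2word mapping
corpus_gensim = [vocab_gensim.doc2bow(doc) for doc in docs]  # bag-of-words corpus
num_topics = 2

run_model('lda')   # writes exports/lda.model (exports/ must already exist)
run_model('lsi')   # writes exports/lsi.model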
Example source code for the Python class LdaModel()
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
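A hedged usage sketch: normalize_corpus() is a helper from the surrounding project (not shown here), assumed to return a list of token lists when tokenize=True.

# Usage sketch (assumes normalize_corpus from the surrounding project).
toy_docs = ["the cat sat on the mat", "dogs and cats are common pets"]
lda = train_lda_model_gensim(toy_docs, total_topics=2)
for topic_id in range(lda.num_topics):
    print(lda.print_topic(topic_id, topn=5))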
def train_model(model_name, corpus, id2word, num_topics):
    """
    Train the specified model.
    """
    # LDA
    if model_name == 'lda':
        model = models.LdaModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
            alpha='auto',
            eval_every=5,
        )
        return model
    # LSI
    elif model_name == 'lsi':
        model = models.LsiModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
        )
        return model
    else:
        print('Invalid model name')
        return None
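A short sketch of calling train_model() for both backends, using the usual gensim dictionary and bag-of-words preparation; the toy texts are illustrative.

# Sketch: build a tiny corpus and dispatch to either model.
from gensim import corpora

texts = [['human', 'interface', 'computer'], ['graph', 'trees', 'minors']]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

lda = train_model('lda', corpus, id2word, num_topics=2)
lsi = train_model('lsi', corpus, id2word, num_topics=2)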
def run_lda(self):
    self.lda = models.LdaModel(corpus=self.corpus, id2word=self.dict, num_topics=self.nt, iterations=500000)
def build_lda(self, nt, corpus, dictionary, bow_matrix):
    ## Description: Builds LDA and does document similarity
    ## Params: Number of topics, corpus, dict, BOW matrix
    ## Returns: Similarity index and matrix
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=nt)
    self.lda_model = lda_model
    index = similarities.MatrixSimilarity(lda_model[corpus])
    matrix = bow_matrix.apply(lambda x: [lda_model[x[0]]], 1)
    return (index, matrix)
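The same LDA-plus-MatrixSimilarity pattern as build_lda(), sketched without the class wrapper and the pandas bow_matrix so it can run on its own.

# Standalone sketch of the LDA + MatrixSimilarity pattern used in build_lda().
from gensim import corpora, models, similarities

texts = [['user', 'interface', 'system'], ['graph', 'minors', 'survey']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lda_model[corpus])

query = lda_model[dictionary.doc2bow(['graph', 'survey'])]
print(list(index[query]))  # similarity of the query to every indexed document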
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)
    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="
    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
def __init__(self,
             num_topics=NUM_TOPICS,
             dictionary_file=DICTIONARY_FILE,
             model_file=MODEL_FILE):
    """Initializes the ranker.

    Args:
        num_topics: int (default: NUM_TOPICS), number of topics to use.
        dictionary_file: str, where to save / load the dictionary file
            (defaults to DICTIONARY_FILE).
        model_file: str, where to save / load the model (defaults to
            MODEL_FILE).
    """
    self.dictionary = None
    self.model = None
    self.num_topics = num_topics
    self.dictionary_file = dictionary_file
    self.model_file = model_file

    # Loads stopwords from the associated file.
    with open(STOPWORDS_FILE, 'r') as f:
        self.stoplist = set(f.read().strip().split())

    # Loads an existing dictionary file, if one exists.
    if os.path.exists(self.dictionary_file):
        with open(self.dictionary_file, 'rb') as f:
            self.dictionary = pickle.load(f)

    # Loads an existing model file, if one exists.
    if os.path.exists(self.model_file):
        self.model = models.LdaModel.load(self.model_file)
    else:
        logging.warn('No model found in "%s"', self.model_file)

    # Determines if the model needs to be trained.
    self._trained = self.dictionary and self.model
def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Args:
        corpus: list of str, the documents to tokenize.
        passes: int (default: 1), number of training passes over the corpus.
    """
    if self.dictionary is not None or self.model is not None:
        x = raw_input('You are about to overwrite an existing '
                      'model file (%s). Are you sure? [y/N] '
                      % self.model_file)
        if not x or x[0] != 'y':
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')

    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]

    # Builds a dictionary from the existing documents.
    self.dictionary = corpora.Dictionary(documents)

    # Dumps the dictionary to a pickled file to use later.
    pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

    # Converts the documents to bag-of-words vectors.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow,
                                 passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)

    # Saves the model to use later.
    self.model.save(self.model_file)

    # Flag to remember that training has taken place.
    self._trained = True
def train(self, **kargs):
    self.config.update(kargs)
    self.model = _LDA(self.database.corpus, id2word=self.database.dictionary, **self.config)
    delattr(self, "database")
def score(self, entities: list, context: str) -> list:
    queries = [
        (i, q['context']) for i, q in enumerate(entities) if q['context']
    ]
    context = tokenize(context)
    dictionary = Dictionary([context])
    vectors = [
        dictionary.doc2bow(
            tokenize(q)
        ) for _, q in queries
    ]
    model = LdaModel(id2word=dictionary, **self.model_kwargs)
    ents = (
        entities[i] for i, _ in queries
    )
    scores = (
        model[vec][-1][1] for vec in vectors if model[vec]
    )
    results = zip(ents, scores)

    def sort_by_score(item):
        return item[1]

    return sorted(results, key=sort_by_score, reverse=True)
def trainModel(self):
    '''
    Train an LDA model in 4 steps:
    1. Parse the whole corpora into unigram token collections and a document mapping (for later use)
    2. Filter tokens which are too rare (no_below_this_number) or too common (no_above_fraction_of_doc)
    3. Index the token collections and apply the TF-IDF transformation
    4. Call gensim.models.LdaModel and generate topic distributions for the corpora
    '''
    print 'Start preparing unigram tokens....'

    ## Start of preparing the list of documents and tokens [[words_in_1st_doc], [words_in_2nd_doc], ...], which comprise the Bag-Of-Words (BOW)
    # Get document_count, tokens, and document-index mapping from the corpora
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
    # Put the training data into gensim.corpora for later use
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Filter infrequent words & common stopwords, reducing the number of terms (which prevents the curse of dimensionality)
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # transform every token list into BOW
    print 'There are %i documents in the pool' % (doc_count)
    print "In the corpus there are ", denominator, " raw tokens"
    print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1 - float(nominator) / denominator), "%"
    print 'Finished preparing unigram tokens....'
    ## END

    print 'Start training LDA model....'
    ## Use TF-IDF as the vector for each document, and train the LDA model on top of that
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics, iterations=self.num_of_iterations, passes=self.passes)
    corpus_lda = lda[corpus_tfidf]

    # Once training is done, print all the topics and their most frequent words
    print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
    for i in range(self.num_topics):
        print 'Topic %s : ' % (str(i)) + lda.print_topic(i)

    # Exhibit the perplexity of the current model under the chosen topic hyperparameter k. The lower the better.
    print '==============================='
    print 'Model perplexity : ', lda.bound(corpus_lda), ' when topic k =', str(self.num_topics)
    print '==============================='
    return lda, doc_mapping, link_mapping, corpus
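A hedged follow-up to trainModel(): inspecting per-document topic mixtures from the returned objects. Here model_instance stands in for an instance of the surrounding class, and the layout of doc_mapping (document index to document name) is an assumption.

# Hypothetical follow-up; model_instance and the doc_mapping layout are assumptions.
lda, doc_mapping, link_mapping, corpus = model_instance.trainModel()
for doc_id, bow in enumerate(corpus[:3]):
    print doc_mapping[doc_id], lda.get_document_topics(bow)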
def get_score_for_question(question_answer_word_dir, question_num, question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name, 'ab+')
            file_write.write(str(answer_label) + '\t' + str(answer_score) + '\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list, k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
def get_score_for_question(question_answer_word_dir, question_num, k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list, k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
def __init__(self, filename):
    self.docs = loads(open(filename, "r").read())
    self.docmap = hoist_dict(self.docs, "id")

    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", corpus)
        self.corpus = MmCorpus("data.mm")

    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
                       [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
                       asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
        doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)  # Replace with functional stuff
        for epoch in range(10):
            doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")