def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Tokenizes every document with ``self.tokenize``, builds a new
    dictionary from the tokens, trains an LDA model on the resulting
    bag-of-words corpus, and persists both artifacts: the dictionary is
    pickled to ``self.dictionary_file`` and the model is saved to
    ``self.model_file``.

    If a dictionary or a model is already set on this instance, the user
    is asked on standard input to confirm the overwrite first.

    Args:
        corpus: list of str, the documents to tokenize.
        passes: forwarded unchanged to ``models.LdaModel`` as ``passes``.

    Raises:
        RuntimeError: if an existing dictionary/model is present and the
            user's reply does not start with 'y' (an empty reply counts
            as the default "N").
    """
    if self.dictionary is not None or self.model is not None:
        # ``raw_input`` only exists on Python 2. Fall back to ``input`` on
        # Python 3; Python 2's evaluating ``input`` is never used here.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        answer = read_line('You are about to overwrite an existing '
                           'model file (%s). Are you sure? [y/N] '
                           % self.model_file)
        # ``startswith`` (rather than ``answer[0]``) treats an empty reply
        # as "no" instead of raising IndexError.
        if not answer.startswith('y'):
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')
    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]
    # Builds a dictionary from the tokenized documents.
    self.dictionary = corpora.Dictionary(documents)
    # Dumps the dictionary to a pickled file to use later. The context
    # manager guarantees the file is flushed and closed.
    with open(self.dictionary_file, 'wb') as dictionary_out:
        pickle.dump(self.dictionary, dictionary_out)
    # Converts each tokenized document to its bag-of-words representation.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]
    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow,
                                 passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)
    # Saves the model to use later.
    self.model.save(self.model_file)
    # Flag to remember that training has taken place.
    self._trained = True
评论列表
文章目录