def build_corpus(self, fname=None, save_to=None):
    """Build (or reuse) the serialized corpus for a sentences file and load it.

    The sentences file is turned into bag-of-words documents through
    ``self.__iter_doc2bow`` and written with ``corpora.MmCorpus.serialize``;
    the resulting file is then opened and stored as ``self.corpus``.

    Args:
        fname: Name of the sentences file, resolved through ``self.__dest``.
            When falsy, the user is prompted for it.
        save_to: Optional name for the corpus file, also resolved through
            ``self.__dest``. When falsy, the corpus path is derived from the
            sentences path via ``LdaUtils.change_ext(fname, 'corpus')``.

    Returns:
        The ``corpora.MmCorpus`` opened from ``self.corpus_fname``.

    Raises:
        AssertionError: If the resolved sentences file does not exist.

    Side effects:
        Sets ``self.corpus_fname`` and ``self.corpus``; may prompt on the
        terminal and may (re)write the corpus file.
    """
    # Read the sentences file name, asking interactively when none was given.
    if not fname:
        fname = click.prompt('sentences file')
    fname = self.__dest(fname)

    # Explicit raise instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip this check. The exception type
    # and message are unchanged for callers.
    if not os.path.isfile(fname):
        raise AssertionError('No such file: %s' % fname)

    if save_to:
        self.corpus_fname = self.__dest(save_to)
    else:
        self.corpus_fname = LdaUtils.change_ext(fname, 'corpus')

    # Build when no corpus file exists yet; otherwise rebuild only if the user
    # confirms. Short-circuit evaluation means the prompt is shown only when a
    # corpus file is already present.
    if not os.path.isfile(self.corpus_fname) or click.confirm(
            'There already is corpus. Do you want to rebuild?'):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('start building corpus')
        start = time()
        # NOTE(review): relies on the object returned by
        # LdaUtils.iter_csv(fname, -1) exposing .split() -- confirm against
        # the helper's definition.
        documents = LdaUtils.iter_csv(fname, -1).split()
        corpora.MmCorpus.serialize(self.corpus_fname, self.__iter_doc2bow(documents))  # save
        print('building corpus takes: %s' % LdaUtils.human_readable_time(time() - start))

    self.corpus = corpora.MmCorpus(self.corpus_fname)
    return self.corpus
# (Web-page residue, not part of the code: "Comment list" / "Table of contents".)