def main(argv):
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input file and an output file prefix")
    if not os.path.isfile(args[1] + '.lda'):
        if os.path.isfile(args[1] + '.bow2mm') and os.path.isfile(args[1] + '.id2word'):
            id2word = corpora.Dictionary.load(args[1] + '.id2word')
        else:
            id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
            # Filtering out words that appear in fewer than 5 documents or in more
            # than 20% of documents would shrink the vocabulary, but it can leave
            # some document vectors empty, which breaks the pipeline downstream,
            # so the call stays commented out:
            # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
            # save the dictionary
            id2word.save(args[1] + '.id2word')
            # save the doc2bow vectors in Matrix Market format
            corpora.MmCorpus.serialize(args[1] + '.bow2mm',
                                       iter_doc2bow(args[0], opts.numlines, id2word))
        mm_corpus = corpora.MmCorpus(args[1] + '.bow2mm')
        model = LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics,
                             workers=opts.numprocs, passes=opts.numepochs)
        model.save(args[1] + '.lda')
    else:
        # A trained model already exists on disk, so load it (and its dictionary)
        # instead of retraining; otherwise model and id2word would be undefined below.
        id2word = corpora.Dictionary.load(args[1] + '.id2word')
        model = LdaMulticore.load(args[1] + '.lda')
    infile = open(args[0])
    outfile = open(args[1] + '.csv', "w")
    out_csvfile = csv.writer(outfile, delimiter=',')
    in_csvfile = csv.reader(infile, delimiter=',')
    for row in in_csvfile:
        if row[0] == '0':  # csv.reader yields strings, so compare against '0'
            break
        processed_post = preprocess(row[3]).split()
        if len(processed_post) < 3:  # skip 0-2 word documents (too short to be useful)
            continue
        result_list = row[1:3]
        result_list.extend(query_tag(id2word, model, processed_post))
        out_csvfile.writerow(result_list)
    infile.close()
    outfile.close()
# print(query_tag(id2word, model, "Hello über, world is awesome!"))
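As a usage note, the listing above relies on an entry point and a query_tag helper that are defined elsewhere in the script. Below is a minimal sketch of both, using only gensim's standard Dictionary.doc2bow and LdaModel.get_document_topics calls; the query_tag_sketch function, its topn parameter, and the __main__ guard are illustrative assumptions on my part, not part of the original code.

# Hypothetical illustration of what query_tag might do: turn a tokenized
# document into a bag-of-words and return the ids of its most probable topics.
def query_tag_sketch(id2word, model, tokens, topn=3):
    bow = id2word.doc2bow(tokens)             # list of (token_id, count) pairs
    topics = model.get_document_topics(bow)   # list of (topic_id, probability) pairs
    topics.sort(key=lambda pair: pair[1], reverse=True)
    return [topic_id for topic_id, _ in topics[:topn]]

# Assumed entry point (sketch): pass the command-line arguments to main().
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])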