# PYTHON_ARGCOMPLETE_OK  (marker that argcomplete's global completion looks for near the top of a script)
# NOTE: only the imports this function itself needs are shown here. The helpers
# DocCorpus, wiki_tokenizer, build_LDA_model, build_pyLDAvis_output and the
# DEFAULT_DICT_SIZE constant are defined elsewhere in create_LDA_model.py;
# illustrative sketches of them follow the listing below.
import argparse

import argcomplete
from gensim.corpora import MmCorpus, WikiCorpus


def main():
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    subparsers = parser.add_subparsers(dest='mode')

    text_corpus_parser = subparsers.add_parser('text', help='Build a corpus from a directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, help='Directory where the tweet documents are stored')
    text_corpus_parser.add_argument('-c', '--corp_loc', required=True, help='Location and name to save the corpus')
    text_corpus_parser.add_argument('-m', '--lemma', action='store_true', help='Lemmatize words')

    wiki_corpus_parser = subparsers.add_parser('wiki', help='Build a corpus from compressed Wikipedia articles')
    wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, help='Location of the compressed Wikipedia dump')
    wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, help='Location and name to save the corpus')
    wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', help='Lemmatize words')

    lda_model_parser = subparsers.add_parser('lda', help='Create an LDA model from a saved corpus')
    lda_model_parser.add_argument('-c', '--corp_loc', required=True, help='Location of the corpus')
    lda_model_parser.add_argument('-d', '--dict_loc', required=True, help='Location of the dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, type=int, help='Number of topics for the LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, type=int, help='Number of passes through the corpus when training the LDA model')
    lda_model_parser.add_argument('-l', '--lda_loc', required=True, help='Location and name to save the LDA model')

    lda_vis_parser = subparsers.add_parser('ldavis', help='Create a visualization of an LDA model')
    lda_vis_parser.add_argument('-c', '--corp_loc', required=True, help='Location of the corpus')
    lda_vis_parser.add_argument('-d', '--dict_loc', required=True, help='Location of the dictionary')
    lda_vis_parser.add_argument('-l', '--lda_loc', required=True, help='Location of the LDA model')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if args.mode == 'text':
        # Stream the tweet documents, trim the vocabulary, then save the corpus and dictionary.
        doc_corpus = DocCorpus(args.docs_loc, args.lemma)
        doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
        doc_corpus.dictionary.save(args.corp_loc + '.dict')

    elif args.mode == 'wiki':
        # The lemmatize flag is only honoured by gensim < 4.0 (it relied on the pattern package).
        wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=args.lemma, tokenizer_func=wiki_tokenizer,
                                 article_min_tokens=100, token_min_len=3, token_max_len=15)
        wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
        wiki_corpus.dictionary.save(args.corp_loc + '.dict')

    elif args.mode == 'lda':
        build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)

    elif args.mode == 'ldavis':
        build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
create_LDA_model.py source code (Python)
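The listing calls several helpers that live elsewhere in create_LDA_model.py; the sketches below are illustrative reconstructions, not the file's actual definitions. First, DocCorpus is used like a gensim TextCorpus (it exposes a .dictionary attribute and is accepted by MmCorpus.serialize). A minimal version, assuming one plain-text document per file in docs_loc, might look like this:

import os

from gensim.corpora.textcorpus import TextCorpus
from gensim.utils import simple_preprocess


class DocCorpus(TextCorpus):
    """Hypothetical stand-in: stream one bag of tokens per text file in a directory."""

    def __init__(self, docs_loc, lemma=False):
        self.docs_loc = docs_loc
        self.lemma = lemma  # a real implementation would lemmatize tokens when this is set
        super().__init__(input=docs_loc)  # TextCorpus builds self.dictionary from get_texts()

    def get_texts(self):
        for name in sorted(os.listdir(self.docs_loc)):
            with open(os.path.join(self.docs_loc, name), encoding='utf-8') as handle:
                yield simple_preprocess(handle.read())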
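wiki_tokenizer is handed to WikiCorpus as tokenizer_func, so it has to accept the article text plus the token length bounds and a lowercasing flag, which is the call signature gensim expects for that hook. A plausible minimal version:

def wiki_tokenizer(content, token_min_len, token_max_len, lower):
    """Hypothetical stand-in: keep alphabetic whitespace-separated tokens within the length bounds."""
    if lower:
        content = content.lower()
    return [token for token in content.split()
            if token.isalpha() and token_min_len <= len(token) <= token_max_len]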
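build_LDA_model presumably loads the serialized corpus and dictionary and trains a gensim LDA model; a minimal sketch along those lines (the '.model' suffix and the single-core LdaModel class are assumptions):

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel


def build_LDA_model(corp_loc, dict_loc, num_topics, num_pass, lda_loc):
    """Hypothetical stand-in: train and save an LDA model from a saved corpus/dictionary pair."""
    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)
    lda = LdaModel(corpus=corpus, id2word=dictionary,
                   num_topics=int(num_topics), passes=int(num_pass))
    lda.save(lda_loc + '.model')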
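build_pyLDAvis_output presumably feeds the trained model to pyLDAvis and writes an HTML report. A sketch under those assumptions (the output file name is illustrative, and older pyLDAvis releases expose the prepare helper as pyLDAvis.gensim rather than pyLDAvis.gensim_models):

import pyLDAvis
import pyLDAvis.gensim_models

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel


def build_pyLDAvis_output(corp_loc, dict_loc, lda_loc):
    """Hypothetical stand-in: render an interactive pyLDAvis page for a saved LDA model."""
    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)
    lda = LdaModel.load(lda_loc)
    vis_data = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis_data, lda_loc + '.html')

With all the pieces in place, a run might look like `python create_LDA_model.py wiki -w dump.xml.bz2 -c wiki_corpus -m` followed by `python create_LDA_model.py lda -c wiki_corpus.mm -d wiki_corpus.dict -n 100 -p 2 -l wiki_lda`; the file names and parameter values here are only examples.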