Source listing of tweets_on_LDA.py

Language: Python
Reads: 24 | Bookmarks: 0 | Likes: 0 | Comments: 0

Project: twitter_LDA_topic_modeling — Author: kenneth-orton — project source / file source
def main():
    """Compute LDA topic-distribution vectors for users' tweets and group them by community.

    Workflow:
      1. Parse CLI arguments (topology file, output dir, saved LDA model,
         dictionary, directory of unseen tweet documents).
      2. Load the trained gensim LDA model and its dictionary, and write the
         model's top-N words per topic to the output directory.
      3. Read the set of user ids from the topology file, compute a topic
         vector per user in parallel (reusing any previously cached vectors),
         and persist the merged result to ``document_vectors.json``.
      4. For each clique/community line in the topology file, write that
         group's vectors to ``<prefix>_<i>/community_doc_vecs.json``.
    """
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    parser.add_argument('-t', '--topology_file', required=True, action='store', dest='top_file', help='Location of topology file')
    parser.add_argument('-p', '--dir_prefix', choices=['clique', 'community'], required=True, action='store', dest='dir_prefix', help='Select whether the topology contains cliques or communities')
    parser.add_argument('-w', '--working_dir', required=True, action='store', dest='working_dir', help='Name of the directory you want to direct output to')
    parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of the saved LDA model')
    parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary for the model')
    parser.add_argument('-u', '--unseen_docs', required=True, action='store', dest='unseen_docs', help='Directory containing unseen documents')
    parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    output_dir = args.working_dir + '/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir), 0o755)

    # Load the dictionary and the trained LDA model from disk.
    model_dict = corpora.Dictionary.load(args.dict_loc)
    lda = models.LdaModel.load(args.lda_loc)
    write_topn_words(output_dir, lda)

    # Each line of the topology file is a Python-literal list of user ids;
    # collect the union of all users across every community.
    with open(args.top_file, 'r') as inp_file:
        users = set(str(user) for community in inp_file for user in ast.literal_eval(community))

    # Reuse previously computed vectors if a cache exists; start fresh when
    # the file is missing or unreadable/corrupt. Narrowed from a bare
    # `except:` so real bugs (e.g. KeyboardInterrupt) are not swallowed.
    try:
        with open(output_dir + 'document_vectors.json', 'r') as all_community_file:
            document_vectors = json.load(all_community_file)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        document_vectors = {}

    # Fan the per-user work out across all but one CPU core.
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    func = partial(get_document_vectors, 
                   tweets_dir=args.unseen_docs, 
                   document_vectors=document_vectors, 
                   dictionary=model_dict, 
                   lda_model=lda,
                   lemma=args.lemma) 
    doc_vecs = pool.map(func, users)
    # Workers return None for users with no document; drop those entries.
    doc_vecs = [item for item in doc_vecs if item is not None]
    pool.close()
    pool.join()
    doc_vecs = dict(doc_vecs)

    # Merge freshly computed vectors over the cached ones and persist.
    document_vectors.update(doc_vecs)
    with open(output_dir + 'document_vectors.json', 'w') as document_vectors_file:
        json.dump(document_vectors, document_vectors_file, sort_keys=True, indent=4)

    print('Building directories')
    # One output directory per topology line (clique_0/, community_1/, ...),
    # each holding only that group's document vectors.
    with open(args.top_file, 'r') as topology_file:
        for i, community in enumerate(topology_file):
            community_dir = output_dir + args.dir_prefix + '_' + str(i) + '/'
            if not os.path.exists(os.path.dirname(community_dir)):
                os.makedirs(os.path.dirname(community_dir), 0o755)
            comm_doc_vecs = community_document_vectors(doc_vecs, community)
            with open(community_dir + 'community_doc_vecs.json', 'w') as comm_docs_file:
                json.dump(comm_doc_vecs, comm_docs_file, sort_keys=True, indent=4)
Comment list
Table of contents


Questions


Interview experiences


Articles

WeChat
Official account

Scan the QR code to follow the official account