def main():
    """Infer LDA topic vectors for unseen user documents and group them by community.

    Command-line driven: loads a saved gensim dictionary and LDA model, computes
    a topic-distribution vector for every user listed in the topology file
    (in parallel), merges the results with any previously saved vectors, and
    writes one ``community_doc_vecs.json`` per clique/community directory.

    Side effects:
        - Creates ``working_dir`` and one subdirectory per community.
        - Writes ``document_vectors.json`` and per-community JSON files.
    """
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    parser.add_argument('-t', '--topology_file', required=True, action='store', dest='top_file', help='Location of topology file')
    parser.add_argument('-p', '--dir_prefix', choices=['clique', 'community'], required=True, action='store', dest='dir_prefix', help='Select whether the topology contains cliques or communities')
    parser.add_argument('-w', '--working_dir', required=True, action='store', dest='working_dir', help='Name of the directory you want to direct output to')
    parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of the saved LDA model')
    parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary for the model')
    parser.add_argument('-u', '--unseen_docs', required=True, action='store', dest='unseen_docs', help='Directory containing unseen documents')
    parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    output_dir = args.working_dir + '/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir), 0o755)

    # load dictionary
    model_dict = corpora.Dictionary.load(args.dict_loc)
    # load trained model from file
    lda = models.LdaModel.load(args.lda_loc)
    write_topn_words(output_dir, lda)

    # Each line of the topology file is a literal Python collection of user ids;
    # flatten them into one deduplicated set of string ids.
    with open(args.top_file, 'r') as inp_file:
        users = set(str(user) for community in inp_file for user in ast.literal_eval(community))

    # Resume support: reuse vectors computed on a previous run if present.
    # Catch only the expected failures — a missing/unreadable file (IOError)
    # or corrupt JSON (ValueError, the base of json.JSONDecodeError) — rather
    # than a bare except that would also swallow KeyboardInterrupt.
    try:
        with open(output_dir + 'document_vectors.json', 'r') as all_community_file:
            document_vectors = json.load(all_community_file)
    except (IOError, ValueError):
        document_vectors = {}

    # Fan the per-user vector computation out across all but one core,
    # leaving a CPU free for the parent process.
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    func = partial(get_document_vectors,
                   tweets_dir=args.unseen_docs,
                   document_vectors=document_vectors,
                   dictionary=model_dict,
                   lda_model=lda,
                   lemma=args.lemma)
    try:
        doc_vecs = pool.map(func, users)
        pool.close()
    except Exception:
        # Ensure workers don't linger if the map fails partway through.
        pool.terminate()
        raise
    finally:
        pool.join()

    # Workers return None for users with no usable documents; drop those
    # before converting the (user, vector) pairs into a dict.
    doc_vecs = dict(item for item in doc_vecs if item is not None)
    document_vectors.update(doc_vecs)
    with open(output_dir + 'document_vectors.json', 'w') as document_vectors_file:
        json.dump(document_vectors, document_vectors_file, sort_keys=True, indent=4)

    print('Building directories')
    with open(args.top_file, 'r') as topology_file:
        for i, community in enumerate(topology_file):
            community_dir = output_dir + args.dir_prefix + '_' + str(i) + '/'
            if not os.path.exists(os.path.dirname(community_dir)):
                os.makedirs(os.path.dirname(community_dir), 0o755)
            # Restrict the freshly computed vectors to this community's members.
            comm_doc_vecs = community_document_vectors(doc_vecs, community)
            with open(community_dir + 'community_doc_vecs.json', 'w') as comm_docs_file:
                json.dump(comm_doc_vecs, comm_docs_file, sort_keys=True, indent=4)
# Source file: tweets_on_LDA.py