def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol, cf):
    """Generate sentences conditioned on each document and write a report file.

    For every document in `docs`, writes to `args.gen_sent_on_doc`:
    the raw document text, its most representative topics and words
    (scored by the model), one greedy (argmax) generated sentence, and
    `gen_num` temperature-sampled sentences for each temperature in
    `gen_temps`.

    Parameters
    ----------
    docs : per-document word-id sequences (batched via get_batch_doc).
    tags : per-document tag sequences (passed through to get_batch_doc).
    idxvocab : list mapping word id -> word string.
    vocabxid : dict mapping word string -> word id.
    start_symbol, end_symbol : sentence boundary tokens (keys of vocabxid).
    cf : config object (doc_len, tag_len, lm_sent_len, ...).

    NOTE(review): relies on module-level globals (tm, sess, topn, args,
    initializer, LM, get_batch_doc, pad_symbol, gen_temps, gen_num, tf) —
    presumably defined earlier in this script; verify against the full file.
    """
    # Top-N words per topic from the topic model, rendered as one string each.
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [" ".join([idxvocab[w] for w in t]) for t in topics]

    # Raw document text; tabs separate sentences in the input format,
    # so convert them to newlines for readable output.
    # BUG FIX: original leaked this file handle — use a context manager.
    with codecs.open(args.input_doc, "r", "utf-8") as doc_file:
        doc_text = [item.replace("\t", "\n") for item in doc_file.readlines()]

    # BUG FIX: the output file was opened but never closed/flushed;
    # a `with` block guarantees the buffer is written even on error.
    with codecs.open(args.gen_sent_on_doc, "w", "utf-8") as output:
        # Reuse the trained weights; batch_size/num_steps = 1 for
        # step-by-step generation.
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1,
                      config=cf, reuse_conv_variables=True)

        for d in range(len(docs)):
            output.write("\n" + "=" * 100 + "\n")
            output.write("Doc " + str(d) + ":\n")
            output.write(doc_text[d])

            doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len, cf.tag_len, 1,
                                            vocabxid[pad_symbol])
            best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)

            output.write("\nRepresentative topics:\n")
            output.write("\n".join([("[%.3f] %s: %s" % (item[1], str(item[0]).zfill(3), topics[item[0]]))
                                    for item in best_topics]) + "\n")

            output.write("\nRepresentative words:\n")
            output.write("\n".join([("[%.3f] %s" % (item[1], idxvocab[item[0]]))
                                    for item in best_words]) + "\n")

            # Temperature 0 -> greedy/argmax decoding inside generate_on_doc.
            output.write("\nSentence generation (greedy; argmax):" + "\n")
            s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0, cf.lm_sent_len + 10,
                                     vocabxid[end_symbol])
            output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")

            # Stochastic sampling at each configured temperature.
            for temp in gen_temps:
                output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")
                for i in xrange(gen_num):
                    s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], temp,
                                             cf.lm_sent_len + 10, vocabxid[end_symbol])
                    output.write("[" + str(i) + "] " + " ".join([idxvocab[item] for item in s]) + "\n")
######
#main#
######
#load the vocabulary
# --- non-code residue scraped from the blog listing page (translated) ---
# tdlm_test.py source code listing
# language: python; reads: 34; bookmarks: 0; likes: 0; comments: 0
# (page sections: comment list, table of contents)