def get_data(setname):
    """Load the topic-model arrays and word frequencies for one corpus.

    Builds a ``CorporaDataSet`` for ``setname``, pulls its topic matrices and
    vocabulary, and reads the Matrix Market file ``setname + ".mtx"`` to get
    per-row totals.  The shapes of the two topic arrays and the vocabulary
    size are printed to stdout as a sanity check.

    Args:
        setname: Name passed to ``CorporaDataSet``; also the path prefix of
            the ``.mtx`` file that is read with ``mmread``.

    Returns:
        A dict keyed by the module-level ``*_key`` constants:
            topic_word_key: result of ``dataset.getDocsInTopicMatrix()``.
            topic_doc_key: transpose of ``dataset.getWordsInTopicMatrix()``.
            doc_length_key: array of ones, one entry per row of the
                ``topic_doc_key`` array.
            vocabulary_key: list of the keys of the first element returned
                by ``dataset.loadVocabulary()``.
            word_freq_key: 1-D array holding the sum of every row of the
                ``.mtx`` matrix.
    """
    dataset = CorporaDataSet(setname)

    # NOTE(review): the getters are crossed relative to the variable names --
    # "topic_word" is filled from getDocsInTopicMatrix() and "topic_doc" from
    # the transposed getWordsInTopicMatrix().  An earlier revision used the
    # uncrossed, untransposed assignment.  Presumably intentional for the
    # downstream consumer; confirm before changing.
    topic_word_array = dataset.getDocsInTopicMatrix()
    topic_doc_array = dataset.getWordsInTopicMatrix().T

    # Every document is given length 1 (placeholder, one entry per row).
    doc_length_array = numpy.full([topic_doc_array.shape[0]], 1)

    # list(...) keeps the result a real list on Python 3, where keys() is a
    # view; on Python 2 this is equivalent to the plain keys() call.
    vocabulary = list(dataset.loadVocabulary()[0].keys())

    # Single-argument print(...) is valid on both Python 2 and 3.  The two
    # spaces after each colon reproduce the output of the former
    # `print "label: ", value` statements exactly.
    print("topic word array shape:  %s" % (topic_word_array.shape,))
    print("topic doc shape:  %s" % (topic_doc_array.shape,))
    print("vocabulary:  %d" % len(vocabulary))

    # Row sums come back as an (n, 1) numpy.matrix for sparse input and as a
    # 1-D array for dense input; flatten both to a plain 1-D ndarray.
    wordfreqs = mmread(setname + ".mtx").sum(1)
    word_freq_array = numpy.asarray(wordfreqs).reshape(-1)

    return {
        topic_word_key: topic_word_array,
        topic_doc_key: topic_doc_array,
        doc_length_key: doc_length_array,
        vocabulary_key: vocabulary,
        word_freq_key: word_freq_array,
    }
评论列表
文章目录