def get_topics_from_text(line):
    """Derive short topic labels from a piece of text using LDA.

    The text is split on '.' and every fragment is normalised with
    ``clean_txt_to_clean_words`` and tokenised on whitespace, so each
    fragment becomes one document.  A 3-topic gensim LDA model is trained
    on those documents; for every topic the two highest-weighted terms
    are taken, terms of a single character are dropped, and the remaining
    terms are sorted and joined with a space to form the topic label.

    Labels that contain no lowercase ASCII letter (for example purely
    numerical ones) are discarded, and duplicate labels are reported once.

    :param line: text to analyse.
    :return: list of unique topic labels in topic-index order; an empty
        list when no fragment yields any token.
    """
    # Each '.'-separated fragment is treated as a separate document.
    doc_clean = [clean_txt_to_clean_words(fragment).split()
                 for fragment in line.split('.')]

    # Without a single token there is nothing to build a model from.
    if not any(doc_clean):
        return []

    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    num_topics = 3
    ldamodel = gensim.models.ldamodel.LdaModel(
        doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=25)

    # Plain alphabet: the previous "[a..z]" literal was used with ``in``
    # (not as a regex), so '[' and ']' were wrongly accepted as letters.
    letters = "abcdefghijklmnopqrstuvwxyz"

    all_topics = []
    for topic_id in range(num_topics):
        # get_topic_terms yields (word_id, weight) pairs; only ids are needed.
        term_words = [dictionary.get(term[0])
                      for term in ldamodel.get_topic_terms(topic_id, topn=2)]
        # Single-character terms carry no meaning as a label.
        topic_name = " ".join(sorted(w for w in term_words if len(w) > 1))

        # Ignore numerical topics: require at least one letter in the label.
        if not any(ch in letters for ch in topic_name):
            continue
        if topic_name not in all_topics:
            all_topics.append(str(topic_name))
    return all_topics
评论列表
文章目录