import gensim
from gensim import corpora
from nltk.tokenize import RegexpTokenizer
from nltk.stem.lancaster import LancasterStemmer
from stop_words import get_stop_words


def analyze(content, url, title):
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = LancasterStemmer()
    # Extra stop tokens that get_stop_words('en') misses: capitalized forms
    # and contraction fragments ('s', 't') left behind by the tokenizer.
    stop_token = ['The', 'can', 's', 'I', 't', 'am', 'are']
    texts = []
    tokens = tokenizer.tokenize(content)
    tokens = [i for i in tokens if not i.isdigit()]  # Remove all numbers
    stopped_tokens = [i for i in tokens if i not in en_stop]  # Remove common English stop words
    stopped_tokens = [i for i in stopped_tokens if i not in stop_token]  # Remove the extra stop tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]  # Stem tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Fit a single-topic LDA model on the one-document corpus.
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1,
                                               id2word=dictionary, passes=20)
    topics = ldamodel.show_topic(0, 3)  # Top three (word, probability) pairs
    # topics = ldamodel.print_topics(num_topics=1, num_words=3)[0]
    Rtopic = []
    for topicTuple in topics:
        topic, rate = topicTuple
        Rtopic.append(topic)
    if len(Rtopic) == 0:
        # No topic words were extracted; flag the page as non-English.
        Rtopic.append("Not English")
        Rtopic.append("Maybe Chinese?")
    return (Rtopic, url, title)
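

# A minimal usage sketch, assuming gensim, nltk, and the stop_words
# package are installed. The sample text, URL, and title are hypothetical
# placeholders; any English document works. analyze() returns a tuple of
# (topic word list, url, title).
if __name__ == '__main__':
    sample = ("Topic models such as LDA infer latent topics from word "
              "co-occurrence. This sketch runs one short English document "
              "through analyze() and prints the stemmed topic words.")
    topics, url, title = analyze(sample, 'http://example.com/post',
                                 'LDA demo')
    print(topics)       # three stemmed topic words
    print(url, title)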