analyze.py 文件源码

python
阅读 28 收藏 0 点赞 0 评论 0

项目:Distrpy 作者: j0e1in 项目源码 文件源码
def analyze(content, url, title):
    """Extract the dominant LDA topic terms from a document.

    Tokenizes *content*, drops digits and stop words, stems the
    remaining tokens, fits a single-topic LDA model, and returns the
    top three topic terms.

    Args:
        content: Raw document text to analyze.
        url: Source URL, passed through unchanged.
        title: Document title, passed through unchanged.

    Returns:
        A tuple ``(Rtopic, url, title)`` where ``Rtopic`` is a list of
        up to three topic words, or a two-element fallback message when
        the model produced no topic words (e.g. non-English input whose
        tokens were all filtered out).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = LancasterStemmer()

    # Extra stop words not covered by the English stop-word list.
    stop_token = ['The', 'can', 's', 'I', 't', 'am', 'are']

    # Tokenize, then drop numbers and stop words in a single pass.
    tokens = tokenizer.tokenize(content)
    stopped_tokens = [
        t for t in tokens
        if not t.isdigit() and t not in en_stop and t not in stop_token
    ]

    # Stem what's left; the corpus holds this one document.
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    texts = [stemmed_tokens]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=1, id2word=dictionary, passes=20)

    # Top three terms of the single topic; discard their weights.
    Rtopic = [term for term, _weight in ldamodel.show_topic(0, 3)]

    if not Rtopic:
        # No usable tokens — presumably non-English (e.g. Chinese) input.
        Rtopic.append("Not English")
        Rtopic.append("Maybe Chinese?")

    return (Rtopic, url, title)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号