# related_posts.py -- Python source code snippet

def process(self, unused, site, config):
    """Attach a list of related posts to every valid post using an LSI model.

    Posts whose ``meta.microdata_type`` is in ``RelatedPosts.VALID_FORMAT``
    are tokenized from their markdown (``post.md``), turned into a TF-IDF
    weighted corpus and projected with an LSI topic model. For each such
    post the most similar other posts are stored, best first, in
    ``post.meta.related_posts`` as objects carrying ``meta``, ``score`` and
    ``html`` attributes.

    Args:
        unused: Not read by this method.
        site: Site object; ``site.posts`` and ``site.posts_by_tag`` are read.
        config: Plugin configuration; ``config.num_related_posts`` is read.

    Returns:
        A ``(status, name, details)`` tuple. On success the status is
        ``SiteFab.OK`` and ``details`` is an HTML log of the matches; if
        anything raises, the status is ``SiteFab.ERROR`` and ``details``
        is the caught exception.
    """
    try:
        num_related_posts = config.num_related_posts

        # Tokenize, excluding pages that are not posts.
        docs = []
        valid_posts = []
        for post in site.posts:
            if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
                continue
            tokens = gensim.utils.simple_preprocess(
                post.md, deacc=True, min_len=3, max_len=15)
            docs.append(tokens)
            valid_posts.append(post)
            # FIXME: stemming

        # Build the model.
        dictionary = corpora.Dictionary(docs)
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        tfidf = models.tfidfmodel.TfidfModel(corpus=corpus)
        # FIXME: get correct number of topics.
        # The number of tag collections is used as a proxy for the number
        # of topics.
        num_topics = site.posts_by_tag.get_num_collections() * 2
        topic_model = models.LsiModel(
            tfidf[corpus], id2word=dictionary, num_topics=num_topics)
        # One extra hit is requested because a post normally matches itself.
        index = similarities.MatrixSimilarity(
            topic_model[tfidf[corpus]], num_best=num_related_posts + 1)

        # Find similar posts and store them.
        log_parts = []
        for post_idx, (post, sims) in enumerate(zip(valid_posts, index)):
            post.meta.related_posts = []
            log_parts.append(
                '<div class="subsection"><h3>%s</h3>Related posts:<ol>'
                % (post.meta.title))
            # Drop the post itself by index rather than by position: with
            # duplicate or tied documents it is not guaranteed to be the
            # first hit.
            others = [(idx, score) for idx, score in sims if idx != post_idx]
            for idx, score in others[:num_related_posts]:
                p = valid_posts[idx]
                o = utils.create_objdict()
                o.meta = p.meta
                o.score = score
                # NOTE(review): reading ``p.score`` into ``html`` looks like
                # a typo (possibly meant ``p.html``) -- confirm against the
                # post object before changing.
                o.html = p.score
                post.meta.related_posts.append(o)
                log_parts.append(
                    '<li>%s (%s)</li>' % (o.meta.title, round(score, 2)))
            log_parts.append('</ol></div>')
        return (SiteFab.OK, "Related posts via LSI", "".join(log_parts))
    except Exception as e:
        # Plugin boundary: report the failure to the caller instead of raising.
        return (SiteFab.ERROR, "Related posts via LSI", e)