contribs.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:KDDCUP2016 作者: hugochan 项目源码 文件源码
def mark_contribs(html_file, marked_html_file):
    """Highlight the selected sentences of every paragraph of an HTML file.

    For each ``<p>`` element of the parsed document the paragraph text is
    split into sentences with ``tokenizer.tokenize``, every sentence is scored
    with ``calc_score``, and the scores are handed to ``max_subarray`` and
    ``positive_ones`` to obtain a per-sentence mask.  Sentences whose mask
    entry is truthy are wrapped in a ``<font>`` element with a yellow
    background and a ``score`` attribute; all other sentences are kept as
    plain text.  The modified tree is written to ``marked_html_file``.

    Each paragraph is rebuilt from its text content, so inline markup inside
    the paragraph and the paragraph's own attributes are discarded.

    Args:
        html_file: Source handed to ``html.parse`` (presumably
            ``lxml.html.parse`` -- confirm against the file's imports).
        marked_html_file: Destination handed to the parsed tree's ``write``.
    """
    tree = html.parse(html_file)

    for par in tree.xpath("//p"):
        # Get the paragraph's text fixing the hyphenation
        text = par.text_content().replace("-\n", "")

        sentences = tokenizer.tokenize(text.strip())
        # Build a real list: on Python 3 ``map`` returns a one-shot iterator
        # that cannot be indexed below and may be exhausted by max_subarray.
        scores = [calc_score(sentence) for sentence in sentences]

        intervals = max_subarray(scores, 1.0)
        mask = positive_ones(len(sentences), intervals)

        # clear() also resets the element's tail, i.e. the text that follows
        # the closing </p>; keep it so that text is not lost from the output.
        tail = par.tail
        par.clear()
        par.tail = tail

        leading = ''        # plain text placed before the first highlight
        last_marked = None  # most recent highlight; its tail takes plain text
        for i, sentence in enumerate(sentences):
            highlighted = mask[i]
            # Exactly one space separates consecutive sentences, whether or
            # not either of them is highlighted.
            separator = ' ' if i else ''
            plain = separator if highlighted else separator + sentence

            if last_marked is None:
                leading += plain
            else:
                last_marked.tail += plain

            if highlighted:
                last_marked = etree.Element(
                    "font",
                    style="background-color:yellow",
                    score=str(scores[i]),
                )
                last_marked.text = sentence
                last_marked.tail = ''
                par.append(last_marked)

        par.text = leading

    tree.write(marked_html_file, pretty_print=True, method="html")
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号