def mark_contribs(html_file, marked_html_file):
    """Highlight selected sentences in every ``<p>`` of an HTML document.

    For each paragraph the text content is split into sentences with the
    module-level ``tokenizer``, every sentence is scored with ``calc_score``
    and the scores are turned into a per-sentence mask through
    ``max_subarray(scores, 1.0)`` and ``positive_ones``.  Those helpers are
    defined elsewhere; presumably they select runs of sentences whose scores
    stand out against the 1.0 threshold -- confirm against their definitions.

    Sentences whose mask entry is truthy are wrapped in a ``<font>`` element
    with a yellow background and a ``score`` attribute holding the sentence's
    score; all other sentences are kept as plain text.  Inline markup that
    was inside a rewritten paragraph is replaced by its text content.
    Paragraphs that yield no sentences are left untouched.

    Args:
        html_file: Source document, passed as-is to ``html.parse``.
        marked_html_file: Destination, passed as-is to the tree's ``write``.
    """
    h = html.parse(html_file)
    for par in h.xpath("//p"):
        # Get the paragraph's text, undoing line-break hyphenation.
        text = par.text_content().replace("-\n", "")
        sentences = tokenizer.tokenize(text.strip())
        if not sentences:
            # Nothing to score: keep the paragraph (and any non-text
            # children such as images) exactly as it was.
            continue

        # Build a real list: the scores are handed to max_subarray and then
        # indexed below, which a lazy Python 3 ``map`` object cannot support.
        scores = [calc_score(sentence) for sentence in sentences]
        intervals = max_subarray(scores, 1.0)
        mask = positive_ones(len(sentences), intervals)

        # clear() drops children and text, but also the element's attributes
        # and its tail (the text following </p>); put those two back.
        tail = par.tail
        attributes = dict(par.attrib)
        par.clear()
        par.tail = tail
        par.attrib.update(attributes)

        # Unmarked sentences that precede the first highlighted one; they
        # become the paragraph's own leading text.
        leading = []
        for i, s in enumerate(sentences):
            if mask[i]:
                if len(par):
                    # Separate this highlight from whatever precedes it.
                    par[-1].tail += ' '
                marked = etree.Element(
                    "font",
                    style="background-color:yellow",
                    score=str(scores[i]),
                )
                marked.text = s
                marked.tail = ''
                par.append(marked)
            elif len(par):
                # Plain text after a highlight lives in that element's tail.
                par[-1].tail += ' ' + s
            else:
                leading.append(s)

        par.text = ' '.join(leading)
        if leading and len(par):
            # Keep a space between the leading text and the first highlight.
            par.text += ' '

    h.write(marked_html_file, pretty_print=True, method="html")
评论列表
文章目录