def totxt(self, paperid):
    """
    Convert a paper's HTML file to plain text.

    Reads the HTML document at ``config.HTML_PATH % paperid``, collects the
    text content of every ``p`` element in it, and writes the result,
    encoded as UTF-8, to ``config.TXT_PATH % paperid``.

    Args:
        paperid: Value substituted into the ``config.HTML_PATH`` and
            ``config.TXT_PATH`` templates to locate the input and output
            files.

    Returns:
        None. The output file is created or overwritten.
    """
    infile = config.HTML_PATH % paperid
    outfile = config.TXT_PATH % paperid

    # NOTE(review): `html` is presumably lxml.html -- confirm against the
    # file's imports.
    h = html.parse(infile)
    pars = h.xpath("//p")

    # Paragraph texts are concatenated with no separator between them.
    text = ''.join(par.text_content() for par in pars)

    # Drop every hyphen that is immediately followed by a newline
    # (presumably to rejoin words hyphenated across line breaks).
    text = text.replace("-\n", "")

    # The payload is already-encoded bytes, so the file must be opened in
    # binary mode: in text mode ('w') Python 3 raises TypeError when given
    # bytes. Binary mode writes the same UTF-8 bytes on Python 2 and 3.
    with open(outfile, 'wb') as f:
        f.write(text.encode("UTF-8"))
评论列表
文章目录