def get_content(self, page, meta):
    """Extract the configured content of ``page`` and fill in ``meta``.

    Non-HTML pages are passed through untouched. For HTML pages the
    extraction is driven by the keys of ``self.config.data``:

    * ``check_path``: if set and not found in the document, the page is
      rejected.
    * ``title_path`` / ``author_path`` / ``date_path``: non-empty text found
      at these paths is stored in ``meta``. A date is run through ``parse``
      and appended to ``meta['dates']`` in ISO format instead of being kept
      under ``'date'``.
    * ``body_path``: restricts the output to that element (the whole
      document is used when it is not set).
    * ``remove_paths``: elements matching these paths are dropped from the
      body before it is serialised.

    Args:
        page: Page object exposing ``is_html``, ``content``, ``doc`` and
            ``url``.
        meta: Mutable mapping, updated in place with the metadata found on
            the page.

    Returns:
        ``page.content`` for non-HTML pages; otherwise the result of
        ``html.tostring`` for the selected body, or ``None`` when the
        ``check_path`` or ``body_path`` lookup finds nothing.
    """
    if not page.is_html:
        return page.content

    config = self.config.data
    doc = page.doc

    check_path = config.get('check_path')
    if check_path is not None and doc.find(check_path) is None:
        log.info("Failed XML path check: %r", page.url)
        return None

    for meta_el in ('title', 'author', 'date'):
        path = config.get('%s_path' % meta_el)
        if path is None:
            continue
        # Look the text up once rather than once for the test and again
        # for the assignment.
        text = doc.findtext(path)
        if text:
            meta[meta_el] = text

    if 'date' in meta:
        # The raw value is removed before parsing, so an unparseable date
        # is dropped from ``meta`` rather than left behind as a raw string.
        raw_date = meta.pop('date')
        try:
            date = parse(raw_date)
            meta.setdefault('dates', []).append(date.isoformat())
        except Exception as ex:
            # Best effort: a bad date must not abort content extraction.
            log.exception(ex)

    body = doc
    body_path = config.get('body_path')
    if body_path is not None:
        body = doc.find(body_path)
        if body is None:
            # Without this guard the code below would fail on ``None``;
            # treat a missing body like a failed ``check_path``.
            log.info("Failed body path lookup: %r", page.url)
            return None

    # ``or []`` tolerates a ``remove_paths`` key that is present but null.
    for path in config.get('remove_paths') or []:
        for el in body.findall(path):
            el.drop_tree()

    return html.tostring(body)
评论列表
文章目录