mf.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:krauler 作者: occrp-attic 项目源码 文件源码
def get_content(self, page, meta):
        """Extract the relevant content of a crawled page.

        Non-HTML pages are passed through untouched. For HTML pages the
        behaviour is driven by optional keys in ``self.config.data``:

        * ``check_path``: the page is rejected when this path matches nothing.
        * ``title_path`` / ``author_path`` / ``date_path``: text found at
          these paths is stored in ``meta``.
        * ``body_path``: restricts the output to the matched element.
        * ``remove_paths``: elements matching these paths are dropped from
          the body before it is serialized.

        Args:
            page: Page object exposing ``is_html``, ``content``, ``doc`` and
                ``url``.
            meta: Dict of metadata; updated in place. An extracted ``date``
                is parsed and appended to ``meta['dates']`` in ISO format
                instead of being kept under ``date``.

        Returns:
            ``page.content`` for non-HTML pages, the serialized body for HTML
            pages, or ``None`` when ``check_path`` or ``body_path`` is
            configured but does not match anything in the document.
        """
        if not page.is_html:
            return page.content

        config = self.config.data

        check_path = config.get('check_path')
        if check_path is not None and page.doc.find(check_path) is None:
            log.info("Failed XML path check: %r", page.url)
            return None

        for meta_el in ('title', 'author', 'date'):
            path = config.get('%s_path' % meta_el)
            if path is None:
                continue
            # Look the text up once; empty or missing text is ignored.
            text = page.doc.findtext(path)
            if text:
                meta[meta_el] = text

        if 'date' in meta:
            try:
                # The raw value is removed even when parsing fails, so an
                # unparseable date never stays in ``meta``.
                date = parse(meta.pop('date'))
                meta.setdefault('dates', []).append(date.isoformat())
            except Exception as ex:
                log.exception(ex)

        body = page.doc
        body_path = config.get('body_path')
        if body_path is not None:
            body = page.doc.find(body_path)
            if body is None:
                # Without this guard a non-matching body_path would crash
                # below on ``None``; treat it like a failed check_path.
                log.info("Failed XML body path check: %r", page.url)
                return None

        # NOTE: drop_tree() modifies the parsed document in place, so
        # ``page.doc`` no longer contains the removed elements afterwards.
        for path in config.get('remove_paths', []):
            for el in body.findall(path):
                el.drop_tree()

        return html.tostring(body)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号