content.py 文件源码

python
阅读 28 收藏 0 点赞 0 评论 0

项目:BlogSpider 作者: hack4code 项目源码 文件源码
def process_item(self, item, spider):
        """Normalise the title and clean the HTML content of a scraped item.

        The item's ``content`` may already be a parsed ``HtmlElement`` or may
        be raw markup (``str`` or ``bytes``); raw markup is parsed using the
        encoding stored in ``item['encoding']``. The parsed document is then
        passed through the optional per-spider removal steps, the HTML
        cleaner, and the absolute-link rewriter, in that order.

        Args:
            item: Mapping-like item providing ``title``, ``content``,
                ``link`` and (when ``content`` is raw markup) ``encoding``.
                ``title`` and ``content`` are overwritten in place.
            spider: The spider that produced the item. It may define the
                optional attributes named by ``self.REMOVED_CLASSES_NAME``,
                ``self.REMOVED_XPATH_NODES_NAME``, ``self.ALLOW_CLASSES_NAME``
                and ``self.SAFE_ATTRS_NAME`` to tune the clean-up.

        Returns:
            The same ``item`` object, with ``title`` and ``content`` replaced.

        Raises:
            ContentException: If ``item['content']`` is neither an
                ``HtmlElement`` nor ``str``/``bytes``.
        """
        item['title'] = self.format_title(item['title'])
        doc = item['content']
        if not isinstance(doc, HtmlElement):
            encoding = item['encoding']
            # ``bytes`` is checked first and passed through untouched:
            # ``bytearray(<bytes>, encoding=...)`` raises TypeError on
            # Python 3, so only text may go through the encode step.
            if isinstance(doc, bytes):
                raw = doc
            elif isinstance(doc, str):
                raw = doc.encode(encoding)
            else:
                raise ContentException((
                    'Error in content pipeline unsupported doc type[{}]'
                    ).format(doc.__class__.__name__))
            # The parser is given the same encoding so the byte stream is
            # decoded consistently with how it was (or would be) encoded.
            doc = fromstring(raw,
                             parser=HTMLParser(encoding=encoding))

        # remove element with class name for clean display
        removed_classes = getattr(spider,
                                  self.REMOVED_CLASSES_NAME,
                                  None)
        if removed_classes is not None:
            doc = self.remove_element_with_class(doc,
                                                 removed_classes)

        # remove element with xpath for clean display
        removed_xpath_nodes = getattr(spider,
                                      self.REMOVED_XPATH_NODES_NAME,
                                      None)
        if removed_xpath_nodes is not None:
            doc = self.remove_element_with_xpath(doc,
                                                 removed_xpath_nodes)

        # Cleaning always runs; the spider-provided settings are optional
        # and are forwarded as None when the spider does not define them.
        allow_classes = getattr(spider,
                                self.ALLOW_CLASSES_NAME,
                                None)
        safe_attrs = getattr(spider,
                             self.SAFE_ATTRS_NAME,
                             None)
        doc = self.clean_html(doc,
                              allow_classes=allow_classes,
                              safe_attrs=safe_attrs)

        # Rewrite links against the item's own URL.
        # NOTE(review): presumably makes relative links absolute - confirm
        # against make_abs_link.
        doc = self.make_abs_link(doc,
                                 item['link'])
        item['content'] = doc
        return item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号