pipelines.py 文件源码

python
阅读 29 收藏 0 点赞 0 评论 0

项目:smth_coupons_crawler 作者: moyawong 项目源码 文件源码
def process_item(self, item, spider):
        """Validate a scraped item, store it, and send a notification mail.

        The item is dropped when its title or content is empty, when any
        populated field holds an empty value, when the title contains a
        keyword from ``settings['EXCLUDE']``, when it matches neither a
        ``settings['KEYS']`` keyword nor a ``settings['AUTHOR']`` entry, or
        when a stored document already carries the same title.

        Args:
            item: Scraped item; the ``title``, ``content``, ``author`` and
                ``href`` fields are read.
            spider: Spider that produced the item (not used here).

        Returns:
            The item, once it has been inserted and the mail has been sent.

        Raises:
            DropItem: If any of the checks above rejects the item.
        """
        title = item['title']

        # Checked before the generic loop so these two fields keep their
        # specific messages.
        if title == '':
            raise DropItem("title is empty")
        if item['content'] == '':
            raise DropItem("content is empty")

        # Iterating the item yields field names, so the value must be looked
        # up explicitly; testing the name itself would never detect anything.
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))

        # The exception has to be raised, not merely constructed, otherwise
        # the excluded item is handed on as if it had been accepted.
        if any(keyword in title for keyword in settings['EXCLUDE']):
            raise DropItem("title have invalid keywords")

        # Keep the item only if the title has a wanted keyword or the author
        # is one of the configured authors.
        wanted = (
            any(key in title for key in settings['KEYS'])
            or any(author == item['author'] for author in settings['AUTHOR'])
        )
        if not wanted:
            raise DropItem("item do not have keywords")

        # NOTE(review): the duplicate check reads self.db.items while the
        # insert below writes to self.collection -- confirm both refer to the
        # same collection.
        for record in self.db.items.find({}, {"title": 1}):
            stored_title = record["title"]
            # The item title may be text or UTF-8 encoded bytes depending on
            # how the spider built it, so compare against both forms.
            if stored_title == title or stored_title.encode("utf-8") == title:
                raise DropItem("item exist!")

        # NOTE(review): Collection.insert is deprecated in newer PyMongo
        # releases; kept as-is because the installed version is not visible.
        self.collection.insert(dict(item))
        send_mail(title, item['content'], item['href'])
        return item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号