python类DropItem()的实例源码

pipelines_bk.py 文件源码 项目:web_crawler 作者: NearXdu 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Drop already-seen links; persist new items as JSON lines.

    Raises DropItem for a duplicate link, otherwise records the link in
    ``self.seen`` and appends the item to ``self.file``.
    """
    link = item['link']
    if link in self.seen:
        raise DropItem('Duplicate link %s' % link)
    self.seen.add(link)
    self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    return item
pipelines.py 文件源码 项目:web_crawler 作者: NearXdu 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Drop links already recorded in Redis; insert new URLs into MySQL.

    The Redis hash acts as the dedupe index; accepted links are stored in
    the ``sohuurl`` table through ``self.conn``.
    """
    if redis_db.hexists(redis_data_dict, item['link']):
        raise DropItem("Duplicate item found: %s" % item)
    cursor = self.conn.cursor()
    cursor.execute(
        """insert into sohuurl(url) VALUES (%s)""",
        (str(item['link']),),
    )
    self.conn.commit()
    cursor.close()
    return item
pipelines.py 文件源码 项目:web_crawler 作者: NearXdu 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Skip duplicate links; serialize fresh items to the JSON-lines file."""
    if item['link'] not in self.seen:
        self.seen.add(item['link'])
        record = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(record + '\n')
        return item
    raise DropItem('Duplicate Link %s' % item['link'])
pipelines.py 文件源码 项目:multithread-crawler 作者: SaberAlexander 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def process_item(self, jd_item, JDspider):
    """Serialize every item to two files: file1 as plain JSON lines,
    file2 with a trailing comma after each JSON object."""
    payload = json.dumps(dict(jd_item))
    self.file1.write(payload + "\n")
    self.file2.write(payload + ',\n')
    return jd_item
pipelines.py 文件源码 项目:Jobs-search 作者: Hopetree 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Validate that every field key is truthy, then store the item in MongoDB.

    Raises DropItem on the first falsy field key; otherwise inserts the
    item into ``self.coll`` and logs at DEBUG level.

    NOTE(review): iterating a Scrapy item yields its *keys*, so this only
    rejects falsy field names, not missing values -- confirm intent.
    """
    valid = True
    for data in item:
        if not data:
            valid = False
            # Fixed typo/spacing in the drop message ('Missming{}!'),
            # matching the "Missing {0}!" wording used by the sibling pipelines.
            raise DropItem('Missing {0}!'.format(data))
    if valid:
        self.coll.insert(dict(item))
        log.msg('item added to mongodb database !', level=log.DEBUG, spider=spider)

    return item
pipelines.py 文件源码 项目:FreeFoodCalendar 作者: Yuliang-Zou 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Reject items that contain a falsy field key; insert the rest into MongoDB."""
    for field in item:
        if not field:
            raise DropItem("Missing {0}!".format(field))
    # Reaching here means no falsy key was found (the raise above aborts).
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
pipelines.py 文件源码 项目:FreeFoodCalendar 作者: Yuliang-Zou 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Drop items holding an empty field name; store valid ones in MongoDB."""
    _absent = object()
    bad = next((f for f in item if not f), _absent)
    if bad is not _absent:
        raise DropItem("Missing {0}!".format(bad))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
duplicates_pipeline.py 文件源码 项目:bigdata_data 作者: htzy 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Filter out items whose 'link' value was already processed."""
    link = item['link']
    if link not in self.ids_seen:
        self.ids_seen.add(link)
        return item
    raise DropItem("Duplicate item found:%s" % item)
content_pipeline.py 文件源码 项目:bigdata_data 作者: htzy 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
        # Drop items whose title or abstract matches a blacklist regex
        # (case-insensitive); pass everything else through unchanged.
        # NOTE(review): the u'window|??|??|????' patterns contain mojibake --
        # the non-ASCII alternatives were garbled by a bad encoding and the
        # intended keywords must be restored from the original source.
        # Python 2 code (uses print statements).
        if re.search(u'window|??|??|????', item['title'], re.I):
            print "ignore this item"
            raise DropItem("Contains word that you don't want: %s" % item['title'])
        elif re.search(u'window|??|??|????', item['abstract'], re.I):
            print "ignore this item"
            raise DropItem("Contains word that you don't want: %s" % item['abstract'])
        else:
            return item
pipelines.py 文件源码 项目:SinaWeiboSpider 作者: wen-fei 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Insert the item into the MongoDB collection named after its class.

    Raises DropItem when MongoDB reports a duplicate key so Scrapy
    actually discards the item.  The original ``return DropItem(...)``
    was a bug: it passed the exception *instance* down the pipeline
    instead of dropping the item.
    """
    collection_name = item.__class__.__name__
    try:
        self.db[collection_name].insert(dict(item))
    except DuplicateKeyError:
        # Bug fix: DropItem must be raised, not returned.
        raise DropItem("Duplicate item found: %s" % item)
    else:
        return item
pipelines.py 文件源码 项目:airbnb_scraper 作者: bashedev 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Drop items not fitting parameters. Open in browser if specified. Return accepted items."""

    # Explicit skip list wins over every other filter.
    if self._skip_list and str(item['id']) in self._skip_list:
        raise DropItem('Item in skip list: {}'.format(item['id']))

    if self._minimum_monthly_discount and 'monthly_discount' in item:
        if item['monthly_discount'] < self._minimum_monthly_discount:
            raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))

    if self._minimum_weekly_discount and 'weekly_discount' in item:
        # Bug fix: this compared against _minimum_monthly_discount
        # (copy-paste error), making the weekly threshold ineffective.
        if item['weekly_discount'] < self._minimum_weekly_discount:
            raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))

    # Reject items whose checked fields match the blacklist regex.
    if self._cannot_have_regex:
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._cannot_have_regex.search(v):
                raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))

    # Require at least one checked field to match the whitelist regex.
    if self._must_have_regex:
        has_must_haves = False
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._must_have_regex.search(v):
                has_must_haves = True
                break

        if not has_must_haves:
            raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))

    # Optionally open accepted listings in the configured browser.
    if self._web_browser:
        webbrowser.get(self._web_browser).open(item['url'])

    return item
pipelines.py 文件源码 项目:pythonStudy 作者: jeikerxiao 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def item_completed(self, results, item, info):
    """Ensure at least one image downloaded successfully; drop the item otherwise."""
    successful = [data['path'] for succeeded, data in results if succeeded]
    if not successful:
        raise DropItem("Item contains no images")
    return item
pipelines.py 文件源码 项目:pythonStudy 作者: jeikerxiao 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def item_completed(self, results, item, info):
    """Keep the item only if at least one image was fetched successfully."""
    downloaded = [entry['path'] for ok, entry in results if ok]
    if len(downloaded) == 0:
        raise DropItem("Item contains no images")
    return item
pipelines.py 文件源码 项目:jd_spider 作者: samrayleung 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Validate field keys, then insert the item into the MongoDB
    collection named by ``item['item_name']``.

    Raises DropItem on a falsy field key, or when the insert fails with
    a write error or a missing key.
    """
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            # Removed unused local ``key = {}`` from the original.
            self.db[item['item_name']].insert(dict(item))
            logging.debug("add {}".format(item['item_name']))
        except (pymongo.errors.WriteError, KeyError) as err:
            # NOTE(review): the message reads item['good_name'] while the
            # insert keys on item['item_name'] -- confirm which is intended.
            raise DropItem(
                "Duplicated comment Item: {}".format(item['good_name']))
    return item
pipelines.py 文件源码 项目:crawler 作者: brantou 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Deduplicate items on their 'pid' field."""
    pid = item['pid']
    if pid in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    self.ids_seen.add(pid)
    return item
pipelines.py 文件源码 项目:scrapy-image 作者: lamphp 项目源码 文件源码 阅读 42 收藏 0 点赞 0 评论 0
def item_completed(self, results, item, info):
    """For the 'sisy' spider, record successful image paths on the item and
    drop items with no downloads; other spiders fall through (returning
    None, exactly as before)."""
    if info.spider.name != 'sisy':
        return None
    paths = [payload['path'] for ok, payload in results if ok]
    if not paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = paths
    return item
duplicate_removal.py 文件源码 项目:wechat-crawler 作者: DMGbupt 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Drop WeChat items whose article address was already crawled.

    Non-WeChat items fall through and (as before) implicitly return None.
    The original garbled docstring has been replaced; it appeared to
    describe this duplicate-removal step.
    """
    # Only WeChat items are checked for duplicates.
    if isinstance(item, WeChat):
        if self.is_duplicate_wechat(item):
            # Bug fix: DropItem must be *raised* for Scrapy to discard the
            # item; returning it passed the exception down the pipeline.
            raise DropItem("Duplicate news found: %s" % item['article_addr'])
        else:
            return item
pipelines.py 文件源码 项目:ip_proxy_pool 作者: leeyis 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Deduplicate proxies per day via a Redis key of the form
    ip_port:<YYYYMMDD>:<ip_port>."""
    key = 'ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port'])
    if Redis.exists(key):
        raise DropItem("Duplicate item found: %s" % item)
    Redis.set(key, 1)
    return item
pipelines.py 文件源码 项目:autoinjection 作者: ChengWiLL 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def __getValue(self,url):
        # Scan the URL against every query-parameter marker in self.valuedict,
        # dropping URLs whose prefix (text before the first matching marker)
        # was already seen; otherwise record the prefix and write the URL out
        # exactly once.  Python 2 code (uses a print statement).
        isHaveManyQueryInUrl = False
        for value in self.valuedict:
            # Markers appear to be stored one per line; strip the newline.
            div_by_value = url.split(value.rstrip('\n'))
            # div_by_value[0] is the URL prefix before the marker; when the
            # marker is absent, it is the whole URL.
            mm = div_by_value[0]
            if mm in self.seen:
                raise DropItem('Duplicate link %s' % url)
            elif len(div_by_value) > 1 and not isHaveManyQueryInUrl:
                # First marker hit: remember the prefix and persist the URL.
                # The flag guarantees at most one write even if several
                # markers match.
                self.seen.add(mm)
                isHaveManyQueryInUrl = True
                line = url+'\n'
                print url
                self.file.write(line)
pipelines.py 文件源码 项目:Newscrawler 作者: JBH168 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def process_item(self, item, spider):
    """Discard items whose HTTP response status is anything other than 200."""
    if item['spider_response'].status == 200:
        return item
    # A raised DropItem stops the item from being processed further.
    raise DropItem("%s: Non-200 response" % item['url'])


问题


面经


文章

微信
公众号

扫码关注公众号