def process_item(self, item, spider):
if spider.name == 'RssCrawler':
# Search the CurrentVersion table for a version of the article
try:
self.cursor.execute(self.compare_versions, (item['url'],))
except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
pymysql.IntegrityError, TypeError) as error:
self.log.error("Something went wrong in rss query: %s", error)
# Save the result of the query. Must be done before the add,
# otherwise the result will be overwritten in the buffer
old_version = self.cursor.fetchone()
if old_version is not None:
# Compare the two download dates. index 3 of old_version
# corresponds to the download_date attribute in the DB
if (datetime.datetime.strptime(
item['download_date'], "%y-%m-%d %H:%M:%S") -
old_version[3]) \
< datetime.timedelta(hours=self.delta_time):
raise DropItem("Article in DB too recent. Not saving.")
return item
def process_item(self, item, spider):
keywords = spider.search_terms
title = item['title'].lower()
#####
    # We can pass in excluded words the same way as keywords later; commented out
    # for now (see the sketch after this snippet).
# excluded_words = ['asp.net', 'java', 'c#', 'web developer', 'c++',
# 'windows', 'qa', 'support', '.net', 'manager', 'sales',
# 'marketing', 'senior', 'snr', 'salesforce', 'crm']
#####
#####
# if any(keyword in title for keyword in excluded_words):
# raise DropItem("Job title contained excluded word")
#####
if any(keyword in title for keyword in keywords):
return item
else:
raise DropItem("Job title doesn't contain our search terms")
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
            raise DropItem('Missing {0}!'.format(data))
if valid:
self.collection.insert(dict(item))
        log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)
return item
# def testdb(self):
# # Connect to MongoHQ
# con = pymongo.Connection("paulo.mongohq.com",10042)
# db = con.mytest
# db.authenticate("root", "sa123")
# db.urllist.drop()
def process_item(self, item, spider):
if isinstance(item, SsptransparenciaBO):
key = 'bos'
_id = item['id']
elif isinstance(item, SsptransparenciaVitima):
key = 'vitimas'
_id = '%s::%s' % (item['bo_id'], item['count'])
elif isinstance(item, SsptransparenciaNatureza):
key = 'naturezas'
_id = '%s::%s' % (item['bo_id'], item['count'])
if _id in self.ids_seen[key]:
raise DropItem('Duplicate item found: %s' % item)
else:
self.ids_seen[key].add(_id)
return item
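The pipeline above indexes self.ids_seen by item type; a minimal sketch of how that structure could be initialized (the open_spider hook is an assumption, not shown in the original):

class DeduplicationPipeline(object):
    """Sketch: one set of already-seen ids per item type used above."""

    def open_spider(self, spider):
        self.ids_seen = {
            'bos': set(),
            'vitimas': set(),
            'naturezas': set(),
        }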
def process_item(self, item, domain):
now = arrow.now()
seen = self.check_seen_before(item)
if len(seen) > 0:
last_seen = max(seen)
time_limit = now.replace(**self.time_scale).timestamp
if last_seen < time_limit:
self.insert_item_price(item, now.timestamp)
raise DropItem("Already seen %s, %s" % (item['url'], arrow.get(last_seen).humanize()))
else:
self.insert_item_price(item, now.timestamp)
self.insert_item_main(item)
self.insert_item_tag_list(item)
self.insert_item_description(item)
self.conn.commit()
return item
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
try:
# key = {}
# key['sku_id'] = item['sku_id']
# self.db[item['item_name']].update(key, dict(item), upsert=True)
self.db[item['item_name']].insert(dict(item))
logging.debug("add {}".format(item['item_name']))
except (pymongo.errors.WriteError, KeyError) as err:
raise DropItem("Duplicated Item: {}".format(item['name']))
return item
def process_item(self, item, spider):
db_matches = db.session.query(DBMenuEntry).filter_by(
category=item['category'],
mensa=item['mensa'],
description=item['description'],
date_valid=item['date_valid'],
allergens=item['allergens'],
price=item['price']
).all()
if db_matches:
# If there is more than one matching entry in the database, we probably
# already saved a duplicate by accident. I really hope that doesn't happen.
assert(len(db_matches) == 1)
spider.crawler.stats.inc_value('items_already_in_db')
raise DropItem(
"Menu item already found in database.\n"
"Previously scraped on: {previous_scrape_time}".format(
previous_scrape_time=str(db_matches[0].time_scraped)))
else:
return item
def item_completed(self, results, item, info):
'''
:param results:
:param item:
:param info:
:return:
    item_completed() is called once all image requests for a single item have
    completed (either downloaded successfully or failed for some reason); see
    the results example after this snippet.
'''
spiderName = self.spiderinfo.spider.name
if spiderName == 'jiandan':
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
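For reference, the results argument of item_completed() is a list of (success, info) two-tuples; for successful downloads info is a dict with keys such as 'url', 'path' and 'checksum', which is why the comprehension above keeps x['path'] only when ok is true. An illustrative shape (values are made up):

results = [
    (True, {'url': 'http://example.com/a.jpg',
            'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',
            'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
    (False, Exception('download failed')),  # failures carry the error instead of a dict
]
image_paths = [x['path'] for ok, x in results if ok]  # -> ['full/0a79c461a4062ac383dc4fade7bc09f1.jpg']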
def process_item(self, item, spider):
if spider.name == 'RssCrawler':
# Search the CurrentVersion table for a version of the article
try:
self.cursor.execute(self.compare_versions, (item['url'],))
except mysql.connector.Error as error:
self.log.error("Something went wrong in rss query: %s", error)
# Save the result of the query. Must be done before the add,
# otherwise the result will be overwritten in the buffer
old_version = self.cursor.fetchone()
if old_version is not None:
# Compare the two download dates. index 3 of old_version
# corresponds to the download_date attribute in the DB
if (datetime.datetime.strptime(
item['download_date'], "%y-%m-%d %H:%M:%S") -
old_version[3]) \
< datetime.timedelta(hours=self.delta_time):
raise DropItem("Article in DB too recent. Not saving.")
return item
def process_item(self, item, spider):
def raise_if_missing(name, item):
if name not in item:
            raise DropItem('The required field "{}" is missing in: {}.'
                           .format(name, item))
# Required fields for all items
for required in ('id', 'title', 'link'):
raise_if_missing(required, item)
# Required fields for FeedEntryItems
if isinstance(item, FeedEntryItem):
for required in ('updated',):
raise_if_missing(required, item)
return item
def process_item(self, item, spider):
if not isinstance(item, ProxyItem):
return item
if not item.get('ip', None) or not item.get('port', None):
raise DropItem('Bad ProxyItem')
item.setdefault('addr', 'Unknown')
item.setdefault('mode', 'Unknown')
item.setdefault('protocol', 'http')
item.setdefault('validation_time', 'Unknown')
proxy = '{}://{}'.format(item['protocol'], item['proxy'])
if self.conn.sismember('rookie_proxies', proxy) or\
self.conn.sismember('available_proxies', proxy) or\
self.conn.sismember('lost_proxies', proxy) or\
self.conn.sismember('dead_proxies', proxy):
raise DropItem('Already in the waiting list')
key = 'proxy_info:'+item['proxy']
pipe = self.conn.pipeline(False)
pipe.sadd('rookie_proxies', proxy)
pipe.zadd('rookies_checking', item['proxy'], time.time())
pipe.hmset(key, dict(item))
pipe.hset(key, 'failed_times', 0)
pipe.execute()
return item
def process_item(self, item, spider):
title = item.get('title', 'title_not_set')
if title == 'title_not_set':
err_msg = 'Missing title in: %s' % item.get('url')
raise DropItem(err_msg)
raw_content = item.get('raw_content', 'raw_content_not_set')
if raw_content == 'raw_content_not_set':
err_msg = 'Missing raw_content in: %s' % item.get('url')
raise DropItem(err_msg)
published_at = item.get('published_at', 'published_at_not_set')
if published_at == 'published_at_not_set':
err_msg = 'Missing published_at in: %s' % item.get('url')
raise DropItem(err_msg)
# Pass item to the next pipeline, if any
return item
def process_item(self, item, spider):
try:
data = {
'url': item['url'],
'file_name': item['file_name'],
'media_type': item['media_type'],
'host': item['host'],
'file_dir': item['file_dir'],
'download': item['download'],
'extract': item['extract'],
'info': item['info'],
'stack': item['stack'],
'media_urls': item['media_urls'],
}
self.col.update({'url': item['url']}, data, upsert=True)
# self.col.update({'url': item['url']}, {'$set': {'info': item['info']}})
# self.col.insert(data)
    except Exception as err:
logging.error(str(err))
raise DropItem(str(err))
return item
def __insert_item(self, item=None):
item, self.items = self.items, item
item.pop('index', None)
try:
data = {
'url': item['url'],
'file_name': item['file_name'],
'media_type': item['media_type'],
'host': item['host'],
'file_dir': item['file_dir'],
'download': item['download'],
'extract': item['extract'],
'info': item['info'],
'stack': item['stack'],
'media_urls': item['media_urls'],
}
self.col.update({'url': item['url']}, data, upsert=True)
# self.col.insert(data)
    except Exception as err:
logging.error(str(err))
raise DropItem(str(err))
return item
def process_item(self, item, spider):
if self.site_item_exist(item):
self.MG_table.insert(dict(item))
logging.debug("Question added to MongoDB database!")
# log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
'''
        Scrapy provides 5 logging levels:
        CRITICAL - critical errors
        ERROR    - regular errors
        WARNING  - warning messages
        INFO     - informational messages
        DEBUG    - debugging messages (the default level)
        See the LOG_LEVEL example after this snippet.
'''
else:
raise DropItem("{} is exist".format(item['url']))
return item
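The log levels listed above are controlled through Scrapy's LOG_LEVEL setting, and module-level loggers come from the standard logging module; a small illustration:

# settings.py
LOG_LEVEL = 'INFO'   # suppress DEBUG output; INFO and above are still emitted

# inside a pipeline or spider module
import logging
logger = logging.getLogger(__name__)
logger.debug("Question added to MongoDB database!")  # hidden when LOG_LEVEL = 'INFO'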
def process_item(self, item, spider):
"""Main function that process URL item (first phase)."""
# validate URL length
if len(item['raw']) > MAX_URL_LEN:
item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncating it! %r', item['raw'])
# parse raw URL
purl = get_parsed_url(item['raw'])
if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
site_id = belongs_to_site(purl.hostname, self.site_tuples)
if site_id is None:
        raise DropItem('Offsite domain: %s' % item)
item['site_id'] = site_id
# insert URL into table
try:
get_or_create_murl(spider.session, item, spider.platform_id)
except SQLAlchemyError as e:
logger.error(e)
spider.session.rollback()
        raise DropItem('Failed to insert URL into database: %s' % item)
return item
def process_item(self, item, spider):
"""Check if we need to store the item and decide whether to notify.
"""
# check if already in the database
stored = self.jobs_collection.find_one({'url': item['url']})
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
if stored:
item = stored
item['times_seen'] += 1
self.jobs_collection.update(
{'_id': item['_id']}, dict(item), False)
else:
# if not (and if not already set), add date to item
if not item.get('date_added', False):
item['date_added'] = datetime.now().isoformat()
if not item.get('date_posted', False):
item['date_posted'] = datetime.now().isoformat()
item['times_seen'] = 0
self.jobs_collection.insert(item)
return item
def _convert(self, item, spider):
image_paths = [im['path'] for im in item['images']]
datapath = spider.crawler.settings['FILES_STORE']
image_files = [datapath + path for path in image_paths]
item['pdf_file'] = '%s.pdf' % item['id']
dest = '{root}/{spider}/{file}'.format(
root=datapath,
spider=item['spider'],
file=item['pdf_file'],
)
print "file:"+dest
# Use convert command from ImageMagick.
cmd = ['convert'] + image_files + [dest]
try:
# TODO: capture errors
subprocess.check_call(cmd, stdout=subprocess.PIPE)
except subprocess.CalledProcessError as detail:
        print(detail)
raise DropItem("failed to generate PDF")
return item
def process_item(self, item, spider):
str = ""
for e in item["bookinfo"]:
if re.search(r'^\s*$', e):
print "drop this element"
else:
str = str + e + ","
item["bookinfo"] = str[:-1]
if item['name']:
if item['author']:
return item
else:
raise DropItem("Missing name or author in %s" % item)
def process_item(self, item, spider):
    item_keywords = judge_key_words(item)  # check whether the item contains any keywords
    if item_keywords:  # keep the item only if it matched at least one keyword
item["keywords"] = item_keywords
return item
else:
logger = logging.getLogger(spider.name)
logger.info("No keyword in %s" % item["news_url"])
raise DropItem("No keyword in %s" % item["news_url"])
def process_item(self, item, spider):
"""check item weather in item_seen
"""
if item['hash'] in self.item_seen:
raise DropItem('Duplicate item found: %s' %item)
else:
self.item_seen.add(item['hash'])
return item
def process_item(self, item, spider):
"""return ip is duplicate or not
:item: crawl item including host port
:returns: return item or DropItem
"""
if 'ip' not in item:
        raise DropItem('Missing ip field')
port = item.get('port', 80)
host = '%s:%s' % (item['ip'], port)
if self.conn.sismember(settings.HOST_S, host) or self.dup_in_queue(host):
raise DropItem('%s, cause duplicate' % (host))
else:
return item
def process_item(self, item, spider):
"""save to redis and return item
:item: crawl item including host port
:returns: return item or DropItem
"""
if 'ip' not in item:
        raise DropItem('Missing ip field')
port = item.get('port', 80)
host = '%s:%s' % (item['ip'], port)
self.conn.sadd(self.host_s, host)
return item
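Both Redis-backed pipelines above assume a self.conn client plus set names such as settings.HOST_S / self.host_s; a minimal sketch of that setup (host, port, db and the set name are assumptions):

import redis

class RedisProxyPipeline(object):
    """Sketch: connection and key name used by the dedup/save pipelines above."""

    def open_spider(self, spider):
        self.conn = redis.StrictRedis(host='localhost', port=6379, db=0)
        self.host_s = 'available_hosts'  # assumed Redis set name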
def process_item(self, item, spider):
job_title_company = item['title'] + item['company']
if job_title_company in self.title_company:
raise DropItem("Duplicate item found: %s" % (item))
else:
self.title_company.add(job_title_company)
return item
def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        except Exception as err:
            log.msg("Failed to add question to MongoDB: %s" % err,
                    level=log.ERROR, spider=spider)
    return item
def process_item(self, item, spider):
print "------"
if item.keys() >= 5:
if item in self.has:
raise DropItem("Duplicate item found: %s" % item)
else:
self.has.add(item)
return item
# MongoDB
def process_item(self, item, spider):
if item['pid'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
        self.ids_seen.add(item['pid'])
return item
def process_item(self, item, spider):
if item['link'] in self.seen:
raise DropItem('Duplicate link %s' % item['link'])
self.seen.add(item['link'])
line = json.dumps(dict(item), ensure_ascii=False) + '\n'
self.file.write(line)
return item
def process_item(self, item, spider):
    if not re.match('.*comment.*', item['link']):
        if re.match(r'^http.*qq\.com.*\.s?html?$', item['link']):
if item['link'] in self.seen:
raise DropItem('Duplicate link %s' % item['link'])
self.seen.add(item['link'])
line = json.dumps(dict(item), ensure_ascii=False) + '\n'
self.file.write(line)
return item
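The two JSON Lines pipelines above write to self.file and dedupe through self.seen; a minimal sketch of the setup and teardown around them (the output file name is an assumption):

class JsonLinesDedupPipeline(object):
    """Sketch: open/close handling for the JSON Lines pipelines above."""

    def open_spider(self, spider):
        self.seen = set()
        self.file = open('items.jl', 'w', encoding='utf-8')  # illustrative file name

    def close_spider(self, spider):
        self.file.close()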