def process_item(self, item, spider):
    """Queue a newly scraped proxy in the store behind ``self.conn``.

    Anything that is not a ``ProxyItem`` is handed back untouched. For a
    ``ProxyItem``, missing optional fields receive defaults, the proxy URL
    (``<protocol>://<proxy>``) is looked up in the four proxy sets, and an
    unseen proxy is written through a single pipeline.

    NOTE(review): ``self.conn`` is used like a Redis client (``sismember``,
    ``pipeline``, ``sadd``, ``zadd``, ``hmset``, ``hset``) -- confirm the
    client class and version, since the positional
    ``zadd(name, member, score)`` form depends on it.

    Args:
        item: The scraped item.
        spider: The spider that produced the item; not used here.

    Returns:
        The same ``item`` object that was passed in.

    Raises:
        DropItem: If ``ip`` or ``port`` is missing or falsy, or if the
            proxy URL is already a member of one of the proxy sets.
    """
    if not isinstance(item, ProxyItem):
        return item

    # Both fields must be present and truthy for the item to be usable.
    if not (item.get('ip', None) and item.get('port', None)):
        raise DropItem('Bad ProxyItem')

    # Fill optional fields only when the item does not already carry them.
    fallbacks = (
        ('addr', 'Unknown'),
        ('mode', 'Unknown'),
        ('protocol', 'http'),
        ('validation_time', 'Unknown'),
    )
    for field_name, fallback in fallbacks:
        item.setdefault(field_name, fallback)

    # NOTE(review): 'proxy' is read without being validated above; only
    # 'ip' and 'port' are checked -- confirm every ProxyItem sets it.
    proxy = '{scheme}://{address}'.format(
        scheme=item['protocol'],
        address=item['proxy'],
    )

    # Stop at the first set that already contains this proxy URL.
    tracked_sets = (
        'rookie_proxies',
        'available_proxies',
        'lost_proxies',
        'dead_proxies',
    )
    if any(self.conn.sismember(set_name, proxy) for set_name in tracked_sets):
        raise DropItem('Already in the waiting list')

    address = item['proxy']
    info_key = 'proxy_info:' + address

    # False is passed positionally to pipeline(); presumably this turns off
    # transactional wrapping -- TODO confirm against the client in use.
    batch = self.conn.pipeline(False)
    batch.sadd('rookie_proxies', proxy)
    batch.zadd('rookies_checking', address, time.time())
    batch.hmset(info_key, dict(item))
    batch.hset(info_key, 'failed_times', 0)
    batch.execute()

    return item
评论列表
文章目录