def process_request(self, request, spider):
    # don't use this middleware while testing whether the site is up
    if hasattr(spider, "test") and spider.test == "yes":
#logger = logging.getLogger()
#logger.info("Testing mode, dead domains disabled")
return None
if not Domain.is_onion_url(request.url):
return None
domain = Domain.find_by_url(request.url)
if not domain or domain.is_up:
return None
raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
def process_request(self, request, spider):
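    # Test mode only: drop root-page requests for onion domains whose next
    # scheduled check has not come due yet; all other requests pass through.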
parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or parsed_url.path not in ["/", ""]:
return None
if not Domain.is_onion_url(request.url):
return None
d = Domain.find_by_url(request.url)
if d is None:
return None
now = datetime.now()
if now > d.next_scheduled_check:
return None
else:
raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
def _redirect(self, redirected, request, spider, reason):
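    # Follow the redirect while recording the redirect chain and bumping the
    # request priority; give up once max_redirect_times is exceeded.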
reason = response_status_message(reason)
redirects = request.meta.get('redirect_times', 0) + 1
if redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
[request.url]
redirected.meta['priority'] = redirected.meta['priority'] + self.priority_adjust
self.logger.debug("Redirecting %s to %s from %s for %s times " % (
reason, redirected.url, request.url, redirected.meta.get("redirect_times")))
return redirected
else:
self.logger.info("Discarding %s: max redirections reached" % request.url)
if request.meta.get("callback") == "parse":
self.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
self.logger.error(
" in redicrect request error to failed pages url:%s, exception:%s, meta:%s" % (
request.url, reason, request.meta))
raise IgnoreRequest("max redirections reached:%s" % reason)
def _retry(self, request, reason, spider):
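    # Retry the request with a new proxy up to max_retry_times; "next page"
    # requests are always re-yielded, and exhausted requests are dropped.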
spider.change_proxy = True
retries = request.meta.get('retry_times', 0) + 1
if request.meta.get("if_next_page"):
self.logger.debug("in _retry re-yield next_pages request: %s, reason: %s. " % (request.url, reason))
return request.copy()
elif retries <= self.max_retry_times:
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
retryreq.meta['priority'] = retryreq.meta['priority'] + self.crawler.settings.get(
"REDIRECT_PRIORITY_ADJUST")
self.logger.debug("in _retry retries times: %s, re-yield request: %s, reason: %s" % (
retries, request.url, reason))
return retryreq
else:
if request.meta.get("callback") == "parse":
spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
self.logger.error(
"retry request error to failed pages url:%s, exception:%s, meta:%s" % (
request.url, reason, request.meta))
self.logger.info("Gave up retrying %s (failed %d times): %s" % (request.url, retries, reason))
raise IgnoreRequest("%s %s" % (reason, "retry %s times. "%retries))
def process_request_method_wrapper(func):
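    # Decorator for downloader-middleware process_request methods: log any
    # unexpected exception, record the failed download, and re-raise it as
    # IgnoreRequest so the request is dropped.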
@wraps(func)
def wrapper_method(*args, **kwds):
self = args[0]
request = kwds.get("request")
spider = kwds.get("spider")
try:
return func(*args, **kwds)
except Exception as e:
spider.logger.error("error heppened in process_request method of %s in %s. Error:%s, processing %s," % (
self.__class__.__name__, IP, traceback.format_exc(), request.url))
spider.crawler.stats.set_failed_download(request.meta, str(e))
raise IgnoreRequest(e)
return wrapper_method
def process_response_method_wrapper(func):
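    # Same wrapper as above, but for process_response methods: unexpected
    # exceptions are logged, recorded, and converted into IgnoreRequest.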
@wraps(func)
def wrapper_method(*args, **kwds):
self = args[0]
request = kwds.get("request")
response = kwds.get("response")
spider = kwds.get("spider")
try:
return func(*args, **kwds)
except Exception as e:
spider.logger.error("error heppened in process_response method of %s in %s. Error:%s, processing %s," % (
self.__class__.__name__, IP, traceback.format_exc(), response.url))
spider.crawler.stats.set_failed_download(request.meta, str(e))
raise IgnoreRequest(e)
return wrapper_method
def process_response(self, request, response, spider):
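    # Drop responses with a blacklisted HTTP status (resetting the session and
    # notifying the scheduler) and rotate the session after _counter_max requests.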
try:
if response.status in self._http_status_codes:
            raise BlacklistError(response, u'HTTP status {}'.format(response.status))
self._counter += 1
if self._counter > self._counter_max:
logger.debug(u'Max requests: Change IP')
self._reset_session()
return response
except BlacklistError as ex:
logger.debug(
u'Ignoring Blacklisted response %(response)r: %(message)r',
{'response': response, 'message': ex.message}, extra={'spider': spider},
)
self._reset_session()
self.scheduler.process_exception(request, ex, spider)
raise IgnoreRequest()
def process_exception(self, request, exception, spider):
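    # On a download error through a proxy, downvote that proxy and, for known
    # network errors, retry the request so a different proxy is picked.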
if 'proxy' not in request.meta: return
if isinstance(exception, IgnoreRequest): return # No problem
mode=request.meta.get('proxy_mode', self.mode) # Possible override
if mode == 'once': # Try once mode, quit here
return
# Simple downvote
self.pp.set_status(self.map_proxy(request.meta['proxy']), None)
del request.meta['proxy'] # Will pick new proxy on next request
# List of conditions when we retry. Some of them may disable the proxy (TBD)
if type(exception) in (
ConnectionRefusedError, ConnectError, TimeoutError,
TCPTimedOutError, NoRouteError, ResponseNeverReceived,
ResponseFailed, TunnelError ):
        lg.error('%s on %s', type(exception), request.url)
return request.replace(dont_filter = True)
def process_response(self, request, response, spider): # pylint:disable=unused-argument
"""
    Only allow HTTP response types whose Content-Type matches the given
    list of filtering regexes.
"""
# to specify on a per-spider basis
# type_whitelist = getattr(spider, "response_type_whitelist", None)
type_whitelist = (r'text', )
content_type_header = response.headers.get('content-type', None)
if content_type_header and self.is_valid_response(type_whitelist,
content_type_header):
return response
else:
msg = "Ignoring request {}, content-type was not in whitelist" \
.format(response.url)
logging.info(msg)
raise IgnoreRequest()
def process_response(self, request, response, spider):
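    # Detect "robot check" (captcha) pages and switch to the captcha-cracking
    # flow; give up once crack_retry_count exceeds MAX_RETRY.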
if request.meta.get('crack_retry_count', 0) > self.MAX_RETRY:
raise IgnoreRequest('Max retries exceeded %s' % request.meta.get('original_request', request))
if isinstance(response, HtmlResponse) and 'robot check' in ''.join([x.strip().lower() for x in response.xpath('//title/text()').extract()]):
self.cracking = True
self.crawler.stats.inc_value('robot_check')
# Log the url of the original request that got blocked
self.logger.warning('robot check {}'.format(request.meta.get('original_request') or request))
return self.request_image(request, response)
elif request.meta.get('image_request', False):
self.logger.debug('processing image {}'.format(request))
return self.process_image(request, response)
else:
self.cracking = False
return response
def process_request(self, request, spider):
if not Domain.is_onion_url(request.url):
return None
parsed_url = urlparse.urlparse(request.url)
host = parsed_url.hostname
subdomains = host.count(".")
if subdomains > 2:
raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
return None
def process_request(self, request, spider):
parsed_url = urlparse.urlparse(request.url)
host = parsed_url.hostname
if self.counter[host] < self.max_pages:
self.counter[host] += 1
spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
return None
else:
raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
def process_response(self,request,response,spider):
    logging.info('response url %s with proxy %s got status %s' % (response.url, request.meta['proxy'], response.status))
if response.status != 200:
        if response.status in (301, 404):
Sup.letpagesgo(response.url)
raise IgnoreRequest('found no pages')
else:
Sup.deleteProxy(request)
new_request = request.copy()
new_request.dont_filter = True
return new_request
else:
return response
# switch to a new proxy IP
def process_request(self, request, spider):
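    # Deduplicate by URL: drop the request if the MD5 of its URL is already in
    # the spider's Redis set, otherwise record it and let it through.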
if not request.url:
return None
url_hash = hashlib.md5(request.url.encode("utf8")).hexdigest()
if self.redis_client.sismember(spider.name, url_hash):
raise IgnoreRequest("Spider : %s, IgnoreRequest : %s" % (spider.name, request.url))
else:
self.redis_client.sadd(spider.name, url_hash)
def _process_requests(self, items_or_requests, start=False):
"""Acquire the webdriver manager when it's available for requests."""
error_msg = "WebdriverRequests from start_requests can't be in-page."
for request in iter(items_or_requests):
if isinstance(request, WebdriverRequest):
if start and isinstance(request, WebdriverActionRequest):
raise IgnoreRequest(error_msg)
request = self.manager.acquire(request)
if request is WebdriverRequest.WAITING:
continue # Request has been enqueued, so drop it.
yield request
def process_response(self, request, response, spider):
    # handle the response according to its HTTP status code
http_code = response.status
if http_code // 100 == 2:
self.stats.inc_value('response/%d'%http_code, spider=spider)
return response
    # handle 3xx redirects other than 304
if http_code // 100 == 3 and http_code != 304:
self.stats.inc_value('response/%d'%http_code, spider=spider)
        # get the redirect target url from the Location header
url = response.headers['location']
domain = urlparse.urlparse(url).netloc
        # only follow the redirect if the target url's domain is in allowed_domains
if domain in spider.allowed_domains:
return Request(url=url, meta=request.meta)
else:
raise IgnoreRequest(u'not allowed to crawl')
if http_code // 100 == 4 and http_code != 403:
self.stats.inc_value('response/%d'%http_code, spider=spider)
        # for 4xx responses other than 403, ignore the request
raise IgnoreRequest(u'404')
if http_code // 100 == 5:
self.stats.inc_value('response/%d'%http_code, spider=spider)
return request
    # handle meta refresh redirects in the page body
url = html.get_html_meta_refresh(response)
if url:
self.stats.inc_value('response/metarefresh', spider=spider)
domain = urlparse.urlparse(url).netloc
        # only follow the meta refresh if the target url's domain is in allowed_domains
if domain in spider.allowed_domains:
return Request(url=url, meta=request.meta)
def callback(self, result):
if result:
self.logger.info('%s has been cached', self.request.url)
raise IgnoreRequest('%s has been cached'%self.request.url)
def process_exception(self, request, exception, spider):
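    # Retry on the configured network exceptions unless dont_retry is set;
    # anything else is recorded as a failed page and dropped.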
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
and not request.meta.get('dont_retry', False):
return self._retry(request, "%s:%s" % (exception.__class__.__name__, exception), spider)
else:
if request.meta.get("callback") == "parse":
spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
self.logger.error("in retry request error %s" % traceback.format_exc())
raise IgnoreRequest("%s:%s unhandle error. " % (exception.__class__.__name__, exception))
def process_exception(self, request, exception, spider):
logger.debug(
u'Ignoring Exception: %(message)r',
{'message': exception.message}, extra={'spider': spider},
)
self._reset_session()
self.scheduler.process_exception(request, exception, spider)
raise IgnoreRequest()
def process_request(self, request, spider): # pylint:disable=unused-argument
"""Process incoming request."""
parsed_uri = urlparse(request.url)
    domain = parsed_uri.netloc
banned_domains = settings.get('BANNED_DOMAINS')
    if hashlib.md5(domain.encode("utf-8")).hexdigest() in banned_domains:
# Do not execute this request
request.meta['proxy'] = ""
msg = "Ignoring request {}, This domain is banned." \
.format(request.url)
logging.info(msg)
raise IgnoreRequest()
def process_request(self, request, spider): # pylint:disable=unused-argument
"""Process incoming request."""
hostname = urlparse(request.url).hostname
if len(hostname.split(".")) > 4:
# Do not execute this request
request.meta['proxy'] = ""
msg = "Ignoring request {}, too many sub domains." \
.format(request.url)
logging.info(msg)
raise IgnoreRequest()
def _redirect(self, redirected, request, spider, reason):
if self.domain_limit(spider) and \
get_domain(redirected.url) != get_domain(request.url):
raise IgnoreRequest('Redirecting off-domain')
return super()._redirect(redirected, request, spider, reason)
def process_request(self, request, spider):
if request.meta.get('from_qtwebkit', False):
ext = urlparse(request.url).path.rsplit('.', 1)[-1]
if ext in {'css', 'gif', 'png'}:
raise IgnoreRequest()
def process_request(self, request, spider):
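    # Skip URLs already present in the MongoDB collection for this spider
    # (any record whose download status is not -1 counts as crawled).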
if self.col.find_one({'$and': [
{'host': spider.name},
{'url': request.url},
# {'download': {'$in': [0, 1, 2]}}
{'download': {'$ne': -1}},
]}):
        logging.warning('the page has already been crawled, url is {0}'.format(request.url))
raise IgnoreRequest()
return None
def process_request(self, request, spider):
url = request.url.split('?')[0]
if self.col.find_one({'$and': [
{'host': spider.name},
{'url': url},
# {'download': {'$in': [0, 1, 2]}}
{'download': {'$ne': -1}},
]}):
        logging.warning('the page has already been crawled, url is {0}'.format(request.url))
raise IgnoreRequest()
return None
def process_request(self, request, spider):
if 'http://v.youku.com/v_show/' in request.url:
url = request.url.split('?')[0]
else:
url = request.url
if self.col.find_one({'$and': [
{'host': spider.name},
{'url': url},
# {'download': {'$in': [0, 1, 2]}}
{'download': {'$ne': -1}},
]}):
        logging.warning('the page has already been crawled, url is {0}'.format(url))
raise IgnoreRequest()
return None
def process_request(self, request, spider):
if 'http://v.youku.com/v_show/' in request.url:
url = request.url.split('?')[0]
else:
url = request.url
if self.col.find_one({'$and': [
{'host': spider.name},
{'url': url},
# {'download': {'$in': [0, 1, 2]}}
{'download': {'$ne': -1}},
]}):
        logging.warning('the page has already been crawled, url is {0}'.format(url))
raise IgnoreRequest()
return None
def process_request(self, request, spider):
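    # Assign a proxy to the request (honouring explicit disable/release
    # actions), set proxy auth headers, and, when a session-setup request is
    # attached, swap it in while remembering the original request.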
def set_auth(request, proxy):
if proxy.creds:
request.headers['Proxy-Authorization'] = proxy.creds
lg.debug('in process_request: {}, {}'.format(request, request.meta))
pa=request.meta.pop('proxy_action', None)
if pa == 'disable':
self.pp.set_status(self.map_proxy(request.meta['proxy']), 'D')
del request.meta['proxy'] # Make it pick another proxy
elif pa == 'release':
proxy=self.map_proxy(request.meta['proxy'])
self.pp.release_proxy(proxy)
raise IgnoreRequest
# Don't overwrite with a random one (server-side state for IP)
if 'proxy' in request.meta:
proxy=self.map_proxy(request.meta['proxy'])
set_auth(request, proxy)
return # No fuss, we have a proxy already
if self.mode == 'random':
proxy = self.pp.get_proxy(True)
elif self.mode == 'sequential':
proxy = self.pp.get_proxy()
request.meta['proxy'] = proxy.p
set_auth(request, proxy)
lg.debug('Using proxy '+proxy.p)
# Start setup_session anew wherever we are, fresh or recurring
req=request.meta.get('ss_request')
if req:
# Store original request to use after the session is setup
if 'original_request' not in request.meta:
request.meta['original_request']=request
else:
req=request
return req.replace(meta=request.meta, dont_filter=True)