def process_request(self, request, spider):
    """Reject a request whose URL is already recorded for this spider.

    Queries ``self.col`` for a document whose ``host`` equals the spider
    name, whose ``url`` equals the (possibly normalised) request URL, and
    whose ``download`` field is not ``-1``.

    Args:
        request: Request being processed; only ``request.url`` is read.
        spider: Spider issuing the request; only ``spider.name`` is read.

    Returns:
        None when no matching record is found.

    Raises:
        IgnoreRequest: When a matching record exists.
    """
    # Youku show pages are looked up without their query string; every
    # other URL is matched verbatim.
    if 'http://v.youku.com/v_show/' in request.url:
        url = request.url.partition('?')[0]
    else:
        url = request.url

    # NOTE(review): self.col is presumably a MongoDB collection - confirm.
    existing = self.col.find_one({'$and': [
        {'host': spider.name},
        {'url': url},
        # NOTE(review): -1 presumably marks a record that should be
        # fetched again; an earlier revision matched 0/1/2 explicitly.
        {'download': {'$ne': -1}},
    ]})
    if existing:
        logging.warning('the page is crawled, url is %s', url)
        raise IgnoreRequest()
    return None
评论列表
文章目录