def parse(self, response):
    """
    Scrapy parse callback
    """
    # Get current nesting level
    curr_depth = response.meta.get('depth', 1)
    if self.config['login']['enabled']:
        curr_depth = curr_depth - 1  # Do not count the login page as nesting depth

    # Store to disk?
    if self.config['store']['enabled']:
        path = response.url.replace(os.sep, '--')  # Replace directory separator
        path = self.config['store']['path'] + os.sep + path
        with open(path, 'wb') as fpointer:
            fpointer.write(response.body)

    # Yield current url item
    item = CrawlpyItem()
    item['url'] = response.url
    item['status'] = response.status
    item['depth'] = curr_depth
    item['referer'] = response.meta.get('referer', '')
    yield item

    # Get all links from the current page
    links = LinkExtractor().extract_links(response)

    # Iterate all found links and crawl them
    for link in links:
        deny = False

        # Check requests to be ignored
        for ignore in self.config['ignores']:
            if (ignore in link.url) or (ignore.lower() in link.url.lower()):
                # Ignore pattern found, stop looking into other patterns
                deny = True
                break

        # [NO] Max depth exceeded
        if curr_depth >= self.max_depth:
            logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')')
        # [NO] Duplicate URL
        elif link.url in self.duplicates:
            logging.info('[Not Crawling] Url already crawled: ' + link.url)
        # [NO] URL denied
        elif deny:
            logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url)
        # [OK] Crawl!
        else:
            self.duplicates.append(link.url)
            yield Request(link.url, meta={'depth': curr_depth + 1, 'referer': response.url})
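
# ---------------------------------------------------------------------------
# The callback above is an excerpt; it relies on a few names defined elsewhere
# in the spider module/class. A minimal sketch of those assumptions (import
# paths follow the standard Scrapy layout; CrawlpyItem and the self.*
# attributes are assumed to be provided by the surrounding project):
#
#     import os
#     import logging
#
#     from scrapy.http import Request
#     from scrapy.linkextractors import LinkExtractor
#
#     from crawlpy.items import CrawlpyItem   # project-specific item class
#
# Spider attributes read by parse():
#
#     self.config      # dict, e.g. {'login':   {'enabled': False},
#                      #             'store':   {'enabled': True, 'path': '/tmp/crawl'},
#                      #             'ignores': ['logout', '.pdf']}
#     self.max_depth   # maximum nesting depth to follow (int)
#     self.duplicates  # list of URLs already scheduled, used for deduplication
# ---------------------------------------------------------------------------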