def parse_response(self, response):
    # New Mongo ObjectId ties the page document to its resource elements.
    page_id = ObjectId()
    analyzer = Analyzer(response)
    alerts = analyzer.inspect_response()
    elems = analyzer.get_resource_elems()
    page = analyzer.get_page_info()
    # Stamp each item with the organization id (and page id where relevant)
    # before yielding it to the item pipeline.
    for alert in alerts:
        alert['org_id'] = self.org
        yield alert
    for elem in elems:
        elem['page_id'] = page_id
        elem['org_id'] = self.org
        yield elem
    page['page_id'] = page_id
    page['org_id'] = self.org
    yield page
    # Stop following links once the per-domain page limit is reached.
    if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
        return
    # Follow same-domain links, skipping URLs that were already queued.
    for link in LxmlLinkExtractor(unique=True, deny_extensions=list(),
                                  allow_domains=self.allowed_domains).extract_links(response):
        if link.url not in self.already_crawled and self.pages_crawled < settings.PAGES_PER_DOMAIN:
            self.already_crawled.add(link.url)
            self.pages_crawled += 1
            log.msg("Yielding request for " + link.url, level=log.INFO)
            yield WebdriverRequest(link.url, callback=self.parse_response)
        elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            log.msg("Reached max pages per domain: " + str(settings.PAGES_PER_DOMAIN), level=log.INFO)
            return
        else:
            log.msg("Avoiding duplicate request for: " + link.url, level=log.INFO)