def determine_level(self, response):
    r"""Classify ``response.url`` into a crawl index level.

    The level lets the caller decide whether to continue crawling.
    Patterns are tried in the order below and the first match wins:

        level 1: ``.../[a-z].html``
        level 2: ``.../[A-Z]\d+.html``
        level 3: ``.../people-[a-zA-Z0-9-]+``
        level 4: search page, ``.../pub/dir/...`` or ``.../search/...``
        level 5: profile page, any other ``.../pub/...`` URL

    Args:
        response: object exposing the fetched URL as ``response.url``.

    Returns:
        The level (1-5), or ``None`` when no pattern matches; in that
        case a message is emitted through ``log.msg``.
    """
    import re

    # Order matters: "/pub/dir/" has to be tested before the more general
    # "/pub/" pattern, otherwise directory pages would be reported as level 5.
    # Raw strings keep "\." and "\d" as regex escapes instead of (invalid)
    # string-literal escapes.
    level_patterns = (
        (r".+/[a-z]\.html", 1),
        # The dot before "html" is escaped so only a literal "." matches.
        (r".+/[A-Z]\d+\.html", 2),
        (r".+/people-[a-zA-Z0-9-]+", 3),
        (r".+/pub/dir/.+", 4),
        # NOTE(review): "._" means "any character followed by an underscore";
        # this may have been meant as ".+" or ".*" - confirm before changing.
        (r".+/search/._", 4),
        (r".+/pub/.+", 5),
    )

    url = response.url
    for pattern, level in level_patterns:
        # re.match anchors at the start of the URL only; trailing text after
        # the matched part is allowed.
        if re.match(pattern, url):
            return level

    log.msg("Crawl cannot determine the url's level: " + url)
    return None
# (page navigation residue) Comment list
# (page navigation residue) Article table of contents