@gen.coroutine
def can_fetch(self, user_agent, url):
    parsed = urlsplit(url)
    domain = parsed.netloc
    # Reuse the cached robots.txt parser for this domain unless it has expired.
    if domain in self.robots_txt_cache:
        robot_txt = self.robots_txt_cache[domain]
        if time.time() - robot_txt.mtime() > self.robot_txt_age:
            robot_txt = None
    else:
        robot_txt = None

    if robot_txt is None:
        robot_txt = RobotFileParser()
        try:
            # Fetch /robots.txt from the site root asynchronously.
            response = yield gen.maybe_future(self.http_client.fetch(
                urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30))
            content = response.body
        except tornado.httpclient.HTTPError as e:
            logger.error('load robots.txt from %s error: %r', domain, e)
            content = b''

        # Decode permissively; treat undecodable content as an empty robots.txt.
        try:
            content = content.decode('utf8', 'ignore')
        except UnicodeDecodeError:
            content = ''

        robot_txt.parse(content.splitlines())
        self.robots_txt_cache[domain] = robot_txt

    raise gen.Return(robot_txt.can_fetch(user_agent, url))
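
Since this is a Tornado coroutine method, it has to be yielded from another coroutine (or driven by the IOLoop) on an object that provides the attributes it uses: an async `http_client`, a `robots_txt_cache` dict, and a `robot_txt_age` expiry in seconds. Below is a minimal sketch of such a harness; the class name, user agent, URL, and one-hour expiry are illustrative, not part of the original code, and the `can_fetch` method above is assumed to be pasted into the class body.

import tornado.httpclient
from tornado import gen, ioloop

class DemoFetcher(object):
    robot_txt_age = 60 * 60  # seconds before a cached robots.txt is refetched

    def __init__(self):
        self.http_client = tornado.httpclient.AsyncHTTPClient()
        self.robots_txt_cache = {}

    # ... the can_fetch coroutine shown above goes here ...

@gen.coroutine
def main():
    fetcher = DemoFetcher()
    # can_fetch resolves to True/False according to the site's robots.txt.
    allowed = yield fetcher.can_fetch('MyCrawler/1.0', 'http://example.com/page')
    print('fetch allowed:', allowed)

if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(main)

Note that the cache is keyed by `netloc`, so each domain's robots.txt is fetched at most once per expiry window no matter how many URLs under it are checked.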