def get_surt_host(url):
    """Return the host name of *url* with its labels in reversed order.

    The host is extracted with ``urlparse``, stripped and lower-cased.  A
    trailing dot and a leading ``www`` label are removed.  Every label that
    does not match ``ExtractHostLinksJob.host_part_pattern`` is converted
    with ``idna.encode`` and must match the pattern afterwards.  The labels
    are then reversed and joined with dots.

    Args:
        url: The URL to extract the host name from.

    Returns:
        The reversed host name as a string, or ``None`` if the URL cannot
        be parsed, has no host, the host matches
        ``ExtractHostLinksJob.ip_pattern``, the host is empty or longer
        than 253 characters (before or after IDNA conversion), or any
        label is invalid.
    """
    try:
        host = urlparse(url).hostname
    except Exception:
        # Best effort: a URL that cannot be parsed simply has no host.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    if host is None or ExtractHostLinksJob.ip_pattern.match(host):
        return None
    host = host.strip().lower()
    if not 1 <= len(host) <= 253:
        return None
    parts = host.split('.')
    if parts[-1] == '':
        # trailing dot is allowed, strip it
        parts = parts[:-1]
    if len(parts) > 1 and parts[0] == 'www':
        # strip leading 'www' to reduce number of "duplicate" hosts
        parts = parts[1:]
    host_part_pattern = ExtractHostLinksJob.host_part_pattern
    for i, part in enumerate(parts):
        if host_part_pattern.match(part):
            continue
        try:
            idn = idna.encode(part).decode('ascii')
        except Exception:
            # Any failure while encoding the label (IDNA errors, Unicode
            # errors, ...) means the host name is invalid.
            return None
        if not host_part_pattern.match(idn):
            # Still not a valid label after IDNA conversion.
            return None
        parts[i] = idn
    parts.reverse()
    surt_host = '.'.join(parts)
    if len(surt_host) > 253:
        # IDNA conversion can lengthen labels, so the limit checked above
        # on the raw host has to be enforced again on the ASCII form.
        return None
    return surt_host
评论列表
文章目录