wat_extract_links.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:cc-pyspark 作者: commoncrawl 项目源码 文件源码
def get_surt_host(url):
    """Return the host name of ``url`` with its labels in reversed order.

    The host is stripped of surrounding whitespace and lower-cased, one
    trailing dot is dropped, and a leading ``www`` label is removed when
    other labels remain. Every label that does not match
    ``ExtractHostLinksJob.host_part_pattern`` is IDNA-encoded and must
    match the pattern afterwards. The labels are then joined in reverse
    order, so ``www.example.com`` becomes ``com.example`` (assuming both
    labels are accepted by the pattern, which is defined outside this
    block).

    Returns ``None`` instead of raising when:

    * the URL cannot be parsed or has no host component,
    * the host matches ``ExtractHostLinksJob.ip_pattern`` (presumably an
      IP address -- confirm against the pattern definition),
    * the host is empty or longer than 253 characters,
    * a label cannot be IDNA-encoded, or still fails the pattern after
      encoding.
    """
    try:
        host = urlparse(url).hostname
    except Exception:
        # Best effort: any parse failure (e.g. ValueError for a malformed
        # IPv6 literal) means "no usable host". KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        return None
    if host is None or ExtractHostLinksJob.ip_pattern.match(host):
        return None

    host = host.strip().lower()
    if len(host) < 1 or len(host) > 253:
        return None

    parts = host.split('.')
    if parts[-1] == '':
        # trailing dot is allowed, strip it
        parts = parts[:-1]
    if len(parts) > 1 and parts[0] == 'www':
        # strip leading 'www' to reduce number of "duplicate" hosts
        parts = parts[1:]

    for i, part in enumerate(parts):
        if ExtractHostLinksJob.host_part_pattern.match(part):
            continue
        # Label is not accepted as-is: try its IDNA (punycode) form.
        try:
            idn = idna.encode(part).decode('ascii')
        except Exception:
            # idna.IDNAError, Unicode errors, IndexError, ... all mean the
            # label is not a valid host name part.
            return None
        if not ExtractHostLinksJob.host_part_pattern.match(idn):
            return None
        parts[i] = idn

    parts.reverse()
    return '.'.join(parts)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号