sitemap.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:swarm 作者: a7vinx 项目源码 文件源码
def _parse_url(self,dst,src):
        """
        Resolve the link 'dst' found on the page 'src' and keep it only if it stays
        on the same site as 'src'.

        'dst' counts as same-site when it has no host at all (relative link), or when
        its host and port equal those of 'src' and its scheme is either empty or equal
        to the scheme of 'src'. A missing port is filled in with the scheme default
        (80 for http, 443 for https) on both sides before the comparison, so
        'example.com' and 'example.com:80' are treated as the same http host.

        Args:
            dst: String, the url to check. May be absolute, protocol-relative
                ('//host/path'), root-relative ('/path') or relative ('path').
            src: String, absolute url of the page that 'dst' was found on.

        Returns:
            String of the complete url, built from the scheme and netloc of 'src' plus
            the resolved path and the query string of 'dst' (path params and fragment
            are dropped). Returns '' if 'dst' is not in the same domain, or if it is a
            host-less link with a different scheme (e.g. 'mailto:', 'javascript:').
        """
        default_ports={'http':'80','https':'443'}

        def with_port(netloc,scheme):
            # make the port explicit so that 'host' and 'host:80' compare equal
            if netloc=='' or netloc.find(':')!=-1:
                return netloc
            port=default_ports.get(scheme)
            return netloc+':'+port if port else netloc

        LOG.debug('detecting url: '+dst)
        s_parsed=urlparse.urlparse(src)
        s_scheme=s_parsed.scheme
        s_netloc=s_parsed.netloc
        # directory of the source page without trailing '/'; rpartition yields ''
        # for an empty path instead of raising IndexError
        s_cur_dir=s_parsed.path.rpartition('/')[0]

        d_parsed=urlparse.urlparse(dst)
        d_scheme=d_parsed.scheme
        d_netloc=d_parsed.netloc
        d_query=d_parsed.query
        d_is_rooted=d_parsed.path.startswith('/')
        # add '/' as prefix if the path does not start with '/' (also covers '')
        d_path=d_parsed.path if d_is_rooted else '/'+d_parsed.path

        # if it is a relative url
        if d_netloc=='':
            # 'mailto:', 'javascript:' etc. have no host but are not paths on this site
            if d_scheme!='' and d_scheme!=s_scheme:
                return ''
            # a root-relative path replaces the whole path of 'src'; any other
            # path is appended to the directory of 'src'
            path=d_path if d_is_rooted else s_cur_dir+d_path
            return urlparse.ParseResult(s_scheme,s_netloc,path,'',d_query,'').geturl()

        # a protocol-relative url ('//host/path') inherits the scheme of 'src'
        same_host=with_port(d_netloc,d_scheme or s_scheme)==with_port(s_netloc,s_scheme)
        if same_host and (d_scheme==s_scheme or d_scheme==''):
            return urlparse.ParseResult(s_scheme,s_netloc,d_path,'',d_query,'').geturl()
        return ''
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号