SpiderCommon.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:ws-cli 作者: hack4sec 项目源码 文件源码
def prepare_links_for_insert(links, url, site):
    """Filter and normalise raw links, then add their ancestor directories.

    The original docstring states that the result is meant to be inserted
    into MongoDB; the insertion itself does not happen here.

    Processing of each entry in ``links``:

    * falsy entries are ignored;
    * the entry is parsed with ``urlparse``; results that have no scheme,
      netloc, path or query are ignored;
    * if the netloc is set and is neither ``site`` nor its ``www.``
      counterpart (checked in both directions), the netloc is appended to
      ``SpiderCommon._external_hosts`` and the link is dropped;
    * everything else goes through ``SpiderCommon.clear_link``,
      ``SpiderCommon.build_path`` (together with ``url.path``) and
      ``SpiderCommon.clear_link`` once more.

    For every link that was kept, a path-only ``ParseResult`` is generated
    for each ancestor directory of its path, from the deepest one up to the
    root (``/a/b/c`` gives ``/a/b/``, ``/a/`` and ``/``).

    :param links: iterable of link strings to process.
    :param url: object whose ``path`` attribute is handed to
        ``SpiderCommon.build_path``.
    :param site: host name compared against each link's netloc.
    :return: a new list holding the kept links followed by the ancestor
        directory entries; duplicates are not removed.
    """
    prepared = []
    for raw_link in links:
        if not raw_link:
            continue

        parsed = urlparse(raw_link)

        # Nothing usable in the link (e.g. a bare fragment).
        if not (parsed.scheme or parsed.netloc or parsed.path or parsed.query):
            continue

        host = parsed.netloc
        if host and not (
            host == site
            or 'www.' + host == site
            or host == 'www.' + site
        ):
            # Foreign host: remember it, but do not keep the link.
            SpiderCommon._external_hosts.append(host)
            continue

        cleaned = SpiderCommon.clear_link(parsed)
        resolved = SpiderCommon.build_path(cleaned, url.path)
        prepared.append(SpiderCommon.clear_link(resolved))

    ancestors = []
    for item in prepared:
        segments = item.path.split("/")
        # Drop one trailing segment at a time until only the first is left.
        for keep in range(len(segments) - 1, 0, -1):
            directory = "/".join(segments[:keep]) + '/'
            ancestors.append(
                ParseResult(
                    scheme='',
                    netloc='',
                    path=directory,
                    params='',
                    query='',
                    fragment=''
                )
            )

    return prepared + ancestors
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号