isp_data_pollution.py 文件源码-python代码片段

isp_data_pollution.py 文件源码

python

阅读 34 收藏 0 点赞 0 评论 0

项目：isp-data-pollution 作者: essandess 项目源码文件源码

def add_url_links(self,links,url=''):
        k = 0
        for link in sorted(links,key=lambda k: random.random()):
            lp = uprs.urlparse(link)
            if (lp.scheme == 'http' or lp.scheme == 'https') and not self.blacklisted(link):
                if self.add_link(link): k += 1
                if k > self.max_links_per_page: break
        if self.verbose or self.debug:
            current_url = url  # default
            try:
                @self.phantomjs_short_timeout
                def phantomjs_current_url(): return self.driver.current_url
                current_url = phantomjs_current_url()
                # the current_url method breaks on a lot of sites, e.g.
                # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
            except Exception as e:
                if self.debug: print('.current_url exception:\n{}'.format(e))
        if self.debug:
            print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
        elif self.verbose:
            self.print_progress(current_url,num_links=k)