def add_url_links(self, links, url=''):
    """Add a shuffled selection of links from a page, then report progress.

    Links are visited in random order. A link is passed to
    ``self.add_link`` only if its scheme is ``http`` or ``https`` and
    ``self.blacklisted(link)`` is false. At most
    ``self.max_links_per_page`` links are counted as added per call.

    Args:
        links: Iterable of URL strings found on the page.
        url: Fallback URL used in progress output when the driver's
            current URL cannot be read.

    Returns:
        None. Results are recorded through ``self.add_link``; progress is
        printed only when ``self.debug`` or ``self.verbose`` is set.
    """
    added = 0
    # Sorting on a random key visits the links in shuffled order, so the
    # per-page cap keeps a random subset rather than the first few links.
    for link in sorted(links, key=lambda _: random.random()):
        # Check the cap before adding so it is never exceeded.
        if added >= self.max_links_per_page:
            break
        try:
            parsed = uprs.urlparse(link)
        except ValueError:
            # Malformed link (e.g. an unterminated IPv6 bracket): skip it
            # instead of abandoning the rest of the page's links.
            continue
        if parsed.scheme not in ('http', 'https') or self.blacklisted(link):
            continue
        # Only links for which add_link returns a truthy value are counted.
        if self.add_link(link):
            added += 1

    if not (self.verbose or self.debug):
        return

    current_url = url  # default
    try:
        # NOTE(review): phantomjs_short_timeout is defined elsewhere;
        # presumably it bounds how long the wrapped call may take - confirm.
        @self.phantomjs_short_timeout
        def phantomjs_current_url():
            return self.driver.current_url
        current_url = phantomjs_current_url()
        # the current_url method breaks on a lot of sites, e.g.
        # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
    except Exception as e:
        # Deliberately best-effort: any failure falls back to `url`.
        if self.debug:
            print('.current_url exception:\n{}'.format(e))

    if self.debug:
        print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url, added, self.link_count(), self.domain_entropy()))
    elif self.verbose:
        self.print_progress(current_url, num_links=added)
isp_data_pollution.py 文件源码
python
阅读 34
收藏 0
点赞 0
评论 0
评论列表
文章目录