isp_data_pollution.py 文件源码

python
阅读 35 收藏 0 点赞 0 评论 0

项目:isp-data-pollution 作者: essandess 项目源码 文件源码
def url_links(self):
        """Collect href targets from the anchor elements of the current page.

        Locates every ``<a>`` element through ``self.driver`` (waiting up to
        3 seconds for at least one to appear), visits the elements in random
        order and gathers each ``href`` attribute that is not ``None``.

        Scanning stops as soon as ``self.max_links_per_page`` anchors have
        been examined, or ``self.link_count()`` has reached
        ``self.max_links_cached``. The limits are checked after an anchor has
        been processed, so at least one anchor is always examined.

        Returns:
            list: the hrefs gathered, possibly empty.

        Any exception raised while reading attributes is swallowed (and
        printed when ``self.debug`` is truthy); the links gathered up to that
        point are still returned. Exceptions raised while locating the
        anchors are not handled here.
        """
        # https://github.com/detro/ghostdriver/issues/169
        @self.phantomjs_short_timeout
        def phantomjs_find_elements_by_tag_name():
            return WebDriverWait(self.driver, 3).until(
                lambda driver: driver.find_elements_by_tag_name('a'))
        elements = phantomjs_find_elements_by_tag_name()

        links = []
        # NOTE(review): presumably the timeout wrapper can hand back None when
        # the lookup fails -- confirm against phantomjs_short_timeout. Either
        # way there is nothing to scan, so return the empty list explicitly
        # instead of relying on the broad except below.
        if not elements:
            return links

        # get links in random order until max. per page
        try:
            shuffled = list(elements)
            random.shuffle(shuffled)
            for examined, a in enumerate(shuffled, start=1):
                # Re-wrapped on every iteration; the closure is invoked
                # immediately, so capturing the loop variable `a` is safe.
                @self.phantomjs_short_timeout
                def phantomjs_get_attribute():
                    return a.get_attribute('href')
                href = phantomjs_get_attribute()
                if href is not None:
                    links.append(href)
                # `>=` on both limits: examine at most max_links_per_page
                # anchors, and still stop if the cache is already over-full.
                if (examined >= self.max_links_per_page
                        or self.link_count() >= self.max_links_cached):
                    break
        except Exception as e:
            # Deliberate best-effort: a stale or unresponsive element must not
            # abort the crawl, so keep whatever was gathered so far.
            if self.debug:
                print('.get_attribute() exception:\n{}'.format(e))
        return links
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号