SpiderConfig.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:SpiderConfig 作者: brady-chen 项目源码 文件源码
def get_urls(self, get_proxie_or_not=False):
        """
        Fetch ``self.start_url`` and collect the unique ``href`` values found.

        The page is requested through ``self.session`` with a 30 second
        timeout. If the response status is not 200, the page is loaded with
        selenium + PhantomJS instead, passing ``self.headers`` as custom
        request headers. The resulting HTML is parsed with BeautifulSoup
        (``lxml`` parser) and ``href`` values are collected in first-seen
        order, skipping duplicates.

        :type get_proxie_or_not: bool
        :param get_proxie_or_not: when true, route the session's http and
            https traffic through the address exposed by
            ``Proxies().ip_and_port``
        :return: list of unique ``href`` values, in the order encountered
        :raises AssertionError: if the BeautifulSoup lookup finds nothing
        :raises ValueError: if no url was collected (this includes the case
            where the request hit ``requests.ConnectTimeout``)
        """
        list_url = []
        # Companion set so the duplicate check is O(1) while list_url keeps
        # the first-seen order.
        seen = set()
        try:
            if get_proxie_or_not:
                p = Proxies()
                # presumably populates p.ip_and_port -- confirm in Proxies
                p.get_ip_and_port()
                self.session.proxies = {
                    "http": p.ip_and_port,
                    "https": p.ip_and_port
                }
            response = self.session.get(self.start_url, timeout=30)
            if response.status_code == 200:
                html = response.content
            else:
                # Fallback: load the page with selenium + PhantomJS,
                # forwarding the spider's headers as PhantomJS custom headers.
                desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
                # items() works on both Python 2 and Python 3 dicts.
                for key, value in self.headers.items():
                    desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
                driver = webdriver.PhantomJS(
                    desired_capabilities=desired_capabilities
                )
                try:
                    driver.get(self.start_url)
                    html = driver.page_source
                finally:
                    # Always shut the browser process down, even when the
                    # page load fails, so it is not leaked.
                    driver.quit()
            soup = BeautifulSoup(html, 'lxml')
            # NOTE(review): find() with no arguments looks like a placeholder
            # selector that is meant to be adapted per target site -- confirm.
            urls = soup.find()
            if urls is None:
                # Explicit raise instead of ``assert`` so the check still
                # runs under ``python -O``; the exception type is unchanged.
                raise AssertionError("BeautifulSoup lookup returned None")
            repeat_num = 0
            for url in urls:
                href = url['href']
                if href in seen:
                    repeat_num += 1
                else:
                    seen.add(href)
                    list_url.append(href)
            print("??%d??????????" % repeat_num)
        except requests.ConnectTimeout:
            # A connect timeout is only reported here; list_url stays empty,
            # so the check below raises ValueError.
            print("url????????????????")

        if list_url:
            return list_url
        else:
            print("??url????????")
            raise ValueError
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号