html_downloader.py 文件源码

python
阅读 32 收藏 0 点赞 0 评论 0

项目:wechat_spider 作者: CoolWell 项目源码 文件源码
def download(self, link, name, url):
        """Fetch the rendered HTML of ``link`` with a one-shot PhantomJS session.

        A fresh PhantomJS driver is started for every call, using a user agent
        picked at random from ``self.agents`` and a cookie picked at random
        from ``self.cookie``; image loading is disabled. The driver is always
        quit before this method returns.

        If the loaded page contains an element with class ``page_verify`` the
        page is treated as a verification page: ``self.ocr4wechat(link)`` is
        called, the method sleeps for 5 seconds, and ``name`` is appended to
        ``list_error.txt`` instead of returning HTML.

        :param link: address that is opened in the browser.
        :param name: identifier appended (UTF-8 encoded, one per line) to
            ``list_error.txt`` whenever no HTML could be returned. The log
            message calls it "weixinNUM", so presumably a WeChat account id
            -- confirm against the caller.
        :param url: only printed in the error diagnostics; it is never fetched
            by this method.
        :return: ``(link, html)`` on success, otherwise ``None`` (driver start
            failure, page-load error, or verification page).
        """
        def _record_failed_name():
            # Append ``name`` to the failure list. The file is opened in
            # binary append mode because the value written is UTF-8 encoded
            # bytes: a text-mode handle rejects bytes on Python 3, while
            # binary mode works on both Python 2 and Python 3.
            with open(r'list_error.txt', 'ab') as f:
                f.write(name.encode('utf-8') + b'\n')

        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random.choice(self.agents)
        dcap["takesScreenshot"] = False
        dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)

        try:
            driver1 = webdriver.PhantomJS(
                desired_capabilities=dcap,
                service_args=['--load-images=no'],
            )
        except Exception as e:
            # The browser could not be started: remember the name and give up.
            _record_failed_name()
            print(datetime.datetime.now())
            print(url)
            print(e)
            return None

        try:
            driver1.set_page_load_timeout(20)
            driver1.get(link)

            # The lookup raises when the element is absent, which is the
            # normal case. Catch ``Exception`` rather than using a bare
            # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
            try:
                driver1.find_element_by_class_name('page_verify')
            except Exception:
                needs_verify = False
            else:
                needs_verify = True

            if not needs_verify:
                return link, driver1.page_source

            print('page needs verify, stop the program')
            print('the last weixinNUM is %s\n' % name)
            # NOTE(review): ocr4wechat is defined elsewhere; presumably it
            # handles the verification challenge -- confirm.
            self.ocr4wechat(link)
            time.sleep(5)
            _record_failed_name()
            return None
        except Exception as e:
            _record_failed_name()
            print(url)
            print(datetime.datetime.now())
            print(e)
            return None
        finally:
            # Always release the PhantomJS process, including on the
            # successful early return above.
            driver1.quit()
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号