crawl-proxy-nonblock.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:tianyancha_project 作者: sunbopython 项目源码 文件源码
def proxy_checker(self):
    """Re-test every candidate proxy and save the reliable ones.

    Each proxy in ``self.proxy_list`` is used ``self.checknum`` times to
    fetch ``http://myip.dnsdynamic.org``.  A proxy whose share of HTTP 200
    responses reaches ``self.checkthreshold`` is added to
    ``self.passproxy``.  When all requests have finished, the passing
    proxies are printed and written to ``validProxy.txt`` (one per line)
    and the Twisted reactor is stopped.

    The call blocks in ``reactor.run()`` until that point.
    NOTE: a Twisted reactor cannot be restarted, so this is expected to be
    called once per process.
    """
    success = {}  # proxy -> number of HTTP 200 responses seen so far
    settings = Settings()

    @defer.inlineCallbacks
    def getResponse(proxy, request):
        """Send one request through ``proxy`` and record a 200 response."""
        try:
            print("Request {} using proxy:{}".format(request, proxy))
            response = yield HTTP11DownloadHandler(settings).download_request(
                request=request, spider=None)
            if response.status == 200:
                success[proxy] = success.get(proxy, 0) + 1
                print("Successful(+{}/{}) ip:{}".format(
                    success[proxy], self.checknum, proxy))
                # float() forces true division; under Python 2 the plain
                # int/int form truncates to 0 until every check has passed.
                ratio = float(success[proxy]) / self.checknum
                if ratio >= self.checkthreshold:
                    self.passproxy.add(proxy)
        except Exception:
            # Best effort: a failing proxy (timeout, refused connection,
            # broken response) is simply not counted as a success.
            pass

    def output_better_proxy(_):
        """Print the proxies that passed and write them to validProxy.txt."""
        with open('validProxy.txt', 'w') as f:
            for p in self.passproxy:
                print(p)
                f.write(p + '\n')

    def stop_reactor(result):
        """Stop the reactor and pass ``result`` (or a Failure) through."""
        reactor.stop()
        return result

    # The work must stay a lazy generator: every coiterate() call below
    # pulls from this one shared iterator, which is what bounds the number
    # of in-flight requests to self.concurrent.
    work = (
        getResponse(
            proxy,
            Request(url='http://myip.dnsdynamic.org',
                    headers=self.headers,
                    meta={'proxy': "http://" + proxy,
                          'download_timeout': self.timeout}))
        for proxy in self.proxy_list
        for _ in range(self.checknum)
    )
    coop = task.Cooperator()
    # Build a real list: older Twisted DeferredList calls len() on its input.
    join = defer.DeferredList(
        [coop.coiterate(work) for _ in range(self.concurrent)])
    join.addCallback(output_better_proxy)
    # addBoth: stop the reactor even if writing the file failed, otherwise
    # reactor.run() below would never return.
    join.addBoth(stop_reactor)

    reactor.run()
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号