Spider.py 文件源码

python
阅读 34 收藏 0 点赞 0 评论 0

项目:sogaQuant 作者: idoplay 项目源码 文件源码
def sGet(self, url, ch='gbk', bt='solomon'):
        """Fetch ``url`` with a cookie-aware opener and return the response body.

        Args:
            url: Address to request.
            ch: Body decoding selector. ``'gbk'`` decodes as GBK and silently
                drops undecodable bytes; ``'utf8'`` decodes strictly as UTF-8;
                any other value returns the raw, undecoded body.
            bt: Key selecting the User-Agent string to send; one of
                ``'baidu'``, ``'google'``, ``'solomon'`` or ``'de'``.

        Returns:
            The decoded (or raw) response body, or ``None`` when the request
            or the read fails with an ``IOError`` (the error is printed).

        Raises:
            KeyError: If ``bt`` is not one of the known User-Agent keys.
            UnicodeDecodeError: If ``ch == 'utf8'`` and the body is not
                valid UTF-8.
        """
        bots = {
            "baidu": "Baiduspider+(+http://www.baidu.com/search/spider.htm)",
            'google': "Googlebot/2.1 (+http://www.google.com/bot.html)",
            'solomon': "Solomon Net Vampire/1.0",
            'de': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:44.0) Gecko/20100101 Firefox/44.0"
        }
        # No 'Accept-Encoding' header is sent: the body is decoded as-is
        # below and there is no decompression step for gzip/deflate replies.
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'User-Agent': bots[bt],
            'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
        }

        import cookielib
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        # Installs the opener process-wide, so later plain urllib2.urlopen()
        # calls elsewhere also go through this cookie processor.
        urllib2.install_opener(opener)
        # Fix: the headers were previously built but never attached, so the
        # User-Agent selected by ``bt`` was silently ignored.
        request = urllib2.Request(url=url, headers=headers)

        try:
            gc.enable()
            response = urllib2.urlopen(request)
            try:
                data = response.read()
            finally:
                # Close the connection even when read() fails part-way.
                response.close()
        except IOError as e:
            print(e)
            return None

        # Drop the response and collect right away, as the original did on
        # the success path.
        del response
        gc.collect()

        if ch == 'gbk':
            data = data.decode("gbk", 'ignore')
        elif ch == 'utf8':
            data = data.decode("utf-8")
        return data
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号