def get_connection():
# pycurl initialization
h = pycurl.Curl()
# follow redirects
h.setopt(pycurl.FOLLOWLOCATION, False)
# enable compression
h.setopt(pycurl.ENCODING, 'gzip, deflate')
# certifi
h.setopt(pycurl.CAINFO, certifi.where())
# no signal
h.setopt(pycurl.NOSIGNAL, 1)
# certificate informations
h.setopt(pycurl.OPT_CERTINFO, 1)
return h
python类ENCODING的实例源码
def test_post(self):
curl = CurlStub(b"result")
result = fetch("http://example.com", post=True, curl=curl)
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.POST: True,
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def test_post_data(self):
curl = CurlStub(b"result")
result = fetch("http://example.com", post=True, data="data", curl=curl)
self.assertEqual(result, b"result")
self.assertEqual(curl.options[pycurl.READFUNCTION](), b"data")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.POST: True,
pycurl.POSTFIELDSIZE: 4,
pycurl.READFUNCTION: Any(),
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def test_cainfo(self):
curl = CurlStub(b"result")
result = fetch("https://example.com", cainfo="cainfo", curl=curl)
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"https://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.CAINFO: b"cainfo",
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def test_headers(self):
curl = CurlStub(b"result")
result = fetch("http://example.com",
headers={"a": "1", "b": "2"}, curl=curl)
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.HTTPHEADER: ["a: 1", "b: 2"],
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def test_pycurl_insecure(self):
curl = CurlStub(b"result")
result = fetch("http://example.com/get-ca-cert", curl=curl,
insecure=True)
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com/get-ca-cert",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.SSL_VERIFYPEER: False,
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def initHandle(self):
""" sets common options to curl handle """
self.c.setopt(pycurl.FOLLOWLOCATION, 1)
self.c.setopt(pycurl.MAXREDIRS, 5)
self.c.setopt(pycurl.CONNECTTIMEOUT, 30)
self.c.setopt(pycurl.NOSIGNAL, 1)
self.c.setopt(pycurl.NOPROGRESS, 1)
if hasattr(pycurl, "AUTOREFERER"):
self.c.setopt(pycurl.AUTOREFERER, 1)
self.c.setopt(pycurl.SSL_VERIFYPEER, 0)
self.c.setopt(pycurl.LOW_SPEED_TIME, 30)
self.c.setopt(pycurl.LOW_SPEED_LIMIT, 5)
#self.c.setopt(pycurl.VERBOSE, 1)
self.c.setopt(pycurl.USERAGENT,
"Mozilla/5.0 (Windows NT 6.1; Win64; x64;en; rv:5.0) Gecko/20110619 Firefox/5.0")
if pycurl.version_info()[7]:
self.c.setopt(pycurl.ENCODING, "gzip, deflate")
self.c.setopt(pycurl.HTTPHEADER, ["Accept: */*",
"Accept-Language: en-US,en",
"Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
"Connection: keep-alive",
"Keep-Alive: 300",
"Expect:"])
def Curl(url,headers):
while 1:
try:
c = pycurl.Curl()
c.setopt(pycurl.REFERER, 'http://weixin.sogou.com/')
c.setopt(pycurl.FOLLOWLOCATION, True)
c.setopt(pycurl.MAXREDIRS,5)
c.setopt(pycurl.CONNECTTIMEOUT, 60)
c.setopt(pycurl.TIMEOUT,120)
c.setopt(pycurl.ENCODING, 'gzip,deflate')
c.fp = StringIO.StringIO()
c.setopt(pycurl.URL, url)
c.setopt(pycurl.HTTPHEADER,headers)
c.setopt(c.WRITEFUNCTION, c.fp.write)
c.perform()
html = c.fp.getvalue()
if '??????' in html:
print u'??????,??10??'
time.sleep(600)
else:
return html
except Exception, e:
print url,'curl(url)',e
continue
#????????
def getHtml(url,headers):
c = pycurl.Curl() #??curl????????
c.setopt(pycurl.URL, url) #??????URL
c.setopt(pycurl.FOLLOWLOCATION, True) #????????
c.setopt(pycurl.MAXREDIRS,5) #?????????
c.setopt(pycurl.CONNECTTIMEOUT, 60) #??????
c.setopt(pycurl.TIMEOUT,120) #????
c.setopt(pycurl.ENCODING, 'gzip,deflate') #??gzip???????????????????gzip?????????gzip??????
c.fp = StringIO.StringIO() #??StringIO??
c.setopt(pycurl.HTTPHEADER,headers) #?????
c.setopt(pycurl.POST, 1) #??get
c.setopt(pycurl.POSTFIELDS, data) #??POST??
c.setopt(c.WRITEFUNCTION, c.fp.write) #???????
c.perform() #??
html = c.fp.getvalue() #?????
return html
def curl(url, debug=False, **kwargs):
while 1:
try:
s = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, url)
c.setopt(pycurl.REFERER, url)
c.setopt(pycurl.FOLLOWLOCATION, True)
c.setopt(pycurl.TIMEOUT, 60)
c.setopt(pycurl.ENCODING, 'gzip')
c.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
c.setopt(pycurl.NOSIGNAL, True)
c.setopt(pycurl.WRITEFUNCTION, s.write)
for k, v in kwargs.iteritems():
c.setopt(vars(pycurl)[k], v)
c.perform()
c.close()
return s.getvalue()
except:
if debug:
raise
continue
def test_gzip(url):
t = Test()
c = pycurl.Curl()
c.setopt(pycurl.WRITEFUNCTION,t.callback)
c.setopt(pycurl.ENCODING, 'gzip')
c.setopt(pycurl.URL,url)
c.setopt(pycurl.USERAGENT,"User-Agent':'EMAO_OPS_MONITOR) Gecko/20091201 Firefox/3.5.6)")
c.perform()
TOTAL_TIME = c.getinfo(c.TOTAL_TIME)
#print "????????%.2f ms" %(TOTAL_TIME*1000)
return TOTAL_TIME * 1000
def test_basic(self):
curl = CurlStub(b"result")
result = fetch("http://example.com", curl=curl)
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
def test_create_curl(self):
curls = []
def pycurl_Curl():
curl = CurlStub(b"result")
curls.append(curl)
return curl
Curl = pycurl.Curl
try:
pycurl.Curl = pycurl_Curl
result = fetch("http://example.com")
curl = curls[0]
self.assertEqual(result, b"result")
self.assertEqual(curl.options,
{pycurl.URL: b"http://example.com",
pycurl.FOLLOWLOCATION: 1,
pycurl.MAXREDIRS: 5,
pycurl.CONNECTTIMEOUT: 30,
pycurl.LOW_SPEED_LIMIT: 1,
pycurl.LOW_SPEED_TIME: 600,
pycurl.NOSIGNAL: 1,
pycurl.WRITEFUNCTION: Any(),
pycurl.DNS_CACHE_TIMEOUT: 0,
pycurl.ENCODING: b"gzip,deflate"})
finally:
pycurl.Curl = Curl
def getPage (self, url, requestHeader = []) :
resultFormate = StringIO.StringIO()
fakeIp = self.fakeIp()
requestHeader.append('CLIENT-IP:' + fakeIp)
requestHeader.append('X-FORWARDED-FOR:' + fakeIp)
try:
curl = pycurl.Curl()
curl.setopt(pycurl.URL, url.strip())
curl.setopt(pycurl.ENCODING, 'gzip,deflate')
curl.setopt(pycurl.HEADER, 1)
curl.setopt(pycurl.TIMEOUT, 120)
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
curl.setopt(pycurl.HTTPHEADER, requestHeader)
curl.setopt(pycurl.WRITEFUNCTION, resultFormate.write)
curl.perform()
headerSize = curl.getinfo(pycurl.HEADER_SIZE)
curl.close()
header = resultFormate.getvalue()[0 : headerSize].split('\r\n')
body = resultFormate.getvalue()[headerSize : ]
except Exception, e:
header = ''
body = ''
return header, body
def getKeyword(i):#??json
try:
time.sleep(1)
headers = [
'Host:fengchao.baidu.com',
'User-Agent: %s' %getUA(),
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding: gzip, deflate',
'Referer: http://fengchao.baidu.com/nirvana/main.html?userid=8048066',
'Connection: keep-alive',
'COOKIE:%s' %COOKIE,
]
post = urllib.urlencode({
'params': '{"entry":"kr_station","query":"%s","querytype":1,"pageNo":1,"pageSize":300}' % keyword_list[i],
'path': 'jupiter/GET/kr/word',
'token': TOKEN,
'userid': USERID,
})
url = 'http://fengchao.baidu.com/nirvana/request.ajax?path=jupiter/GET/kr/word'
c = pycurl.Curl()
# c.setopt(pycurl.PROXY, getRandomAlbIp())
c.setopt(pycurl.URL, url)
c.setopt(pycurl.FOLLOWLOCATION, True)
c.setopt(pycurl.MAXREDIRS,5)
c.setopt(pycurl.CONNECTTIMEOUT, 20)
c.setopt(pycurl.TIMEOUT,20)
c.setopt(pycurl.ENCODING, 'gzip,deflate')
c.fp = StringIO.StringIO()
c.setopt(pycurl.HTTPHEADER,headers)
c.setopt(pycurl.POST, 1)
c.setopt(pycurl.POSTFIELDS, post)
c.setopt(c.WRITEFUNCTION, c.fp.write)
c.perform()
# mutex.acquire()#??
jsonData = c.fp.getvalue()
analyseJsonData(i,jsonData)
# mutex.release()#??
except Exception,e:
print e
pass
def init_handle(self):
"""
Sets common options to curl handle.
"""
self.setopt(pycurl.FOLLOWLOCATION, 1)
self.setopt(pycurl.MAXREDIRS, 5)
self.setopt(pycurl.CONNECTTIMEOUT, 30)
self.setopt(pycurl.NOSIGNAL, 1)
self.setopt(pycurl.NOPROGRESS, 1)
if hasattr(pycurl, "AUTOREFERER"):
self.setopt(pycurl.AUTOREFERER, 1)
self.setopt(pycurl.SSL_VERIFYPEER, 0)
# Interval for low speed, detects connection loss, but can abort dl if
# hoster stalls the download
self.setopt(pycurl.LOW_SPEED_TIME, 45)
self.setopt(pycurl.LOW_SPEED_LIMIT, 5)
# do not save the cookies
self.setopt(pycurl.COOKIEFILE, '')
self.setopt(pycurl.COOKIEJAR, '')
# self.setopt(pycurl.VERBOSE, 1)
self.setopt(
pycurl.USERAGENT,
'Mozilla/5.0 (Windows NT 10.0; Win64; rv:53.0) '
'Gecko/20100101 Firefox/53.0')
if pycurl.version_info()[7]:
self.setopt(pycurl.ENCODING, 'gzip,deflate')
self.headers.update(
{'Accept': "*/*",
'Accept-Language': "en-US,en",
'Accept-Charset': "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
'Connection': "keep-alive",
'Keep-Alive': "300",
'Expect': ""})
def _create_handle(self):
c = pycurl.Curl()
# c.setopt(pycurl.VERBOSE, 1)
c.setopt(pycurl.CONNECTTIMEOUT, CONNECT_TIMEOUT)
c.setopt(pycurl.TIMEOUT, TIMEOUT)
c.setopt(pycurl.ENCODING, 'gzip, deflate')
return c
def pidePOST(self,url,data,compressed = False,cookie=False, contador_curl = 0, debug=False):
time.sleep(3)
Scrape.contador+=1
print ("\n"+url)
print ("\n\t.l."+str(Scrape.contador))
c = pycurl.Curl()
if cookie:
c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
c.setopt(pycurl.URL, url)
c.setopt(pycurl.CONNECTTIMEOUT, 15)
c.setopt(pycurl.TIMEOUT, 25)
c.setopt(pycurl.HTTPHEADER, self.headers)
if compressed:
c.setopt(pycurl.ENCODING, 'gzip,deflate')
c.setopt(c.POSTFIELDS, data)
if debug:
c.setopt(c.VERBOSE, True)
c.setopt( pycurl.PROXY, '127.0.0.1' )
c.setopt( pycurl.PROXYPORT, 9050 )
c.setopt( pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME )
b = BytesIO()
BytesIO
c.setopt(pycurl.WRITEFUNCTION, b.write)
self.url = url
try:
c.perform()
self.response_string = b.getvalue()
#print (self.response_string)
b.close()
except Exception as e:
#print ('Razon:',e)
self.response_string = None