Python pycurl.REFERER usage examples
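All of the snippets below set the Referer request header through pycurl's REFERER option. A minimal, self-contained sketch of the pattern (Python 3; the function name and URLs are illustrative placeholders, not taken from any snippet):

import pycurl
from io import BytesIO

def fetch_with_referer(url, referer):
    buf = BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.REFERER, referer)      # value sent as the Referer header
    c.setopt(pycurl.FOLLOWLOCATION, True)  # follow 3xx redirects
    c.setopt(pycurl.WRITEDATA, buf)        # collect the body in memory
    try:
        c.perform()
        return buf.getvalue()
    finally:
        c.close()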
import time
import StringIO
import pycurl

def Curl(url, headers):
    """Fetch a URL with pycurl, retrying until a usable page comes back."""
    while True:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.REFERER, 'http://weixin.sogou.com/')
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER, headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            html = c.fp.getvalue()
            if '??????' in html:  # anti-crawler marker; the original Chinese text was lost to encoding
                print 'Blocked by the anti-crawler check, sleeping 10 minutes'
                time.sleep(600)
            else:
                return html
        except Exception as e:
            print url, 'curl(url)', e
            continue
# Generic pycurl fetch helper: retries until the request succeeds; extra
# keyword arguments are passed through as libcurl options.
import StringIO
import pycurl

def curl(url, debug=False, **kwargs):
    while True:
        try:
            s = StringIO.StringIO()
            c = pycurl.Curl()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.TIMEOUT, 60)
            c.setopt(pycurl.ENCODING, 'gzip')
            c.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
            c.setopt(pycurl.NOSIGNAL, True)
            c.setopt(pycurl.WRITEFUNCTION, s.write)
            for k, v in kwargs.iteritems():
                c.setopt(vars(pycurl)[k], v)  # look up the option constant by name
            c.perform()
            c.close()
            return s.getvalue()
        except Exception:
            if debug:
                raise
            continue
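The vars(pycurl)[k] lookup lets callers pass any libcurl option by its constant name. A hypothetical call (the URL and option values are illustrative):

html = curl('http://example.com/',
            CONNECTTIMEOUT=10,   # resolved to pycurl.CONNECTTIMEOUT
            COOKIEFILE='')       # empty string enables the in-memory cookie engine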
def signin(self, user, password, data):
self.req.http.c.setopt(
pycurl.REFERER,
"https://1fichier.com/login.pl?lg=en")
try:
html = self.load("https://1fichier.com/login.pl?lg=en",
post={'mail': user,
'pass': password,
'It': "on",
'purge': "off",
'valider': "Send"})
if any(_x in html for _x in
('>Invalid username or Password', '>Invalid email address', '>Invalid password')):
self.fail_login()
    except BadHeader as e:
if e.code == 403:
self.fail_login()
else:
raise
import urllib.parse
from io import BytesIO
import pycurl

def ccurl(url, value):
    hdr = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:45.0) Gecko/20100101 Firefox/45.0"
    c = pycurl.Curl()
    if value == "no_redir":
        print("no redirect")
    else:
        c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.USERAGENT, hdr)
    if value != "" and value != "no_redir":
        post_data = {'id': value}
        post_d = urllib.parse.urlencode(post_data)
        c.setopt(c.POSTFIELDS, post_d)  # setting POSTFIELDS turns the request into a POST
    #if rfr != "":
    #    c.setopt(pycurl.REFERER, rfr)
    url = str(url)
    c.setopt(c.URL, url)
    storage = BytesIO()
    c.setopt(c.WRITEDATA, storage)
    c.perform()
    c.close()
    content = storage.getvalue()
    content = getContentUnicode(content)  # project-local helper: decode bytes to str
    return content
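Because setting POSTFIELDS switches libcurl to POST, a non-empty value turns the call into a form post. Hypothetical calls (URL and id are placeholders):

page = ccurl('http://example.com/stream', 'abc123')  # POSTs id=abc123
page = ccurl('http://example.com/page', '')          # plain GET, redirects followed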
import pycurl
from StringIO import StringIO

def get_html(url, user_agent, refer_url):
    """
    Fetch a page with pycurl.
    :param url: target URL
    :param user_agent: User-Agent header value
    :param refer_url: Referer header value
    :return: response body
    """
    curl = pycurl.Curl()
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.REFERER, refer_url)
    buffers = StringIO()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEDATA, buffers)
    curl.perform()
    body = buffers.getvalue()
    buffers.close()
    curl.close()
    return body
import pycurl
from StringIO import StringIO

def get(url, user_agent=UA, referrer=None):  # UA: module-level default User-Agent, defined elsewhere
    """Make a GET request of the url using pycurl and return the data
    (which is None if unsuccessful)"""
    data = None
    databuffer = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.CONNECTTIMEOUT, 5)
    curl.setopt(pycurl.TIMEOUT, 8)
    curl.setopt(pycurl.WRITEFUNCTION, databuffer.write)
    curl.setopt(pycurl.COOKIEFILE, '')  # empty string enables the in-memory cookie engine
    if user_agent:
        curl.setopt(pycurl.USERAGENT, user_agent)
    if referrer is not None:
        curl.setopt(pycurl.REFERER, referrer)
    try:
        curl.perform()
        data = databuffer.getvalue()
    except Exception:
        pass
    curl.close()
    return data
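A hypothetical call (the URLs are placeholders); the referrer argument is sent as the Referer header, and a None result signals failure:

page = get('http://example.com/inner', referrer='http://example.com/')
if page is None:
    print('request failed')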
import json
import urllib
import pycurl
from StringIO import StringIO

# const, decode_sign and parse_sign2 are project-local helpers.
def get_download_link(fs_id):
    """
    Get the direct download link (dlink) of a file on Baidu Pan.
    :param fs_id: file id on the Pan server
    :return: download URL, or None on error
    """
curl = pycurl.Curl()
curl.setopt(pycurl.USERAGENT, const.USER_AGENT)
curl.setopt(pycurl.REFERER, const.PAN_REFER_URL)
buffers = StringIO()
request_dict = {
'channel': 'chunlei',
'timestamp': '1473685224',
'fidlist': [fs_id],
'type': 'dlink',
'web': 1,
'clienttype': 0,
'bdstoken': 'e0e895bb3ef7b0cb70899ee66b74e809',
'sign': decode_sign(parse_sign2('d76e889b6aafd3087ac3bd56f4d4053a', '3545d271c5d07ba27355d39da0c62a4ee06d2d25'))
}
target_url = const.PAN_API_URL + 'download?' + urllib.urlencode(request_dict)
curl.setopt(pycurl.URL, target_url)
curl.setopt(pycurl.WRITEDATA, buffers)
curl.setopt(pycurl.COOKIEFILE, "cookie.txt")
curl.perform()
body = buffers.getvalue()
buffers.close()
curl.close()
data = json.loads(body)
if data['errno']:
return None
return data['dlink'][0]['dlink']
def setRequestContext(self, url, get, post, referer, cookies, multipart=False):
    """ sets everything needed for the request """
    url = myquote(url)
    if get:
        get = urlencode(get)
        url = "%s?%s" % (url, get)  # append the query string
    self.c.setopt(pycurl.URL, url)
    self.c.lastUrl = url
    if post:
        self.c.setopt(pycurl.POST, 1)
        if not multipart:
            if type(post) == unicode:
                post = str(post)  # unicode not allowed
            elif type(post) == str:
                pass
            else:
                post = myurlencode(post)
            self.c.setopt(pycurl.POSTFIELDS, post)
        else:
            post = [(x, y.encode('utf8') if type(y) == unicode else y) for x, y in post.iteritems()]
            self.c.setopt(pycurl.HTTPPOST, post)
    else:
        self.c.setopt(pycurl.POST, 0)
    if referer and self.lastURL:
        self.c.setopt(pycurl.REFERER, str(self.lastURL))
    if cookies:
        self.c.setopt(pycurl.COOKIEFILE, "")  # empty string enables libcurl's in-memory cookie engine
        self.c.setopt(pycurl.COOKIEJAR, "")
        self.getCookies()
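The referer branch above reuses the previous request's URL, so navigation looks browser-like. A stripped-down sketch of that chaining idea (illustrative names, not pyLoad's API):

import pycurl
from io import BytesIO

class MiniClient(object):
    def __init__(self):
        self.last_url = None
        self.c = pycurl.Curl()

    def load(self, url):
        buf = BytesIO()
        self.c.setopt(pycurl.URL, url)
        if self.last_url:
            self.c.setopt(pycurl.REFERER, self.last_url)  # cite the previous page
        self.c.setopt(pycurl.WRITEDATA, buf)
        self.c.perform()
        self.last_url = url  # the next request will refer to this one
        return buf.getvalue()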
def curl_get(self, url, refUrl=None):
    buf = cStringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(curl.URL, url)
    curl.setopt(curl.WRITEFUNCTION, buf.write)
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)  # skip TLS certificate verification (insecure)
    #curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    #curl.setopt(pycurl.HEADERFUNCTION, self.headerCookie)
    curl.setopt(pycurl.VERBOSE, 0)
    curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:46.0) Gecko/20100101 Firefox/46.0')
    #curl.setopt(pycurl.HTTPGET,1)
    #curl.setopt(pycurl.COOKIE, Cookie)
    #curl.setopt(pycurl.POSTFIELDS, 'j_username={ngnms_user}&j_password={ngnms_password}'.format(**self.ngnms_login))
    curl.setopt(pycurl.COOKIEJAR, '/htdocs/logs/py_cookie.txt')   # persist cookies across calls
    curl.setopt(pycurl.COOKIEFILE, '/htdocs/logs/py_cookie.txt')
    if refUrl:
        curl.setopt(pycurl.REFERER, refUrl)
    #curl.setopt(c.CONNECTTIMEOUT, 5)
    #curl.setopt(c.TIMEOUT, 8)
    curl.perform()
    backinfo = ''
    if curl.getinfo(pycurl.RESPONSE_CODE) == 200:
        backinfo = buf.getvalue()
    curl.close()
    return backinfo
def handle_request(self):
curl_handle = pycurl.Curl()
# set default options.
curl_handle.setopt(pycurl.URL, self.request_url)
curl_handle.setopt(pycurl.REFERER, self.request_url)
curl_handle.setopt(pycurl.USERAGENT, self.useragent)
curl_handle.setopt(pycurl.TIMEOUT, self.curlopts['TIMEOUT'])
curl_handle.setopt(pycurl.CONNECTTIMEOUT, self.curlopts['CONNECTTIMEOUT'])
curl_handle.setopt(pycurl.HEADER, True)
#curl_handle.setopt(pycurl.VERBOSE, 1)
curl_handle.setopt(pycurl.FOLLOWLOCATION, 1)
curl_handle.setopt(pycurl.MAXREDIRS, 5)
    if self.request_headers:
        tmplist = list()
        for (key, value) in self.request_headers.items():
            tmplist.append(key + ':' + value)
        curl_handle.setopt(pycurl.HTTPHEADER, tmplist)
    # tunnel through the proxy and send the request body as a POST
    curl_handle.setopt(pycurl.HTTPPROXYTUNNEL, 1)
    curl_handle.setopt(pycurl.POSTFIELDS, self.request_body)
response = StringIO.StringIO()
curl_handle.setopt(pycurl.WRITEFUNCTION, response.write)
try:
curl_handle.perform()
except pycurl.error as error:
raise ChannelException(error, 5)
self.response_code = curl_handle.getinfo(curl_handle.HTTP_CODE)
header_size = curl_handle.getinfo(curl_handle.HEADER_SIZE)
resp_str = response.getvalue()
self.response_headers = resp_str[0 : header_size]
self.response_body = resp_str[header_size : ]
response.close()
curl_handle.close()
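handle_request splits the response headers from the body using HEADER_SIZE. The same trick in isolation (Python 3; example.com is a placeholder):

import pycurl
from io import BytesIO

buf = BytesIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://example.com/')
c.setopt(pycurl.HEADER, True)         # prepend the status line and headers to the output
c.setopt(pycurl.WRITEDATA, buf)
c.perform()
size = c.getinfo(pycurl.HEADER_SIZE)  # byte length of the header block
raw = buf.getvalue()
headers, body = raw[:size], raw[size:]
c.close()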
import urllib.parse
from io import BytesIO
import pycurl

def ccurlPost(url, value):
    hdr = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:45.0) Gecko/20100101 Firefox/45.0"
    c = pycurl.Curl()
    if value == "no_redir":
        print("no redirect")
    else:
        c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.USERAGENT, hdr)
    if value != "" and value != "no_redir":
        post_data = {'id': value}
        post_d = urllib.parse.urlencode(post_data)
        c.setopt(c.POSTFIELDS, post_d)  # setting POSTFIELDS turns the request into a POST
    #if rfr != "":
    #    c.setopt(pycurl.REFERER, rfr)
    url = str(url)
    c.setopt(c.URL, url)
    storage = BytesIO()
    c.setopt(c.WRITEDATA, storage)
    c.perform()
    c.close()
    content = storage.getvalue()
    content = getContentUnicode(content)  # project-local helper: decode bytes to str
    return content
import json
import urllib
import pycurl
from StringIO import StringIO

# const is a project-local module; get_download_link is defined above.
def list_dir(dir_name):
    """
    List the files under a Baidu Pan directory and collect their download links.
    :param dir_name: directory path on the Pan server
    :return: list of download links
    """
result = list()
curl = pycurl.Curl()
curl.setopt(pycurl.USERAGENT, const.USER_AGENT)
curl.setopt(pycurl.REFERER, const.PAN_REFER_URL)
buffers = StringIO()
request_dict = {
'channel': 'chunlei',
'clienttype': 0,
'showempty': 0,
'web': 1,
'order': 'time',
'desc': 1,
'page': 1,
'num': 100,
'dir': dir_name,
'bdstoken': 'e0e895bb3ef7b0cb70899ee66b74e809'
}
target_url = const.PAN_API_URL + 'list?' + urllib.urlencode(request_dict)
curl.setopt(pycurl.URL, target_url)
curl.setopt(pycurl.WRITEDATA, buffers)
curl.setopt(pycurl.COOKIEFILE, "cookie.txt")
curl.perform()
body = buffers.getvalue()
print body
buffers.close()
curl.close()
data = json.loads(body)
if data['errno'] == 0:
for a_list in data['list']:
dlink = get_download_link(a_list['fs_id'])
if dlink:
dlink = dlink.replace('\\', '')
result.append(dlink)
return result
import json
import pycurl
from StringIO import StringIO

# const, get_record_start_cursor, set_record_start_cursor, decode_url and
# save_to_file are project-local helpers.
def get_dlinks(search_target, get_dlinks_only=True):
    """
    Fetch the download links of every image returned for a search keyword.
    :param search_target: search keyword
    :param get_dlinks_only: if True, only collect the links; otherwise also save them to a file
    :return: list of image download links
    """
refer_url = const.REFER_URL % search_target
curl = pycurl.Curl()
curl.setopt(pycurl.USERAGENT, const.USER_AGENT)
curl.setopt(pycurl.REFERER, refer_url)
result = []
ll = 0
record_start_cursor = get_record_start_cursor(const.CURSOR_FILE)
if record_start_cursor:
ll = int(record_start_cursor)
print('start')
    # page through the results until the API returns no more data
    while True:
        print('crawling pictures of page %d' % (ll / 30 + 1))
        # in-memory buffer for this page's response
buffers = StringIO()
target_url = const.API_URL % (search_target, search_target, ll)
curl.setopt(pycurl.URL, target_url)
curl.setopt(pycurl.WRITEDATA, buffers)
curl.perform()
body = buffers.getvalue()
        data = json.loads(body)  # parse the JSON response; null maps to None (replaces the old replace()/eval() hack)
if 'data' in data:
has_data = False
for a_data in data['data']:
obj_url = None
if 'objURL' in a_data:
obj_url = a_data['objURL']
if obj_url:
has_data = True
result.append(obj_url)
if not has_data:
print('no more pic')
break
ll += 30
else:
print('no more pic')
break
print('done')
curl.close()
    # persist the cursor so the next run resumes from this page
    if ll:
        set_record_start_cursor(str(ll), const.CURSOR_FILE)
for index, data in enumerate(result):
result[index] = decode_url(data)
    if not get_dlinks_only:
        save_to_file(result, search_target + '.txt', const.BASE_FOLDER)
    return result