spider.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:python_tool 作者: hunterhug 项目源码 文件源码
def getHtml(url, daili='', postdata=None, header=None):
    """Fetch ``url`` and return the raw response body as bytes.

    Cookies are persisted between calls in ``cookie.txt`` (Mozilla/Netscape
    format) in the current working directory: the jar is loaded from that
    file when it exists and written back after a successful request.

    Args:
        url: Address to request.
        daili: Optional HTTP proxy as ``host:port``. When non-empty, plain
            ``http`` requests are routed through ``http://<daili>``.
        postdata: Optional mapping of form fields. When non-empty it is
            URL-encoded and sent as the request body (making it a POST);
            otherwise a GET is issued.
        header: Optional list of ``(name, value)`` tuples used as the
            opener's default headers. When empty or omitted, a built-in
            mobile-Safari header set is used, including a ``Cookie`` header
            taken from ``../subcookie.txt`` if that file exists.

    Returns:
        The response body, unmodified, as ``bytes``.

    Side effects:
        Installs the constructed opener as the process-wide default via
        ``urllib.request.install_opener`` and writes ``cookie.txt``.
    """
    # File that carries the cookie jar across calls.
    filename = 'cookie.txt'

    cj = http.cookiejar.MozillaCookieJar(filename)
    # Reuse cookies from a previous run, including session ("discard")
    # cookies and ones that have already expired.
    if os.path.exists(filename):
        cj.load(filename, ignore_discard=True, ignore_expires=True)

    # Optional hand-supplied cookie string for the default header set.
    if os.path.exists('../subcookie.txt'):
        with open('../subcookie.txt', 'r') as cookie_file:
            # Strip surrounding whitespace: a trailing newline would make the
            # header value illegal and http.client would raise ValueError.
            cookie = cookie_file.read().strip()
    else:
        cookie = ''

    cookie_processor = urllib.request.HTTPCookieProcessor(cj)
    if daili:
        # NOTE(review): the message text looks mis-encoded in this copy of
        # the file; it is left unchanged because it is runtime output.
        print('??:' + daili + '??')
        proxy_support = urllib.request.ProxyHandler({'http': 'http://' + daili})
        opener = urllib.request.build_opener(proxy_support, cookie_processor,
                                             urllib.request.HTTPHandler)
    else:
        opener = urllib.request.build_opener(cookie_processor)

    if not header:
        header = [('User-Agent',
                   'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'),
                  ('Referer', 'http://s.m.taobao.com'),
                  ('Host', 'h5.m.taobao.com'),
                  ('Cookie', cookie)]
    opener.addheaders = header

    # Kept for backward compatibility: later bare urlopen() calls elsewhere
    # in the process will also go through this opener.
    urllib.request.install_opener(opener)

    # A non-empty body turns the request into a POST.
    data = urllib.parse.urlencode(postdata).encode() if postdata else None

    # Use the opener directly and close the response deterministically.
    with opener.open(url, data) as response:
        html_bytes = response.read()

    # Persist whatever cookies the server set during this request.
    cj.save(ignore_discard=True, ignore_expires=True)
    return html_bytes
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号