movie.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:DIS_MeituanReptile 作者: myvary 项目源码 文件源码
def downloader_html_ph(url, up_num):  ##??PhantomJS??????
    '''
    url        :??????url
    up_num     :?????
    '''
    # print driver.service
    print '????????!    URL?', url, '    ?????:', up_num
    conf = {}
    for line in fileinput.input("..//..//abuyun.conf"):
        lines = line.replace(' ', '').replace('\n', '').split("=")
        conf[lines[0]] = lines[1]
    # ?????
    proxyHost = conf["proxyHost"]
    proxyPort = conf["proxyPort"]
    # ???????????
    proxyUser = conf["proxyUser"]
    proxyPass = conf["proxyPass"]
    service_args = [
        "--proxy-type=http",
        "--proxy=%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
        },
        "--proxy-auth=%(user)s:%(pass)s" % {
            "user": proxyUser,
            "pass": proxyPass,
        },
    ]
    phantomjs_path = r"phantomjs"
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # ?????UA??????????
    ua = rad_ua()  ##?????UA
    dcap["phantomjs.page.settings.userAgent"] = ua
    # ,service_args=service_args ?????
    driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=phantomjs_path)
    driver.get(url)
    time.sleep(2)
    ##???????????????
    dian = ''
    print '?????',
    for i in range(up_num):
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        ##??????????????
        time.sleep(2)
        dian = dian + '.'
        print '.',

    print driver.current_url, '?????????????'

    data = driver.page_source.encode("utf-8")
    # ??????
    html_parser = HTMLParser.HTMLParser()
    data = html_parser.unescape(data)
    return data
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号