iptime_kr_crawler.py source code

python

Project: DLink_Harvester   Author: MikimotoH
import traceback
from os.path import basename
from urllib.parse import parse_qsl, urljoin, urlsplit

from lxml import html


def walkNextPages(sess, url="https://iptime.com/iptime/?page_id=126&dffid=1&dfsid=11"):
    """Walk the paginated firmware listing, calling walkListItems() on each page.

    walkListItems() is defined elsewhere in this module.
    """
    try:
        def get_pageid(url):
            # The pagination links carry the page index in the "pageid" query parameter.
            qs = dict(parse_qsl(urlsplit(url).query))
            return int(qs.get("pageid", "1"))

        while True:
            pageid = get_pageid(url)
            print("pageid=%d" % pageid)
            walkListItems(sess, url)

            # Re-fetch the page and look for the "next page" arrow image.
            root = html.fromstring(sess.get(url=url).text)
            arrows = [basename(_) for _ in root.xpath(".//ul[@class='pages']//img/@src")]
            if 'next_1.gif' not in arrows:
                break
            nexturl = next(_ for _ in root.xpath(".//ul[@class='pages']//img")
                           if basename(_.attrib['src']) == 'next_1.gif')
            # Follow the <a> ancestor of the arrow image to reach the next page URL.
            url = urljoin(url, nexturl.xpath("../../a/@href")[0])
            nextpageid = get_pageid(url)
            assert nextpageid == pageid + 1
    except BaseException as ex:
        traceback.print_exc()
        print(ex)
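
For context, a minimal driver for this function might look like the sketch below. It assumes the code lives in the same module as walkNextPages above and that the real walkListItems() (defined elsewhere in the project) parses the firmware entries on each listing page; the stub used here is only a hypothetical placeholder so the sketch can run on its own.

# Minimal driver sketch (assumption: requests is installed; the
# walkListItems stub below is hypothetical and only stands in for the
# project's real implementation).
import requests


def walkListItems(sess, url):
    # Hypothetical stub: the real function would extract and record the
    # firmware download links found on this listing page.
    print("visiting", url)


if __name__ == "__main__":
    with requests.Session() as sess:
        sess.headers.update({"User-Agent": "Mozilla/5.0"})
        walkNextPages(sess)  # starts from the default iptime.com listing URL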