iptime_kr_crawler.py 文件源码-python代码片段

iptime_kr_crawler.py 文件源码

python

阅读 32 收藏 0 点赞 0 评论 0

项目：DLink_Harvester 作者: MikimotoH 项目源码文件源码

def walkListItems(sess, url):
    """Crawl one board listing page and visit every new matching entry.

    Fetches ``url`` through ``sess``, takes the second cell of every table
    row under the element with class ``kboard-list`` and, for each linked
    entry that is not yet a key of the module-level ``visited`` mapping and
    whose text contains the filter keyword, records it in ``visited`` and
    hands it to ``walkPageItem``.

    Args:
        sess: Object exposing ``get(url=...)`` whose result has a ``text``
            attribute (presumably a ``requests.Session`` -- confirm against
            the caller).
        url: Address of the listing page; relative links found on the page
            are resolved against it.

    Returns:
        None. Ordinary errors are printed (traceback plus message) instead
        of being raised, so one failing page does not stop the caller.
    """
    global visited

    def replacewhite(text):
        # Collapse each run of spaces / CR / LF / tabs into a single space.
        return re.sub(r'(\ |\r|\n|\t)+', ' ', text)

    try:
        resp = sess.get(url=url)
        root = html.fromstring(resp.text)
        tds = root.xpath(".//*[@class='kboard-list']//tr/td[2]")
        for td in tds:
            anchors = td.xpath(".//a")
            # Rows without a usable link are skipped individually so that
            # they do not abort processing of the remaining rows.
            if not anchors or anchors[0].get('href') is None:
                continue
            href = urljoin(url, anchors[0].get('href'))
            # Drop the paging parameter so the same entry reached from
            # different list pages maps to one key in ``visited``.
            href = re.sub(r'pageid=\d+', '', href)
            if href in visited:
                continue
            text = replacewhite(td.text_content())
            # NOTE(review): '???' looks like a non-ASCII keyword that was
            # lost to an encoding problem -- confirm the intended filter
            # text against the upstream project before relying on it.
            if '???' not in text:
                continue
            print(text)
            visited[href] = text
            walkPageItem(sess, href, text)
    except Exception as ex:
        # ``Exception`` rather than ``BaseException``: KeyboardInterrupt and
        # SystemExit must propagate so the crawler can be stopped.
        traceback.print_exc()
        print(ex)