extract_dmoz_links.py 文件源码-python代码片段

extract_dmoz_links.py 文件源码

python

阅读 17 收藏 0 点赞 0 评论 0

项目：scrapy-streaming 作者: scrapy-plugins 项目源码文件源码

def response_category(response):
    global pending_requests
    # this response is no longer pending
    pending_requests -= 1

    # using scrapy selector
    selector = Selector(text=response['body'])
    # get div with link and title
    divs = selector.css('div.title-and-desc')

    for div in divs:
        url = div.css("a::attr('href')").extract_first();
        title = div.css("a > div.site-title::text").extract_first();
        result[title] = url

    # if finished all requests, we can close the spider
    if pending_requests == 0:
        # serialize the extracted data and close the spider
        open('outputs/dmoz_data.json', 'w').write(json.dumps(result))
        write_line('{"type": "close"}')