def response_category(response):
    """Collect site links from one category page and finish when all are done.

    Decrements the module-level ``pending_requests`` counter, parses
    ``response['body']`` with a scrapy ``Selector``, and stores a
    ``title -> url`` entry in the module-level ``result`` mapping for every
    ``div.title-and-desc`` element found.  When the counter reaches zero the
    accumulated ``result`` is written as JSON to ``outputs/dmoz_data.json``
    and a ``{"type": "close"}`` message is emitted through ``write_line``.

    Args:
        response: Mapping-like object whose ``'body'`` entry holds the page
            markup passed to ``Selector(text=...)``.
    """
    global pending_requests
    # This response is no longer pending.
    pending_requests -= 1

    # Parse the body with a scrapy selector and grab each div that holds a
    # link together with its title.
    selector = Selector(text=response['body'])
    for div in selector.css('div.title-and-desc'):
        url = div.css("a::attr('href')").extract_first()
        title = div.css("a > div.site-title::text").extract_first()
        # NOTE(review): a later entry with the same title replaces the
        # earlier one — confirm that titles are expected to be unique.
        result[title] = url

    # Once every request has been answered, persist the data and close.
    if pending_requests == 0:
        # Use a context manager so the file is flushed and closed before the
        # close message is sent, instead of relying on garbage collection.
        with open('outputs/dmoz_data.json', 'w') as output_file:
            output_file.write(json.dumps(result))
        write_line('{"type": "close"}')
extract_dmoz_links.py 文件源码
python
阅读 17
收藏 0
点赞 0
评论 0
评论列表
文章目录