from scrapy.selector import Selector

def response_parse(response):
    global pending_requests
    # use a Scrapy Selector to extract data from the HTML body
    selector = Selector(text=response['body'])
    # collect the relative URLs of the subcategory pages
    for href in selector.css("#subcategories-div > section > div > div.cat-item > a::attr(href)"):
        # count the number of requests still pending
        pending_requests += 1
        # open a new request for each subcategory
        write_line('''
            {
                "type": "request",
                "id": "category",
                "url": "http://www.dmoz.org%s"
            }
        ''' % href.extract())
extract_dmoz_links.py source code (Python)
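The snippet above relies on a module-level pending_requests counter and a write_line helper that are defined elsewhere in extract_dmoz_links.py. A minimal sketch of that scaffolding, assuming write_line simply emits one JSON message per line on stdout for a consuming runner process, might look like this (the names and behavior here are assumptions, not the file's actual implementation):

import sys

# Assumed scaffolding: initialize the counter before response_parse runs.
pending_requests = 0

def write_line(message):
    # Write one JSON message per line to stdout and flush so the
    # consuming process sees each request immediately.
    sys.stdout.write(message.strip() + '\n')
    sys.stdout.flush()

With something like this in place, each subcategory found by response_parse produces one JSON "request" line, and pending_requests can be decremented as the corresponding responses come back to know when the crawl is finished.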