def parse(self, response):
    """Crawl the anchors of a page, yielding follow-up requests and items.

    For every ``<a>`` element whose ``href`` contains both a ``/`` and a
    ``?``, the link is resolved to an absolute URL and two things are
    yielded: a ``scrapy.Request`` for that URL (handled again by this
    method, so the crawl is recursive) and a ``GovcrawlItem`` carrying the
    URL under ``'link'`` and the anchor's first text node under
    ``'title'`` (``None`` when the anchor has no direct text).

    Args:
        response: The downloaded page passed in by Scrapy.

    Yields:
        ``scrapy.Request`` and ``GovcrawlItem`` objects, interleaved per
        accepted link.
    """
    for anchor in scrapy.Selector(response).xpath("//a"):
        hrefs = anchor.xpath('@href').extract()
        if not hrefs:
            continue
        href = hrefs[0].strip()

        # Only hrefs that contain both a path separator and a query string
        # are considered; everything else is skipped.
        if "/" not in href or "?" not in href:
            continue

        # Resolve against the page URL instead of concatenating strings:
        # appending to response.url produced malformed addresses such as
        # "http://host/dir/index.html/a?b=1" and discarded "../" prefixes.
        url = response.urljoin(href)

        # Hrefs such as "javascript:..." or "mailto:..." cannot be fetched,
        # and scrapy.Request rejects URLs without a usable scheme.
        if not url.startswith(("http://", "https://")):
            continue

        yield scrapy.Request(url, callback=self.parse)

        item = GovcrawlItem()
        item['link'] = url
        titles = anchor.xpath('text()').extract()
        item['title'] = titles[0] if titles else None
        yield item
评论列表
文章目录