def parse(self, response):
    """Yield one ``XueshuItem`` per search-result entry found in ``response``.

    Every ``//div[@srcid]`` node is treated as one result. Title, link,
    author, publication, year and citation count are read from the node's
    first child ``div``; the subject is read from its second child ``div``.
    Each of those fields is stored as the list returned by ``extract()``.

    The abstract comes from ``self.get_abstract``, which is given the
    entry's href prefixed with ``http://xueshu.baidu.com``. When an entry
    has no ``h3/a/@href`` there is no URL to pass, so ``item['abstract']``
    is set to ``None`` instead of raising ``IndexError`` -- an exception
    here would stop the generator and lose the remaining results.
    """
    for result in response.xpath('//div[@srcid]'):
        item = XueshuItem()
        for cell in result.xpath('div[1]'):
            # Evaluate the href XPath once; it feeds both the stored link
            # field and the URL handed to get_abstract.
            hrefs = cell.xpath('h3/a/@href').extract()

            item['title'] = cell.xpath('h3//a//text()').extract()
            item['link'] = hrefs
            item['author'] = cell.xpath('div[1]/span[1]//a/text()').extract()
            item['publish'] = cell.xpath('div[1]/span[2]/a/@title').extract()
            item['year'] = cell.xpath('div[1]/span[3]/text()').extract()
            item['cite'] = cell.xpath('div[1]/span[4]/a/text()').extract()

            if hrefs:
                # NOTE(review): plain concatenation presumes the href is a
                # site-relative path starting with "/" -- confirm.
                link = 'http://xueshu.baidu.com' + hrefs[0]
                item['abstract'] = self.get_abstract(link)
            else:
                # No link on this entry: nothing to look up.
                item['abstract'] = None

        # The subject sits in the result's second child div, outside `cell`.
        item['subject'] = result.xpath('div[2]/div[1]//a/text()').extract()
        yield item
评论列表
文章目录