xueshu_spider.py 文件源码-python代码片段

xueshu_spider.py 文件源码

python

阅读 19 收藏 0 点赞 0 评论 0

项目：XueshuCrawler 作者：rollingstarky 项目源码 | 文件源码

def parse(self, response):
    """Yield one ``XueshuItem`` per ``//div[@srcid]`` node in *response*.

    For each matched node, the first child ``div`` supplies the ``title``,
    ``link``, ``author``, ``publish``, ``year``, ``cite`` and ``abstract``
    fields, and the second child ``div`` supplies ``subject``. Every
    XPath-backed field is stored as the list returned by ``extract()``.

    If a result has no ``h3/a/@href``, the abstract lookup is skipped and
    ``item['abstract']`` is left unset. Indexing the empty href list would
    raise ``IndexError``, which ends this generator and drops every
    remaining result on the page.

    Args:
        response: The response object for a search-results page; only its
            ``xpath()`` method is used here.

    Yields:
        A populated ``XueshuItem`` for each result node.
    """
    for sel in response.xpath('//div[@srcid]'):
        item = XueshuItem()
        for cell in sel.xpath('div[1]'):
            # Evaluate the href XPath once; it feeds both the stored link
            # field and the URL used for the abstract lookup.
            hrefs = cell.xpath('h3/a/@href').extract()

            item['title'] = cell.xpath('h3//a//text()').extract()
            item['link'] = hrefs
            item['author'] = cell.xpath('div[1]/span[1]//a/text()').extract()
            item['publish'] = cell.xpath('div[1]/span[2]/a/@title').extract()
            item['year'] = cell.xpath('div[1]/span[3]/text()').extract()
            item['cite'] = cell.xpath('div[1]/span[4]/a/text()').extract()

            if hrefs:
                # The first href is appended to the site root unchanged.
                link = 'http://xueshu.baidu.com' + hrefs[0]
                # NOTE(review): get_abstract is defined outside this block;
                # presumably it fetches the detail page - confirm whether
                # it blocks the crawl.
                item['abstract'] = self.get_abstract(link)
        item['subject'] = sel.xpath('div[2]/div[1]//a/text()').extract()
        yield item