spider.py 文件源码-python代码片段

spider.py 文件源码

python

阅读 30 收藏 0 点赞 0 评论 0

项目：autoinjection 作者: ChengWiLL 项目源码文件源码

def parse(self, response):
        """Crawl the anchors of a page, following and recording parameterised links.

        For every ``<a>`` element whose ``href`` contains a query string
        (a ``?``), the link is resolved against the current page, scheduled
        for crawling with this same callback, and emitted as a
        ``GovcrawlItem`` with the fields ``link`` and ``title``.

        :param response: the downloaded page handed to this callback.
        :returns: a generator yielding, for each accepted link, a
            ``scrapy.Request`` followed by a ``GovcrawlItem``.
        """
        sel = scrapy.Selector(response)

        for info in sel.xpath("//a"):
            link = info.xpath('@href').extract()
            if not link:
                continue
            href = link[0]

            # Only links that carry a query string are kept.
            if "?" not in href:
                continue

            # Resolve relative, root-relative and protocol-relative hrefs
            # against the page URL. Plain string concatenation onto
            # response.url produces invalid URLs such as
            # "http://host/a/b.html/c?id=1".
            url = response.urljoin(href)

            # Skip anything that is not plain HTTP(S) (javascript:, mailto:,
            # ...): such URLs cannot be downloaded and may be rejected when
            # the Request is constructed.
            if not url.lower().startswith(("http://", "https://")):
                continue

            yield scrapy.Request(url, callback=self.parse)

            item = GovcrawlItem()
            item['link'] = url
            # Only the first direct text node of the anchor is used as title.
            title = info.xpath('text()').extract()
            item['title'] = title[0] if title else None
            yield item