# zycggov_spider.py -- Python source code snippet

def parse(self, response):
        """Yield one ``SiteItem`` per listing row, then follow the next page.

        Reads at most the first 20 ``<li>`` elements found under
        ``<ul class="lby-list">``.  For each row an item is yielded with:

        * ``pubtime`` -- characters 1..10 of the stripped ``<span>`` text
          (presumably a bracketed ``YYYY-MM-DD`` date; confirm against the
          live page).  Left unset when the row has no such text.
        * ``title``   -- first text node found inside the row's ``<a>``
          (``None`` when there is none).
        * ``link``    -- the ``<a>`` href prefixed with the site root.  Left
          unset when the row has no href.

        After the rows, if the last non-empty ``pubtime`` seen equals
        ``date.get_curdate()`` and the page has an ``<a class="next_page">``
        link, a request for that page is yielded with this method as its
        callback.

        :param response: the Scrapy response for one listing page.
        :returns: a generator of ``SiteItem`` objects, optionally followed by
            one ``scrapy.FormRequest``.
        """
        base_url = "http://www.zycg.gov.cn"

        rows = response.xpath('//ul[@class="lby-list"]//li')
        pubtime = None
        for row in rows[:20]:
            item = SiteItem()

            # extract_first() returns None when nothing matches; fall back to
            # an empty string so a row without a <span> does not crash the
            # whole callback on .strip().
            span_text = row.xpath('span/text()').extract_first() or ''
            row_pubtime = span_text.strip()[1:11]
            if row_pubtime:
                item['pubtime'] = row_pubtime
                # Remember the most recent non-empty date; it decides below
                # whether the next page is worth fetching.
                pubtime = row_pubtime

            item['title'] = row.xpath('a//text()').extract_first()
            # Single-argument parenthesised form: same output whether print
            # is a statement or a function.
            print("------------------------------{}----".format(item['title']))

            # Test and use the same href value, so the link that gets stored
            # is exactly the one that was checked.
            href = row.xpath('a/@href').extract_first()
            if href:
                item['link'] = base_url + href
            yield item

        # Only keep paginating while the last dated row on this page equals
        # the value returned by date.get_curdate().
        if pubtime == date.get_curdate():
            next_href = response.xpath(
                '//a[@class="next_page"]//@href').extract_first()
            # Without this guard a page that has no next link would produce a
            # request for the bogus URL "<base_url>None".
            if next_href:
                yield scrapy.FormRequest(base_url + next_href,
                                         callback=self.parse)