sightcrawl.py 文件源码-python代码片段

def picture_parse(self, response):
        """Parse one image-result page into items and follow the next page.

        For every ``li.imgitem`` entry on the page a new ``SightItem`` is
        built from the sight carried in ``response.meta['item']`` (title,
        lng, lat, description, category) plus the image URL decoded from the
        ``objurl=`` query parameter of the entry's link.  The decoded URL is
        passed through ``jpg_test`` and the result handed to ``save_img``
        together with the sight's ``id_num``.

        After the entries, a request for the next result page is yielded when
        a next-page link exists and the current page number is numerically
        below ``PAGE_NUM``.

        :param response: page response; ``response.meta['item']`` must hold
            the sight whose pictures are being collected.
        :returns: generator of ``SightItem`` objects, optionally followed by
            one ``scrapy.Request`` for the next page.

        NOTE(review): each yielded item carries only the six fields set
        below; if a pipeline also needs ``id_num`` it has to be copied here
        as well -- confirm against ``SightItem`` and the pipelines.
        """
        log.msg('run into picture_parse at line 66', level=log.INFO)
        item = response.meta['item']
        host_address = 'http://image.baidu.com'
        path = response.xpath('//*[@id="page"]/a[10]/@href').extract_first()
        page_num = response.xpath('//*[@id="page"]/strong/span/text()').extract_first()
        log.msg('page_num is %s' % page_num, level=log.INFO)

        for option in response.xpath('//div[@id="imgid"]/ul[@class="imglist"]/li[@class="imgitem"]'):
            # One independent item per image: yielding the shared meta item
            # would hand the pipeline the same object over and over while its
            # 'url' keeps being overwritten by later iterations.
            item_final = SightItem()
            for field in ('title', 'lng', 'lat', 'description', 'category'):
                item_final[field] = item[field]

            href = option.xpath('a/@href').extract_first()
            match = re.search(r'.*objurl=(http.*?)&.*', href) if href else None
            if match is None:
                # Entry without a usable link: record the placeholder instead
                # of letting one bad <li> abort the whole page.
                log.msg('img_src is null==============', level=log.INFO)
                item_final['url'] = 'url_null'
                log.msg('run out picture_parse at line 92', level=log.INFO)
                yield item_final
                continue

            img_src = urllib.unquote(match.group(1)).encode('utf-8')
            print('img_src: %s ========================****==============' % img_src)
            img_url = jpg_test(img_url=img_src)
            print('function jpg_test img_url is: %s ****************************' % img_url)
            try:
                print('id_num: %s' % item['id_num'])
                save_img(img_url=img_url, id_num=item['id_num'])
            except TypeError as e:
                # Best effort: the original treats a TypeError here as
                # "jpg_test returned None" and carries on with the next step.
                log.msg('img url is NoneType in function picture_parse at line 103: {0}'.format(e), level=log.INFO)

            if img_src:
                item_final['url'] = img_src
            else:
                item_final['url'] = 'url_null'
                log.msg('img_src is null==============' + img_src, level=log.INFO)
            log.msg('img_src in line 61***********' + img_src + '; type: %s ' % type(img_src), log.INFO)
            log.msg('run out picture_parse at line 92', level=log.INFO)
            yield item_final

        if not path:
            # No next-page link: nothing to follow (and no URL to build).
            return

        # The page number is scraped as text; compare it numerically.  A
        # plain `<` between text and a number is not a numeric comparison.
        try:
            below_limit = int(page_num) < int(PAGE_NUM)
        except (TypeError, ValueError):
            log.msg('page_num is not a number, not following next page: %r' % page_num, level=log.INFO)
            below_limit = False

        if below_limit:
            log.msg('***************path**************\r\n' + path, level=log.INFO)
            url = host_address + path
            yield scrapy.Request(url, meta={'item': item,
                                            'splash': {
                                                'endpoint': 'render.html',
                                                'args': {'wait': 0.5}
                                            }
                                            }, callback=self.picture_parse)

            # def next_page_parse(self, response):