def picture_parse(self, response):
    """Parse one Baidu image result page and follow the next-page link.

    For every ``li.imgitem`` entry the original image URL is recovered from
    the ``objurl`` query parameter of the entry's link, stored in
    ``item['url']``, passed through ``jpg_test`` and handed to ``save_img``;
    the item received through ``response.meta['item']`` is then yielded.
    Entries without a usable ``objurl`` get ``item['url'] = 'url_null'`` and
    are not downloaded.

    After the entries, a Splash-rendered request for the next page is yielded
    when a next-page link exists and the current page number is below
    ``PAGE_NUM``.

    :param response: response whose ``meta['item']`` holds the item carrying
        ``title``, ``lng``, ``lat``, ``description``, ``category`` and
        ``id_num``.
    :returns: generator of items followed by at most one ``scrapy.Request``.
    """
    log.msg('run into picture_parse at line 66', level=log.INFO)
    item = response.meta['item']
    host_address = 'http://image.baidu.com'
    path = response.xpath('//*[@id="page"]/a[10]/@href').extract_first()
    page_num = response.xpath('//*[@id="page"]/strong/span/text()').extract_first()
    log.msg('page_num is %s' % page_num, level=log.INFO)

    for option in response.xpath('//div[@id="imgid"]/ul[@class="imglist"]/li[@class="imgitem"]'):
        # NOTE(review): item_final is populated but never yielded -- the shared
        # `item` is yielded instead (it also carries 'id_num'). Confirm which
        # object downstream pipelines expect before changing this.
        item_final = SightItem()
        for field in ('title', 'lng', 'lat', 'description', 'category'):
            item_final[field] = item[field]

        href = option.xpath('a/@href').extract_first()
        # A missing href or a link without objurl must not abort the whole
        # page, so treat both as "no image" instead of letting re raise.
        match = re.search(r'.*objurl=(http.*?)&.*', href) if href else None
        if match is None:
            item['url'] = 'url_null'
            log.msg('img_src is null==============%s' % href, level=log.INFO)
        else:
            img_src = urllib.unquote(match.group(1)).encode('utf-8')
            item['url'] = img_src
            print('img_src: %s ========================****==============' % img_src)
            img_url = jpg_test(img_url=img_src)
            print('function jpg_test img_url is: %s ****************************' % img_url)
            try:
                print('id_num: %s' % item['id_num'])
                save_img(img_url=img_url, id_num=item['id_num'])
            except TypeError as e:
                # Presumably raised when jpg_test returned None -- the log
                # message below says so; verify against jpg_test/save_img.
                log.msg('img url is NoneType in function picture_parse at line 103: {0}'.format(e), level=log.INFO)
            log.msg('img_src in line 61***********' + img_src + '; type: %s ' % type(img_src), log.INFO)

        item_final['url'] = item['url']
        log.msg('run out picture_parse at line 92', level=log.INFO)
        yield item

    # The page number is extracted as text; comparing text with a number does
    # not order numerically, so convert before checking the page limit.
    try:
        current_page = int(page_num)
    except (TypeError, ValueError):
        # No readable page number: stop paginating rather than crawl unbounded.
        current_page = None

    # int() is a no-op for an integer PAGE_NUM and also accepts a numeric
    # string -- assumes PAGE_NUM is one of the two; TODO confirm.
    if path and current_page is not None and current_page < int(PAGE_NUM):
        # Build the URL only once a next-page link is known to exist.
        url = host_address + path
        log.msg('***************path**************\r\n' + path, level=log.INFO)
        yield scrapy.Request(
            url,
            meta={
                'item': item,
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 0.5},
                },
            },
            callback=self.picture_parse,
        )
# def next_page_parse(self, response):
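# NOTE: the two lines originally here ("评论列表" / "文章目录", i.e. "comment list" /
# "table of contents") were navigation text from the web page this snippet was
# copied from; they are not part of the program.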