def process_item(self, item, spider):
    """Download the images listed in ``item['image_urls']`` to local disk.

    Appears to be a Scrapy item-pipeline hook (signature and use of
    ``spider.name`` / ``settings['IMAGES_STORE']``) -- confirm against the
    enclosing class.

    Files are written to
    ``<IMAGES_STORE>/<spider.name>/<id[0]>/<id[1]>/<id>/<file name>``, where
    ``id`` is ``item['id']`` (assumed to be a string of at least two
    characters -- TODO confirm with the item definition).

    Downloading is best effort: a failed download is logged and skipped, and
    never leaves a partial file at the final path, so it is retried the next
    time the same item is processed. Files that already exist are not
    downloaded again.

    Args:
        item: Mapping-like item. Items without an ``'image_urls'`` key are
            returned untouched.
        spider: Object whose ``name`` becomes the first path component.

    Returns:
        The same ``item``. When ``'image_urls'`` is present, ``item['images']``
        is set to a list of ``(image_url, file_path)`` tuples (one per URL,
        whether or not its download succeeded), and, if that list is
        non-empty, ``item['firstimage']`` is set to the first image's path
        relative to ``IMAGES_STORE``.
    """
    if 'image_urls' not in item:
        return item

    # Function-scope import: the module's import block is not visible from
    # this block, so the dependency is brought in locally.
    import logging
    logger = logging.getLogger(__name__)

    abpath = '%s/%s/%s/%s' % (
        spider.name, item['id'][0], item['id'][1], item['id'])
    dir_path = '%s/%s' % (settings['IMAGES_STORE'], abpath)
    if item['image_urls'] and not os.path.exists(dir_path):
        os.makedirs(dir_path)

    images = []
    for image_url in item['image_urls']:
        name = image_url.split('/')[-1]
        # Cut everything from the last '!' onward, but only when it sits past
        # index 4 (presumably a CDN style/size suffix -- TODO confirm).
        bang = name.rfind('!')
        if bang > 4:
            name = name[:bang]
        # Same character set as the original alternation, written as a raw
        # character class so it contains no invalid string escapes.
        name = re.sub(r'[\\/:*?"<>]', '_', name)
        # Keep only the last 100 characters of the name.
        file_path = '%s/%s' % (dir_path, name[-100:])
        images.append((image_url, file_path))
        if os.path.exists(file_path):
            continue

        # Stream into a side file and rename only after a complete download,
        # so an interrupted transfer can never be mistaken for a finished
        # image by the os.path.exists() check above.
        part_path = file_path + '.part'
        try:
            response = requests.get(image_url, stream=True, timeout=30)
            try:
                # Do not store HTTP error pages as if they were images.
                response.raise_for_status()
                with open(part_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            finally:
                response.close()
            os.rename(part_path, file_path)
        except (requests.RequestException, IOError, OSError) as exc:
            logger.warning(
                'failed to download %s to %s: %s', image_url, file_path, exc)
            if os.path.exists(part_path):
                try:
                    os.remove(part_path)
                except OSError:
                    # Cleanup is best effort; the '.part' name keeps the
                    # leftover from being treated as a finished image.
                    pass

    item['images'] = images
    if images:
        first_path = images[0][1]
        first_name = first_path[first_path.rfind('/') + 1:]
        item['firstimage'] = '%s/%s' % (abpath, first_name)
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(item['firstimage'])
    return item
评论列表
文章目录