def get_media_requests(self, item, info):
    """Build one download request per ``<img>`` element found in the item.

    ``item['content']`` may be raw markup (``str``/``bytes``) or an already
    parsed tree.  Raw markup is parsed with ``item['encoding']`` and the
    parsed tree is stored back into ``item['content']``.

    The attribute holding the image URL defaults to ``'src'`` and can be
    overridden via ``self.spiderinfo.spider.image_url_attr``.

    Args:
        item: Mapping providing ``'content'``, and, when needed,
            ``'encoding'`` (raw content) and ``'link'`` (base URL used to
            resolve non-absolute image URLs).
        info: Unused by this method.

    Returns:
        A list of ``Request`` objects, each carrying its originating
        ``<img>`` element under ``meta['img']``.  Inline ``data:`` images,
        empty attribute values and URLs rejected by ``Request`` are left
        out (the latter are logged).
    """
    doc = item['content']
    if isinstance(doc, (str, bytes)):
        doc = fromstring(doc, parser=HTMLParser(encoding=item['encoding']))
        # Cache the parsed tree on the item so it is not parsed again.
        item['content'] = doc

    try:
        attr = self.spiderinfo.spider.image_url_attr
    except AttributeError:
        attr = 'src'

    reqs = []
    for e in doc.xpath('//img'):
        url = (e.get(attr) or '').strip()
        if not url:
            # Missing or empty attribute: nothing to download.  (Joining an
            # empty string would yield the page URL itself.)
            continue
        if url[:5].lower() == 'data:':
            # Inline image payload, not a fetchable URL.  Only the scheme
            # prefix is tested so a relative path like "data/pic.png" is
            # still downloaded.
            continue
        if not url.startswith(('http://', 'https://')):
            # Resolve root-relative, document-relative and protocol-relative
            # URLs against the page link; urljoin returns URLs that carry
            # another scheme unchanged.
            url = urljoin(item['link'].strip(), url)
        if url.startswith('//'):
            # Still scheme-less (the base link had no scheme): assume http.
            url = 'http:' + url
        try:
            r = Request(url, meta={'img': e})
        except ValueError:
            logger.error((
                'Error in pipeline image create Request[{}]'
            ).format(url))
        else:
            reqs.append(r)
    return reqs
评论列表
文章目录