def page_item(self, response: HtmlResponse) -> Item:
    """Build the item describing a crawled HTML page.

    Media URLs are collected only when the ``FILES_STORE`` setting is
    truthy. When ``AUTOLOGIN_ENABLED`` is truthy and the queue has not yet
    recorded a login form for this URL, the page's forms are inspected and
    a detected login form is both registered with the queue and flagged in
    the item metadata.

    Args:
        response: The downloaded page. ``response.meta`` is read for the
            ``parent`` and ``depth`` keys, and ``response.request`` for
            its ``priority``.

    Returns:
        Whatever ``text_cdr_item`` returns for ``response``, given the
        collected media URLs as ``objects`` and the crawl metadata as
        ``metadata``.
    """
    def extract_urls(link_extractor):
        # URLs of all links the given extractor finds in this response,
        # in the order the extractor returns them.
        return [link.url for link in link_extractor.extract_links(response)]

    media_urls = []
    if self.settings.get('FILES_STORE'):
        media_urls.extend(extract_urls(self.images_le))
        file_urls = extract_urls(self.files_le)
        # File links also matched by ``self.le`` are excluded
        # (presumably because those are handled as regular page links -
        # TODO confirm against the extractor definitions).
        page_link_urls = set(extract_urls(self.le))
        # dict.fromkeys de-duplicates while keeping extractor order, so the
        # resulting list is deterministic (a plain set difference is not).
        media_urls.extend(dict.fromkeys(
            url for url in file_urls if url not in page_link_urls))

    metadata = {
        'id': _url_hash(response.url, as_bytes=False),
        'parent': _url_hash_as_str(response.meta.get('parent')),
        'depth': response.meta.get('depth'),
        'priority': response.request.priority,
    }

    if (self.settings.get('AUTOLOGIN_ENABLED')
            and not self.queue.has_login_form(response.url)):
        forms = extract_forms(response.text, fields=False)
        # One login form is enough: register the URL once instead of once
        # per matching form, and stop scanning at the first match.
        if any(form_meta.get('form') == 'login' for _, form_meta in forms):
            self.queue.add_login_form(response.url)
            metadata['has_login_form'] = True

    return text_cdr_item(
        response,
        crawler_name=self.settings.get('CDR_CRAWLER'),
        team_name=self.settings.get('CDR_TEAM'),
        objects=media_urls,
        metadata=metadata,
    )
评论列表
文章目录