def start_requests(self):
    """Yield the first request for every page template of the spider.

    This overrides the Scrapy hook documented at
    'http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests'.

    Each template in `self.page_templates` is formatted with the starting
    page number, `self.p_kw['start']`, to produce the request URL. The
    important part is the request meta: every request carries an
    'archive_meta' dict with the following entries:

    - 'last_urls': an empty dict,
    - 'p_num': the starting page number,
    - 'next_tries': 0,
    - 'max_next_tries': `self.p_kw['max_next_tries']`,
    - 'page': the unformatted page template.

    The meta is meant to be used when parsing article URLs from the
    response and when generating the next request. Responses are handled
    by `self.parse`.
    """
    for template in self.page_templates:
        # Look the settings up per iteration so that nothing is read from
        # `self.p_kw` when there are no templates at all.
        first_page = self.p_kw['start']
        url = template.format(p_num=first_page)
        archive_meta = {
            'last_urls': {},
            'p_num': first_page,
            'next_tries': 0,
            'max_next_tries': self.p_kw['max_next_tries'],
            'page': template,
        }
        meta = {'archive_meta': archive_meta}
        logger.debug('Page format meta info:\n%s', pprint.pformat(meta))
        yield scrapy.Request(url, callback=self.parse, meta=meta)
评论列表
文章目录