# Imports needed by this excerpt; setting up the logger via logging.getLogger
# is an assumption, and spiders_to_crawl()/run_cleanup_cache() are helpers
# defined elsewhere in the module (sketched below).
import logging

from scrapy.crawler import CrawlerProcess

logger = logging.getLogger(__name__)


def crawl(ctx, spiders, stats):
"""
Crawl one or many or all pages.
What spider(s) to run is determined in the following order:
1. Spider(s) given as argument(s)
2. Spider(s) specified in the configuration file
Note that if a spider is given as an argument, the spiders in the
configuration file are ignored. All available spiders will be used to
crawl if no arguments are given and no spiders are configured.
"""
    settings = ctx.obj['settings']
    if stats:
        # Keep crawl statistics in memory so they can be reported after the
        # run finishes.
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        # Remove stale entries from the HTTP cache once the crawl is done.
        run_cleanup_cache(settings)
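
The spider-selection order described in the docstring is delegated to spiders_to_crawl(), which is not part of this excerpt. Below is a minimal sketch of how that helper could work, assuming the configured spiders are exposed through a 'SPIDERS' setting and that Scrapy's spider loader is used to enumerate all available spiders; the project's actual implementation may differ.

def spiders_to_crawl(process, argument_spiders):
    # 1. Spiders given as command-line arguments take precedence.
    if argument_spiders:
        return list(argument_spiders)
    # 2. Fall back to spiders named in the configuration file (assumed to be
    #    exposed through a 'SPIDERS' setting here).
    configured_spiders = process.settings.getlist('SPIDERS')
    if configured_spiders:
        return configured_spiders
    # 3. Otherwise crawl every spider known to the project.
    return process.spider_loader.list()

With a helper like this, calling the command with no arguments and no configured spiders crawls everything, which matches the last sentence of the docstring.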
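
run_cleanup_cache() is also defined outside this excerpt. The sketch below is an assumption-laden illustration only: it presumes the default filesystem cache storage, a directly accessible HTTPCACHE_DIR, and the standard HTTPCACHE_EXPIRATION_SECS setting. In a real project the cache directory usually lives under the project's .scrapy data directory, so path resolution would be more involved.

import os
import time

def run_cleanup_cache(settings):
    # Hypothetical cleanup: drop cache entries older than the expiration
    # window and prune directories that become empty.
    cache_dir = settings.get('HTTPCACHE_DIR', 'httpcache')
    max_age = settings.getint('HTTPCACHE_EXPIRATION_SECS', 0)
    if max_age <= 0 or not os.path.isdir(cache_dir):
        return  # nothing to clean up
    now = time.time()
    for root, dirs, files in os.walk(cache_dir, topdown=False):
        for name in files:
            path = os.path.join(root, name)
            if now - os.path.getmtime(path) > max_age:
                os.remove(path)
        if not os.listdir(root):
            os.rmdir(root)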