# Imports needed by this excerpt; setting up the logger via logging.getLogger
# is an assumption, and spiders_to_crawl()/run_cleanup_cache() are helpers
# defined elsewhere in the module (sketched below).
import logging

from scrapy.crawler import CrawlerProcess

logger = logging.getLogger(__name__)


def crawl(ctx, spiders, stats):
"""
Crawl one or many or all pages.
What spider(s) to run is determined in the following order:
1. Spider(s) given as argument(s)
2. Spider(s) specified in the configuration file
Note that if a spider is given as an argument, the spiders in the
configuration file are ignored. All available spiders will be used to
crawl if no arguments are given and no spiders are configured.
"""
    settings = ctx.obj['settings']
    if stats:
        # Keep crawl statistics in memory so they can be reported after the
        # run finishes.
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        # Remove stale entries from the HTTP cache once the crawl is done.
        run_cleanup_cache(settings)
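
The spider-selection order described in the docstring is delegated to spiders_to_crawl(), which is not part of this excerpt. Below is a minimal sketch of how that helper could work, assuming the configured spiders are exposed through a 'SPIDERS' setting and that Scrapy's spider loader is used to enumerate all available spiders; the project's actual implementation may differ.

def spiders_to_crawl(process, argument_spiders):
    # 1. Spiders given as command-line arguments take precedence.
    if argument_spiders:
        return list(argument_spiders)
    # 2. Fall back to spiders named in the configuration file (assumed to be
    #    exposed through a 'SPIDERS' setting here).
    configured_spiders = process.settings.getlist('SPIDERS')
    if configured_spiders:
        return configured_spiders
    # 3. Otherwise crawl every spider known to the project.
    return process.spider_loader.list()

With a helper like this, calling the command with no arguments and no configured spiders crawls everything, which matches the last sentence of the docstring.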
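
run_cleanup_cache() is also defined outside this excerpt. The sketch below is an assumption-laden illustration only: it presumes the default filesystem cache storage, a directly accessible HTTPCACHE_DIR, and the standard HTTPCACHE_EXPIRATION_SECS setting. In a real project the cache directory usually lives under the project's .scrapy data directory, so path resolution would be more involved.

import os
import time

def run_cleanup_cache(settings):
    # Hypothetical cleanup: drop cache entries older than the expiration
    # window and prune directories that become empty.
    cache_dir = settings.get('HTTPCACHE_DIR', 'httpcache')
    max_age = settings.getint('HTTPCACHE_EXPIRATION_SECS', 0)
    if max_age <= 0 or not os.path.isdir(cache_dir):
        return  # nothing to clean up
    now = time.time()
    for root, dirs, files in os.walk(cache_dir, topdown=False):
        for name in files:
            path = os.path.join(root, name)
            if now - os.path.getmtime(path) > max_age:
                os.remove(path)
        if not os.listdir(root):
            os.rmdir(root)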