from pkg_resources import get_distribution
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from wayback_machine_scraper.mirror_spider import MirrorSpider


def main():
    # parse the command-line arguments (parse_args() is sketched below)
    # and collect the keyword arguments passed through to the spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    # configure the Scrapy crawler settings
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        # route every request through the Wayback Machine middleware
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        # let AutoThrottle adapt the request rate to server latency
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        # 'from' is a Python keyword, so the attribute is read with getattr()
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })
    # start the crawl and block until it finishes
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
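

# main() calls parse_args(), which the excerpt above does not show. The sketch
# below is a hypothetical reconstruction inferred only from the attributes that
# main() reads (domains, output, allow, deny, unix, verbose, concurrency,
# from, to); the real option names, defaults, and help strings in
# wayback-machine-scraper may differ.
from argparse import ArgumentParser


def parse_args():
    parser = ArgumentParser(description='Mirror Wayback Machine snapshots of one or more domains.')
    parser.add_argument('domains', nargs='+', help='the domains to scrape')
    parser.add_argument('-o', '--output', default='website', help='directory to write the mirrored snapshots to')
    parser.add_argument('-a', '--allow', nargs='*', default=(), help='regex patterns for URLs to follow')
    parser.add_argument('-d', '--deny', nargs='*', default=(), help='regex patterns for URLs to skip')
    parser.add_argument('-u', '--unix', action='store_true', help='name snapshot files with Unix timestamps')
    parser.add_argument('-v', '--verbose', action='store_true', help='enable DEBUG logging and AutoThrottle debugging')
    parser.add_argument('-c', '--concurrency', type=float, default=10.0, help='target concurrency for AutoThrottle')
    # argparse stores --from under the dest 'from', which is why main() must
    # use getattr(args, 'from'). The YYYYMMDD defaults here are placeholders.
    parser.add_argument('-f', '--from', default='10000101', help='start of the snapshot time range (YYYYMMDD)')
    parser.add_argument('-t', '--to', default='30000101', help='end of the snapshot time range (YYYYMMDD)')
    return parser.parse_args()


# A guard for running this sketch directly; the published package presumably
# exposes main() as a console script instead.
if __name__ == '__main__':
    main()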