from pkg_resources import get_distribution
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from wayback_machine_scraper.mirror_spider import MirrorSpider


def main():
    # parse the command-line arguments (parse_args() is sketched below)
    # and collect the keyword arguments passed through to the spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    # configure the Scrapy crawler settings
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        # route every request through the Wayback Machine middleware
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        # let AutoThrottle adapt the request rate to server latency
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        # 'from' is a Python keyword, so the attribute is read with getattr()
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })
    # start the crawl and block until it finishes
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
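

# main() calls parse_args(), which the excerpt above does not show. The sketch
# below is a hypothetical reconstruction inferred only from the attributes that
# main() reads (domains, output, allow, deny, unix, verbose, concurrency,
# from, to); the real option names, defaults, and help strings in
# wayback-machine-scraper may differ.
from argparse import ArgumentParser


def parse_args():
    parser = ArgumentParser(description='Mirror Wayback Machine snapshots of one or more domains.')
    parser.add_argument('domains', nargs='+', help='the domains to scrape')
    parser.add_argument('-o', '--output', default='website', help='directory to write the mirrored snapshots to')
    parser.add_argument('-a', '--allow', nargs='*', default=(), help='regex patterns for URLs to follow')
    parser.add_argument('-d', '--deny', nargs='*', default=(), help='regex patterns for URLs to skip')
    parser.add_argument('-u', '--unix', action='store_true', help='name snapshot files with Unix timestamps')
    parser.add_argument('-v', '--verbose', action='store_true', help='enable DEBUG logging and AutoThrottle debugging')
    parser.add_argument('-c', '--concurrency', type=float, default=10.0, help='target concurrency for AutoThrottle')
    # argparse stores --from under the dest 'from', which is why main() must
    # use getattr(args, 'from'). The YYYYMMDD defaults here are placeholders.
    parser.add_argument('-f', '--from', default='10000101', help='start of the snapshot time range (YYYYMMDD)')
    parser.add_argument('-t', '--to', default='30000101', help='end of the snapshot time range (YYYYMMDD)')
    return parser.parse_args()


# A guard for running this sketch directly; the published package presumably
# exposes main() as a console script instead.
if __name__ == '__main__':
    main()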