def launch_crawlers(crawler_class, exclusion=None):
settings = get_settings()
configure_logging(settings=settings)
launcher = CrawlerRunner(settings)
crawlers = launcher.spider_loader.list()
    crawlers = [c for c in crawlers if crawler_class in c]
if exclusion:
for c in settings.get(exclusion, []):
crawlers.remove(c)
try:
for crawler in crawlers:
launcher.crawl(crawler)
d = launcher.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
return True
except Exception as e:
        launch_logger.error('Failed to launch crawlers | exception:\n{excep}'
                            .format(excep=e))
return False
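A minimal invocation sketch for the helper above; the substring 'news' and the EXCLUDED_SPIDERS settings key are hypothetical placeholders for whatever the project actually uses.

if __name__ == '__main__':
    # run every spider whose name contains 'news', skipping any names
    # listed under the (hypothetical) EXCLUDED_SPIDERS settings key
    ok = launch_crawlers('news', exclusion='EXCLUDED_SPIDERS')
    print('crawlers finished' if ok else 'crawler launch failed')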
def run():
configure_logging()
    # load the project settings, mainly so the project's middlewares are applied
settings = get_project_settings()
runner = CrawlerRunner(settings)
# running spiders sequentially (non-distributed)
@defer.inlineCallbacks
def crawl():
yield runner.crawl(IPTesterSpider)
yield runner.crawl(UATesterSpider)
reactor.stop()
crawl()
reactor.run() # block until the last call
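For comparison, a hedged sketch of scheduling the same two spiders concurrently instead of one after the other; runner.join() fires once every scheduled crawl has finished. It assumes IPTesterSpider and UATesterSpider are importable as in the snippet above.

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

def run_concurrent():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    # schedule both crawls up front; they run in the same reactor loop
    runner.crawl(IPTesterSpider)
    runner.crawl(UATesterSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()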
def crawl(args):
spids = args.get('spiders')
configure_logging(SETTINGS,
install_root_handler=False)
logging.getLogger('scrapy').setLevel(logging.WARNING)
runner = CrawlerRunner(SETTINGS)
loader = runner.spider_loader
if 'all' in spids:
spids = loader.list()
    known = set(loader.list())
    spiders = [loader.load(name) for name in spids if name in known]
if not spiders:
return False
random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
    logger.info('crawl reactor starting ...')
reactor.run()
    logger.info('crawl reactor stopped')
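A small usage sketch for crawl() above; it relies on the module-level SETTINGS and logger objects already referenced in the snippet.

if __name__ == '__main__':
    # 'all' expands to every spider known to the loader; a list of
    # concrete spider names works as well
    crawl({'spiders': ['all']})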
def make_crawler(spider_cls=ATestBaseSpider, **extra_settings):
# clean up queue before starting spider
assert spider_cls.name.startswith('test_'), 'pass a special test spider'
redis_server = redis.from_url('redis://localhost')
name = spider_cls.name
redis_server.delete(
SCHEDULER_DUPEFILTER_KEY % {'spider': name},
*redis_server.keys(
SCHEDULER_QUEUE_KEY % {'spider': name} + '*'))
settings = Settings()
settings.setmodule(dd_crawler.settings)
settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
settings.update(extra_settings)
runner = CrawlerRunner(settings)
return runner.create_crawler(spider_cls)
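A sketch of how the crawler returned by make_crawler() might be driven inside a test; the url keyword and the local test-server address are assumptions about what ATestBaseSpider accepts.

from twisted.internet import reactor

def run_test_crawl(url='http://localhost:8107'):
    crawler = make_crawler()
    # keyword arguments are forwarded to the spider's constructor
    d = crawler.crawl(url=url)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    # per-crawl counters live on the crawler's stats collector
    return crawler.stats.get_value('item_scraped_count')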
def test_spider(setting):
setting = setting.copy()
spid = str(uuid.uuid4())
setting['_id'] = spid
try:
cls = SpiderFactory.mkspider(setting)
except SpiderFactoryException as e:
logger.error('Error in test_spider SpiderFactory[%s]',
e)
return False
url = SETTINGS['TEMP_SPIDER_STATS_URL']
TEST_SETTINGS = {
'EXTENSIONS': {'mydm.extensions.ExtensionStats': 900,
'scrapy.extensions.logstats.LogStats': None,
'scrapy.extensions.spiderstate.SpiderState': None,
'scrapy.extensions.telnet.TelnetConsole': None, },
'SPIDER_STATS_URL': url,
'BOT_NAME': 'TestSpider',
'WEBSERVICE_ENABLED': False,
'TELNETCONSOLE_ENABLED': False,
'LOG_LEVEL': 'INFO',
'LOG_FORMAT': '%(asctime)s-%(levelname)s: %(message)s',
'LOG_DATEFORMAT': '%Y-%m-%d %H:%M:%S'
}
configure_logging(TEST_SETTINGS,
install_root_handler=False)
logging.getLogger('scrapy').setLevel(logging.WARNING)
runner = CrawlerRunner(TEST_SETTINGS)
d = runner.crawl(cls)
d.addBoth(lambda _: reactor.stop())
    logger.info('test_spider reactor starting ...')
reactor.run()
    logger.info('test_spider reactor stopped')
    stats = get_stats(url, [spid])
    n = stats[spid]
    return n > 0
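A simpler variant, as a sketch only rather than the project's approach: instead of publishing stats to a web endpoint and fetching them back, the scraped-item count can be read straight from the crawler's own stats collector.

from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

def test_spider_local(spider_cls, settings):
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spider_cls)
    d = crawler.crawl()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    # item_scraped_count is the standard Scrapy counter for yielded items
    return (crawler.stats.get_value('item_scraped_count') or 0) > 0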
def run(cls):
runner = CrawlerRunner(get_project_settings())
@defer.inlineCallbacks
def deferred_crawl():
for spider, args, kwargs in cls.queue:
try:
yield runner.crawl(spider, *args, **kwargs)
except KeyError as err:
# Log a warning if the scraper name is invalid instead of
# causing the job to fail.
# NOTE: If there is any other type of error, the job will
# fail, and all the jobs that depend on it will fail as
# well.
logger.warning(err.args[0])
# XXX: If all the names fail, then trying to run
# `reactor.stop()` will give an "Unhandled error in
# Deferred" complaint and hang. It will also hang in
# general if no spiders have been run. I assume there's
# some twisted-way to handle this, but for now, just log an
# error.
if reactor.running:
reactor.stop()
else:
logger.critical("LocalQueue: No valid scraper names found.")
deferred_crawl()
reactor.run()
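One possible guard for the hang described in the XXX note above, written as a standalone sketch rather than the author's fix: count the crawls that were actually scheduled and skip the reactor entirely when there are none. Unlike the queued version, the crawls here run concurrently and finish via runner.join().

import logging
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

log = logging.getLogger(__name__)

def run_queue(spider_queue):
    runner = CrawlerRunner(get_project_settings())
    scheduled = 0
    for spider, args, kwargs in spider_queue:
        try:
            # CrawlerRunner.crawl raises KeyError for unknown spider names
            runner.crawl(spider, *args, **kwargs)
            scheduled += 1
        except KeyError as err:
            log.warning(err.args[0])
    if not scheduled:
        log.critical('no valid scraper names found; nothing to run')
        return
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()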
def make_crawler(**extra_settings):
settings = Settings()
settings['ITEM_PIPELINES'] = {
'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
'tests.utils.CollectorPipeline': 100,
}
settings.update(extra_settings)
runner = CrawlerRunner(settings)
return runner.create_crawler(Spider)
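A brief usage sketch: the extra keyword settings override the defaults, which lets a test point the media pipeline at a throw-away directory. The assumption here is that CDRMediaPipeline honours the standard FILES_STORE setting.

crawler = make_crawler(FILES_STORE='/tmp/cdr-media-test')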
def make_crawler(settings, **extra_settings):
settings.update(extra_settings)
runner = CrawlerRunner(settings)
return runner.create_crawler(BaseSpider)
def run(self, args, opts):
conn = redis.Redis(decode_responses=True)
runner = CrawlerRunner(get_project_settings())
try:
rules = Rule.loads()
if not rules:
raise ValueError
except ValueError:
print('Error in loading Redis rules, fallback to CSV rules')
rules = Rule.loads('csv')
for rule in rules:
rule.save()
if rule.name in self.excludes:
continue
if conn.hget('Rule:' + rule.name, 'status') == 'started':
d = runner.crawl(ProxySpider, rule)
            # mark the rule as finished once its crawler stops; bind the
            # name now, otherwise the closure would only see the loop's
            # last rule
            d.addBoth(lambda _, name=rule.name: conn.hset(
                'Rule:' + name, 'status', 'finished'))
rule_maintainer = RuleMaintainer(conn, runner)
proxy_maintainer = ProxyMaintainer(conn)
schedule_maintainer = ScheduleMaintainer(conn)
lc = task.LoopingCall(rule_maintainer)
lc.start(1)
lc = task.LoopingCall(proxy_maintainer)
lc.start(0.5)
lc = task.LoopingCall(schedule_maintainer)
lc.start(10)
reactor.run()
def runTest(self):
settings = get_project_settings()
settings.set('SPIDER_MODULES', ['classes.spiders'])
try:
sys.path.append(scrapy_path)
runner = CrawlerRunner(settings)
spiders = runner.spider_loader.list()
self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
    except Exception:
        # the scrapy project may be unavailable in this environment; skip quietly
        pass
def make_crawler(settings, spider_cls=None, **extra_settings):
settings.update(extra_settings)
runner = CrawlerRunner(settings)
return runner.create_crawler(spider_cls or TestSpider)
def _run_spiders(ticker_list, start_date, end_date):
configure_logging()
runner = CrawlerRunner(settings=get_project_settings())
spider_dict = {
'symbols': ticker_list,
'start_date': start_date,
'end_date': end_date
}
runner.crawl(EdgarSpider, **spider_dict)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
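A hedged invocation sketch; the ticker symbols and the date-string format are placeholders, since the exact format EdgarSpider expects is not shown here.

if __name__ == '__main__':
    _run_spiders(['AAPL', 'MSFT'], '20160101', '20161231')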
def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)
@defer.inlineCallbacks
def crawl(**spargs):
yield runner.crawl(JDItemInfoSpider, **spargs)
yield runner.crawl(JDCommentSpider, **spargs)
reactor.stop()
crawl(**self.spargs)
reactor.run() # the script will block here until the last crawl call is finished