def default(self, o):
    """Return a JSON-serializable stand-in for ``o``.

    Handled types, checked in this order:

    * ``datetime.datetime`` -- formatted with ``DATE_FORMAT`` and
      ``TIME_FORMAT`` joined by a single space.
    * ``datetime.date`` -- formatted with ``DATE_FORMAT``.
    * ``datetime.time`` -- formatted with ``TIME_FORMAT``.
    * ``decimal.Decimal`` and ``defer.Deferred`` -- ``str(o)``.
    * ``BaseItem`` -- ``dict(o)``.
    * ``Request`` -- ``"<ClassName method url>"``.
    * ``Response`` -- ``"<ClassName status url>"``.
    * ``Crawler`` -- the result of ``o.stats.get_stats()``.

    Anything else is delegated to the parent class's ``default``.
    """
    # datetime.datetime is a subclass of datetime.date, so it has to be
    # tested first or it would be rendered without its time component.
    if isinstance(o, datetime.datetime):
        combined_format = "%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)
        return o.strftime(combined_format)
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)

    if isinstance(o, decimal.Decimal):
        return str(o)
    if isinstance(o, defer.Deferred):
        return str(o)

    if isinstance(o, BaseItem):
        return dict(o)

    # Requests and responses are summarized as short tags rather than
    # serialized in full.
    if isinstance(o, Request):
        return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
    if isinstance(o, Response):
        return "<%s %s %s>" % (type(o).__name__, o.status, o.url)

    if isinstance(o, Crawler):
        return o.stats.get_stats()

    # Unknown type: defer to the parent encoder's handling.
    return super(ScrapyJSONEncoder, self).default(o)
# Example source code using the Python class Crawler()
def run_spider():
    """Configure a crawler for ``JenkinsJobSpider``, start it and run the reactor.

    The settings register ``__main__.JsonWriterPipeline`` as an item
    pipeline and select ``BrowserLikeContextFactory`` as the downloader's
    client context factory. ``callback`` is connected to the
    ``spider_closed`` signal before the crawl is started.
    """
    pipelines = {
        '__main__.JsonWriterPipeline': 100
    }
    context_factory = 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'

    settings = Settings()
    settings.set('ITEM_PIPELINES', pipelines)
    # Enable remote server certificate verification; see
    # http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY', context_factory)

    # Uncomment the line below to enable logging for debugging.
    # configure_logging()
    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
def test_spider_output_handling(self):
    """Pass one instance of each item class through the scraper's output handling.

    There are no explicit assertions: the test succeeds when opening the
    spider, processing the three items and closing the spider raise nothing.
    """
    spider = self.MySpider()
    scraper = Scraper(Crawler(spider))
    scraper.open_spider(spider)
    for item_cls in (RssItem, ExtendableItem, RssedItem):
        # Only the output item is supplied; the remaining three positional
        # arguments are passed as None.
        scraper._process_spidermw_output(item_cls(), None, None, None)
    scraper.close_spider(spider)
def make_queue(redis_server, cls: type, slots=None, skip_cache=True, settings=None,
               hints=None) -> BaseRequestQueue:
    """Build a ``cls`` queue bound to a freshly created ``'test_dd_spider'`` spider.

    Args:
        redis_server: Passed to ``cls`` as ``server``.
        cls: Queue class to instantiate.
        slots: Passed to ``cls`` as ``slots_mock``; a new empty dict is used
            when this is ``None``.
        skip_cache: Passed through to ``cls`` unchanged.
        settings: Used both for ``configure_logging`` and for the crawler.
        hints: When truthy, assigned to the spider's ``hint_urls`` attribute.

    Returns:
        The instance produced by calling ``cls``.
    """
    global logging_configured
    # Logging is configured on the first call only; the module-level flag
    # records that it has already been done.
    if not logging_configured:
        configure_logging(settings=settings)
        logging_configured = True

    spider = Spider.from_crawler(Crawler(Spider, settings=settings), 'test_dd_spider')
    if hints:
        spider.hint_urls = hints

    return cls(
        server=redis_server,
        spider=spider,
        key=SCHEDULER_QUEUE_KEY,
        slots_mock={} if slots is None else slots,
        skip_cache=skip_cache,
    )