@classmethod
def from_crawler(cls, crawler):
instance = cls(crawler.stats)
crawler.signals.connect(instance.item_dropped,
signal=signals.item_dropped)
crawler.signals.connect(instance.item_scraped,
signal=signals.item_scraped)
crawler.signals.connect(instance.response_received,
signal=signals.response_received)
crawler.signals.connect(instance.response_downloaded,
signal=signals.response_downloaded)
crawler.signals.connect(instance.item_saved,
signal=mysignals.item_saved)
crawler.signals.connect(instance.item_saved_failed,
signal=mysignals.item_saved_failed)
crawler.signals.connect(instance.html_saved,
signal=mysignals.html_saved)
crawler.signals.connect(instance.html_saved_failed,
signal=mysignals.html_saved_failed)
crawler.signals.connect(instance.timeouterror,
signal=mysignals.timeouterror)
crawler.signals.connect(instance.dnslookuperror,
signal=mysignals.dnslookuperror)
return instance
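The `mysignals` module used above is not shown in the snippet. In Scrapy, custom signals are just unique sentinel objects that you declare yourself and fire through the crawler's signal manager; a minimal sketch of what that module and a sender might look like (only the signal names come from the snippet; the pipeline class and the `save_to_database()` helper are hypothetical):

# mysignals.py -- custom Scrapy signals are plain sentinel objects
item_saved = object()
item_saved_failed = object()
html_saved = object()
html_saved_failed = object()
timeouterror = object()
dnslookuperror = object()

# hypothetical pipeline that fires the custom signals when persisting items
from myproject import mysignals  # assumed project layout

class SaveItemPipeline(object):
    def process_item(self, item, spider):
        try:
            save_to_database(item)  # assumed helper, not part of the snippet
        except Exception:
            spider.crawler.signals.send_catch_log(
                signal=mysignals.item_saved_failed, item=item, spider=spider)
        else:
            spider.crawler.signals.send_catch_log(
                signal=mysignals.item_saved, item=item, spider=spider)
        return item

`send_catch_log()` delivers the signal to every connected receiver (such as the stats instance above) and logs any exception raised by a handler instead of propagating it.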
def __init__(self, crawler):
self.crawler = crawler
self.initiatives = 0
self.amendments = 0
self.finishtext = 0
self.responses = 0
self.members = 0
# connect the extension object to signals
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
def item_scraped(self, item, spider):
if isinstance(item, InitiativeItem):
self.initiatives += 1
elif isinstance(item, AmendmentItem):
self.amendments += 1
elif isinstance(item, FinishTextItem):
self.finishtext += 1
elif isinstance(item, ResponseItem):
self.responses += 1
    elif isinstance(item, MemberItem):
        self.members += 1
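The `__init__` above also connects `spider_closed`, but that handler is not shown; a plausible sketch, assuming it simply reports the per-type counters once the crawl ends:

def spider_closed(self, spider):
    # sketch: log the counters gathered by item_scraped() when the crawl ends
    spider.logger.info(
        "scraped %d initiatives, %d amendments, %d finish texts, %d responses, %d members",
        self.initiatives, self.amendments, self.finishtext, self.responses, self.members)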
def setup_redis(self):
"""Setup redis connection and idle signal.
This should be called after the spider has set its crawler object.
"""
if not self.redis_key:
self.redis_key = '%s:start_urls' % self.name
self.server = connection.from_settings(self.crawler.settings)
# idle signal is called when the spider has no requests left,
# that's when we will schedule new requests from redis queue
self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
self.log("Reading URLs from redis list '%s'" % self.redis_key)
def item_scraped(self, *args, **kwargs):
"""Avoids waiting for the spider to idle before scheduling the next request"""
self.schedule_next_request()
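`schedule_next_request()` and the `spider_idle` handler are not included in the snippet. In scrapy-redis they roughly pop URLs from the redis list and feed them back to the engine while keeping the spider alive; the sketch below is an approximation that matches the older APIs the snippet already uses (`connection.from_settings`, and Scrapy versions where `make_requests_from_url()` exists and `engine.crawl()` still accepts a spider argument):

from scrapy.exceptions import DontCloseSpider

def schedule_next_request(self):
    """Pop one URL from the redis list and hand it to the engine, if any."""
    url = self.server.lpop(self.redis_key)
    if url:
        if isinstance(url, bytes):
            url = url.decode('utf-8')
        self.crawler.engine.crawl(self.make_requests_from_url(url), spider=self)

def spider_idle(self):
    """Keep the spider alive as long as new URLs may still arrive in redis."""
    self.schedule_next_request()
    raise DontCloseSpider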
@classmethod
def from_crawler(cls, crawler):
ext = cls(crawler.stats)
crawler.signals.connect(ext.spider_opened,
signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed,
signal=signals.spider_closed)
crawler.signals.connect(ext.item_scraped,
signal=signals.item_scraped)
return ext
def item_scraped(self, item, spider):
pass
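An extension like this one (instantiated with `crawler.stats` in `from_crawler`) is activated through the EXTENSIONS setting rather than ITEM_PIPELINES; a minimal illustration, with a hypothetical dotted path:

# settings.py -- 'myproject.extensions.ItemScrapedExtension' is a hypothetical path
EXTENSIONS = {
    'myproject.extensions.ItemScrapedExtension': 500,
}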
@classmethod
def from_crawler(cls, crawler):
instance = cls(crawler.stats)
crawler.signals.connect(instance.item_dropped,
signal=signals.item_dropped)
crawler.signals.connect(instance.item_scraped,
signal=signals.item_scraped)
crawler.signals.connect(instance.response_received,
signal=signals.response_received)
crawler.signals.connect(instance.response_downloaded,
signal=signals.response_downloaded)
return instance
def item_scraped(self, item, spider):
    # called when an item has been scraped, after it has passed all the item pipeline stages
self.stats.inc_value('item/scraped', spider=spider)
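The counter incremented here can be read back through the same stats collector; a minimal sketch, assuming the extension also connected a `spider_closed` handler (the snippet above only connects the four signals shown):

def spider_closed(self, spider, reason):
    # sketch: report the counter maintained by item_scraped() above
    scraped = self.stats.get_value('item/scraped', 0, spider=spider)
    spider.logger.info("crawl finished (%s): %d items scraped", reason, scraped)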
@classmethod
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)            # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)            # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)                # item scraped and passed all pipelines
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)                # item dropped by a pipeline
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)              # spider opened for crawling
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)              # spider finished crawling
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)                  # spider has no pending requests
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)                # a spider callback raised an exception
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)      # request scheduled by the engine
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)          # request rejected by the scheduler
    crawler.signals.connect(pipeline.response_received, signals.response_received)      # response received by the engine
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded by the downloader
    return pipeline
def item_scraped(self, item, response, spider):
    """
    Called when an item has been scraped, after it has passed all the
    item pipeline stages without being dropped.
    :param item: the scraped item
    :param response: the response the item was scraped from
    :param spider: the spider that scraped the item
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"), 'Pipeline Signals: item_scraped')
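For a `from_crawler` hook like the one above to run at all, the pipeline still has to be enabled in the project settings; a typical entry, with a hypothetical dotted path:

# settings.py -- 'myproject.pipelines.SignalLoggingPipeline' is a hypothetical path
ITEM_PIPELINES = {
    'myproject.pipelines.SignalLoggingPipeline': 300,
}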