def spider_closed(self, spider):
    """Handle the spider_closed event to save the map"""
    # create the special marker for all the ads without geocode
    print "found %d items without geocode" % (len(self.no_geocode))
    if len(self.no_geocode) > 0:
        html = ""
        for x in self.no_geocode:
            html += "<a href=%s target=_blank>%s</a> : %s<br>" % (x["url"], x["title"], x["price"])
        iframe = folium.element.IFrame(html=html, width=500, height=100)
        popup = folium.Popup(iframe, max_width=500)
        folium.Marker(MAP_LATLNG,
                      popup=popup,
                      icon=folium.Icon()).add_to(self.m_map)
    print "found %d new items" % (self.new_items)
    pickle.dump(self.m_list, open(DATABASE, 'wb'))
    self.m_map.save('map.html')
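# The handler above references module-level names that this snippet does not
# show; a minimal sketch of what it assumes (values are placeholders, not taken
# from the original project):
import pickle
import folium

DATABASE = 'ads.pickle'           # pickle file that persists the list of seen ads
MAP_LATLNG = (45.5017, -73.5673)  # fallback coordinates for ads without a geocode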
def __init__(self, rule):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.rule = rule
    self.name = rule.name
    self.allowed_domains = rule.allowed_domains.split(',')
    self.start_urls = rule.start_urls.split(',')
    rule_list = []
    # If a "next page" XPath is configured, add a follow-only rule for pagination
    if len(rule.next_page):
        rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
    rule_list.append(Rule(LinkExtractor(
        allow=rule.allow_url.split(','),
        unique=True),
        follow=True,
        callback='parse_item'))
    self.rules = tuple(rule_list)
    super(ProxySpiderSpider, self).__init__()
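# The pydispatch-based hookup above still works in old Scrapy releases, but the
# documented way for a spider to connect its own signal handlers is the
# from_crawler classmethod; a minimal sketch (class and handler names are
# illustrative, not from the original project):
import scrapy
from scrapy import signals


class SignalSpider(scrapy.Spider):
    name = 'signal_spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        self.logger.info('spider opened: %s', spider.name)

    def spider_closed(self, spider):
        self.logger.info('spider closed: %s', spider.name)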
def from_crawler(cls, crawler):
    try:
        output_path = (
            crawler.settings.get('FEEDS_CONFIG')['feeds']['output_path']
        )
    except (KeyError, TypeError):
        output_path = 'output'
    try:
        output_url = (
            crawler.settings.get('FEEDS_CONFIG')['feeds']['output_url']
        )
    except (KeyError, TypeError):
        output_url = None
    pipeline = cls(output_path=output_path, output_url=output_url)
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
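# For context, the from_crawler above expects a nested settings structure along
# these lines (keys inferred from the lookups; values are placeholders):
FEEDS_CONFIG = {
    'feeds': {
        'output_path': 'output',
        'output_url': None,
    },
}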
def process_item(self, item, spider):
    if isinstance(item, AirbnbItem):
        self.room_count += 1
        if self.room_count > 100000:
            self.room_count = 0
            self.room_file_count += 1
            self.spider_closed(spider, mode=1)
            self.spider_opened(spider, mode=1)
        self.exporter_room.export_item(item)
    elif isinstance(item, UserItem):
        self.user_count += 1
        if self.user_count > 100000:
            self.user_count = 0
            self.user_file_count += 1
            self.spider_closed(spider, mode=2)
            self.spider_opened(spider, mode=2)
        self.exporter_user.export_item(item)
    else:
        logger.info('Some error happened!')
def run_spider():
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # enable remote server certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory')
    # uncomment the line below to enable logging for debugging
    # configure_logging()
    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
def spider_closed(spider):
    spider_stats[spider.name] = {
        'finish_reason': spider.crawler.stats.get_value('finish_reason'),
        'duration': (
            spider.crawler.stats.get_value('finish_time') -
            spider.crawler.stats.get_value('start_time')).total_seconds(),
        'item_scraped_count':
            spider.crawler.stats.get_value('item_scraped_count'),
    }
    print("Spider %s closed (%s) after %0.1f sec, %d items" % (
        spider.name,
        spider.crawler.stats.get_value('finish_reason'),
        (spider.crawler.stats.get_value('finish_time') -
         spider.crawler.stats.get_value('start_time')).total_seconds(),
        spider.crawler.stats.get_value('item_scraped_count') or 0,
    ))
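# The module-level handler above fills a spider_stats dict; a sketch of how it
# could be wired up when running crawls programmatically (SomeSpider and the
# spider_stats initialisation are assumptions, not shown in the original):
from scrapy import signals
from scrapy.crawler import CrawlerProcess

spider_stats = {}

process = CrawlerProcess()
crawler = process.create_crawler(SomeSpider)  # SomeSpider is a placeholder spider class
crawler.signals.connect(spider_closed, signal=signals.spider_closed)
process.crawl(crawler)
process.start()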
def __init__(self):
    dispatcher.connect(self.spider_opended, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
    dispatcher.connect(self.engine_started, signals.engine_started)
    # current working directory of the scrapy_site project
    self.curpath = os.getcwd()
    # directory where per-spider message files are written
    self.spidername_filepath = self.curpath + "/scrapy_site/msg/"
    # keywords loaded from keyword.conf
    self.keywordsDict = dict()
    self.getKeywords()
    # website names
    self.webnamesDict = dict()
    self.getWebnames()
    # collected messages
    self.msgDict = dict()
    SavePipeline.initCount = SavePipeline.initCount + 1
def __init__(self):
    self.fail_urls = []
    dispatcher.connect(self.handle_spider_cosed, signals.spider_closed)
def __init__(self, settings):
    self.options = settings.get('PHANTOMJS_OPTIONS', {})  # PhantomJS options
    max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of concurrent PhantomJS instances, default 10
    self.sem = defer.DeferredSemaphore(max_run)
    self.queue = Queue.LifoQueue(maxsize=max_run)  # LifoQueue: last in, first out
    SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
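# The PhantomJS download-handler __init__ above assumes imports roughly like
# these (Python 2 naming for Queue; module paths may differ by Scrapy version):
import Queue  # 'queue' on Python 3

from twisted.internet import defer
from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.xlib.pydispatch import dispatcher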
def __init__(self, *a, **kw):
    super(StackSpider, self).__init__(*a, **kw)
    self.time = datetime.datetime.now()
    self.congress = Congress()
    self.members = self.congress.searchAll("diputados")
    self.groups = self.congress.searchAll("grupos")
    dispatcher.connect(self.whenFinish, signals.spider_closed)
def __init__(self, crawler):
    self.crawler = crawler
    self.initiatives = 0
    self.amendments = 0
    self.finishtext = 0
    self.responses = 0
    self.members = 0
    # connect the extension object to signals
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
def spider_closed(self, spider):
    self.crawler.stats.set_value('item/initiatives', self.initiatives)
    self.crawler.stats.set_value('item/amendments', self.amendments)
    self.crawler.stats.set_value('item/finishtext', self.finishtext)
    self.crawler.stats.set_value('item/responses', self.responses)
    self.crawler.stats.set_value('item/members', self.members)
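# An extension like the one above is normally activated through the EXTENSIONS
# setting; the module path and priority here are illustrative, not from the
# original project:
EXTENSIONS = {
    'myproject.extensions.ItemCountExtension': 500,
}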
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()
def __init__(self, *a, **kw):
    """Attach a callback to the spider_closed signal"""
    super(Kijiji, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    if USE_DB is True:
        self.open_database()
    if DRAW_ALL_DB is True and DRAW_NEW_AD_ONLY is False:
        # add markers for already-known ads
        for x in self.m_list:
            self.add_marker(x, False)
def __init__(self, *a, **kw):
    super(TianqiSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.sql = SqlHelper()
    self.weather_table_name = config.weather_table
    self.citys = []
    self.init()
def spider_closed(self, spider):
    self.log('spider_closed: finished crawling weather data')
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    for exporter in self.exporters.values():
        exporter.finish_exporting()
    for file in self.files:
        file.close()
def from_crawler(cls, crawler):
    ext = cls(crawler.stats)
    crawler.signals.connect(ext.spider_opened,
                            signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed,
                            signal=signals.spider_closed)
    crawler.signals.connect(ext.item_scraped,
                            signal=signals.item_scraped)
    return ext
def spider_closed(self, spider):
    value = self.stats.get_value('item_scraped_count', 0)
    save_stats(spider.settings['SPIDER_STATS_URL'],
               spider._id,
               value)
    if spider.settings['BOT_NAME'] != 'TestSpider':
        logger.info('spider[%s] crawled %d articles',
                    spider.name,
                    value)
        if value == 0:
            update_spider_stats(spider, {'fail': 1})
def spider_closed(self, spider):
    print "spider is closed!"
    session = loadSession()
    # look up the open crawl-log entry for this spider (endTime still NULL)
    log = session.query(SpiderCrawlLog).filter(
        SpiderCrawlLog.spiderID == self.rule.id,
        SpiderCrawlLog.endTime == None
    ).first()
    log.endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    session.commit()
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    file = self.files.pop(spider.name)
    file.close()
def __init__(self):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
    # drop the trailing ",\n" left by the last item, then close the JSON array
    self.file.seek(-2, os.SEEK_END)
    self.file.truncate()
    self.file.write(']')
    self.file.close()
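# The seek(-2)/truncate above assumes each item was written followed by ',\n'
# into a file that starts with '['; a minimal sketch of the matching open/write
# side under that assumption (file name is illustrative; Python 3 would need
# binary mode for the end-relative seek):
import json


class JsonArrayWriterPipeline(object):

    def spider_opened(self, spider):
        self.file = open('items.json', 'w')
        self.file.write('[')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item)) + ',\n')
        return item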
def from_crawler(cls, crawler):
    m = cls()
    if not crawler.settings.getbool('SELENIUM_ENABLED'):
        raise NotConfigured()
    crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
    return m
def spider_closed(self, spider):
    self.driver.close()
def spider_closed(self, spider):
    # Add feed header(s) at the end so they can be dynamic.
    for feed_header in iterate_spider_output(spider.feed_headers()):
        self._exporters[spider].export_item(feed_header)
    self._exporters[spider].finish_exporting()
    self._exporters.pop(spider)
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
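# Taken together, the recurring from_crawler boilerplate in this listing usually
# pairs with spider_opened/spider_closed handlers like these; a minimal,
# self-contained sketch (class and file naming are illustrative):
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class JsonLinesExportPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('%s_items.jl' % spider.name, 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item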