def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600)
    )
    crawler.signals.connect(mw.engine_started,
                            signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped,
                            signal=signals.engine_stopped)
    return mw
Python engine_stopped() usage examples
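Every snippet on this page hooks a handler onto Scrapy's engine_started / engine_stopped signals, either via crawler.signals.connect() inside from_crawler() (the recommended route) or via the legacy pydispatch dispatcher.connect(). For reference, a minimal self-contained extension using the from_crawler pattern might look like the sketch below (the class and handler names are illustrative, not taken from any of the projects listed here):

from scrapy import signals

class EngineLifecycleExtension(object):
    """Minimal sketch: react to engine start/stop."""

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        # register handlers for the engine lifecycle signals
        crawler.signals.connect(ext.engine_started, signal=signals.engine_started)
        crawler.signals.connect(ext.engine_stopped, signal=signals.engine_stopped)
        return ext

    def engine_started(self):
        self.stats.set_value('engine/started', True)

    def engine_stopped(self):
        # fires once the engine has fully shut down
        self.stats.set_value('engine/stopped', True)

Such an extension would be enabled through the EXTENSIONS setting of the project.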
def __init__(self):
    dispatcher.connect(self.spider_opended, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
    dispatcher.connect(self.engine_started, signals.engine_started)
    # current working directory of the scrapy_site project
    self.curpath = os.getcwd()
    # directory holding the per-spider message files
    self.spidername_filepath = self.curpath + "/scrapy_site/msg/"
    # keywords loaded from keyword.conf
    self.keywordsDict = dict()
    self.getKeywords()
    # website names
    self.webnamesDict = dict()
    self.getWebnames()
    # scraped messages
    self.msgDict = dict()
    SavePipeline.initCount = SavePipeline.initCount + 1
def __init__(self):
    self.conn = None
    dispatcher.connect(self.initialize, signals.engine_started)
    dispatcher.connect(self.finalize, signals.engine_stopped)
def __init__(self, crawler):
    if not crawler.settings.getbool('JSONRPC_ENABLED'):
        raise NotConfigured
    self.crawler = crawler
    logfile = crawler.settings['JSONRPC_LOGFILE']
    self.portrange = [int(x) for x in crawler.settings.getlist('JSONRPC_PORT', [6023, 6073])]
    self.host = crawler.settings.get('JSONRPC_HOST', '127.0.0.1')
    root = RootResource(crawler)
    root.putChild('crawler', CrawlerResource(self.crawler))
    # root.putChild('spidercls', CrawlerResource(self.crawler.__dict__['spidercls']))
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    crawler.signals.connect(self.start_listening, signals.engine_started)
    crawler.signals.connect(self.stop_listening, signals.engine_stopped)
pipelines.py (project: Android-Repackaged-App-Detection-System, author: M157q)
def __init__(self):
    self.filename += settings.MARKET_NAME
    self.filename += ".db"
    self.filename = path.join(settings.DATABASE_DIR, self.filename)
    print(self.filename)
    self.conn = None
    dispatcher.connect(self.initialize, signals.engine_started)
    dispatcher.connect(self.finalize, signals.engine_stopped)
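The initialize and finalize handlers connected above are not part of this listing. Assuming the pipeline writes to the SQLite file assembled from settings.DATABASE_DIR and MARKET_NAME, they would plausibly open and close the connection around the crawl, roughly along these lines (the table name and schema are made up for illustration):

import sqlite3

def initialize(self):
    # open the SQLite database when the engine starts
    self.conn = sqlite3.connect(self.filename)
    self.conn.execute(
        "CREATE TABLE IF NOT EXISTS items (id INTEGER PRIMARY KEY, data TEXT)")

def finalize(self):
    # flush pending writes and release the connection when the engine stops
    if self.conn is not None:
        self.conn.commit()
        self.conn.close()
        self.conn = None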
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)            # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)            # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)                # item scraped successfully
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)                # item dropped by a pipeline
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)              # spider opened
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)              # spider closed
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)                  # spider has no more requests
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)                # spider callback raised an error
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)      # request scheduled
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)          # request dropped by the scheduler
    crawler.signals.connect(pipeline.response_received, signals.response_received)      # response received
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded
    return pipeline
def engine_stopped(self):
    """
    Called when the engine stops.
    :return:
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"), 'Pipeline Signals: engine_stopped')
def __init__(self):
    self.is_running = False
    dispatcher.connect(self.pause_crawler, signals.engine_stopped)
    self.setting = get_project_settings()
    self.process = None
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.items = {}
    self.found = {}
    dispatcher.connect(self.init_scrapy, signals.engine_started)
    dispatcher.connect(self.close_scrapy, signals.engine_stopped)
def __init__(self, asin, daily=0, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.asin = asin
    self.last_review = 0
    self.profile_update_self = False  # whether this spider updates the profile itself
    self.updated = False              # whether the profile has already been updated
    self.daily = True if int(daily) == 1 else False  # whether this is a daily crawl
    self.start_urls = [
        'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=three_star' % self.asin,
        'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=two_star' % self.asin,
        'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=one_star' % self.asin
    ]
    dispatcher.connect(self.update_profile_self, signals.engine_stopped)
    dispatcher.connect(self.init_profile, signals.engine_started)
def webdriver(self):
    """Return the webdriver instance, instantiate it if necessary."""
    if self._webdriver is None:
        short_arg_classes = (webdriver.Firefox, webdriver.Ie)
        if issubclass(self._browser, short_arg_classes):
            cap_attr = 'capabilities'
        else:
            cap_attr = 'desired_capabilities'
        options = self._options
        options[cap_attr] = self._desired_capabilities
        self._webdriver = self._browser(**options)
        self._webdriver.set_window_size(settings.DRIVER_WINDOW_WIDTH, settings.DRIVER_WINDOW_HEIGHT)
        self._webdriver.set_page_load_timeout(self.crawler.settings.get('DOMAIN_TIMEOUT', 30))
        self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
    return self._webdriver
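The _cleanup callback registered on engine_stopped above is not shown in this listing; presumably it tears the browser down once the crawl finishes. A sketch under that assumption:

def _cleanup(self):
    # quit the Selenium browser when the Scrapy engine stops
    if self._webdriver is not None:
        try:
            self._webdriver.quit()
        finally:
            self._webdriver = None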
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.items = {}
    self.found = {}
    self.keyword_pool = {}
    self.store_poll = {}
    self.store_date = {}
    dispatcher.connect(self.init_scrapy, signals.engine_started)
    dispatcher.connect(self.close_scrapy, signals.engine_stopped)
def __init__(self, signal_manager, app):
    super(_QApplicationStopper, self).__init__()
    self._qapplication = weakref.ref(app)
    self.signals = signal_manager
    self.signals.connect(self, signal=signals.engine_stopped, weak=False)
def __call__(self):
    self.signals.disconnect(self, signals.engine_stopped)
    app = self._qapplication()
    if app is not None:
        app.quit()
def engine_stopped():
    if QApplication.instance():
        QApplication.instance().quit()
def engine_stopped(self):
    if self.log_task.running:
        self.log_task.stop()
    if self.reanimate_task.running:
        self.reanimate_task.stop()
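This handler stops two Twisted LoopingCall tasks when the engine shuts down; the matching engine_started handler, not reproduced on this page, presumably creates and starts those tasks at the configured intervals. A rough sketch (the callback names and the reanimate_interval attribute are assumptions):

from twisted.internet import task

def engine_started(self):
    # periodically log proxy statistics
    self.log_task = task.LoopingCall(self.log_stats)
    self.log_task.start(self.logstats_interval, now=True)
    # periodically return dead proxies to the rotation
    self.reanimate_task = task.LoopingCall(self.reanimate_proxies)
    self.reanimate_task.start(self.reanimate_interval, now=False)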
def engine_stopped(self):
    print('Pipeline initCount ======== %s' % SavePipeline.initCount)