def spider_opened(self, spider):
    try:
        file = open(spider.settings.get('FEED_FILE'), 'wb')
    except TypeError:
        raise NotConfigured('FEED_FILE parameter is not a string or does not exist')
    except (IOError, OSError) as e:
        raise CloseSpider('Cannot open file {}: {}'.format(spider.settings.get('FEED_FILE', None), e))
    self.files[spider] = file
    feed_title = spider.settings.get('FEED_TITLE')
    if not feed_title:
        raise NotConfigured('FEED_TITLE parameter does not exist')
    feed_link = spider.settings.get('FEED_LINK')
    if not feed_link:
        raise NotConfigured('FEED_LINK parameter does not exist')
    feed_description = spider.settings.get('FEED_DESCRIPTION')
    if feed_description is None:
        raise NotConfigured('FEED_DESCRIPTION parameter does not exist')
    feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
    if isinstance(feed_exporter, six.string_types):
        feed_exporter = load_object(feed_exporter)
    if not issubclass(feed_exporter, RssItemExporter):
        raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(feed_exporter))
    self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
    self.exporters[spider].start_exporting()
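For context, a minimal settings sketch for the exporter pipeline above; the pipeline class path and the values are illustrative assumptions, not taken from the source.

# Hypothetical settings.py fragment for the spider_opened() pipeline above.
# The pipeline class path 'myproject.pipelines.RssExportPipeline' is an assumption.
ITEM_PIPELINES = {'myproject.pipelines.RssExportPipeline': 900}

FEED_FILE = 'feed.rss'                      # opened in binary write mode by the pipeline
FEED_TITLE = 'Example feed'                 # required: NotConfigured if missing
FEED_LINK = 'https://example.com/'          # required: NotConfigured if missing
FEED_DESCRIPTION = 'Scraped items as RSS'   # required, but an empty string is accepted
# FEED_EXPORTER = 'myproject.exporters.CustomRssItemExporter'  # optional; must subclass RssItemExporter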
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
    )
    crawler.signals.connect(mw.engine_started,
                            signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped,
                            signal=signals.engine_stopped)
    return mw
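A hedged configuration sketch for the rotating-proxy middleware above; the middleware path and proxy URLs are placeholders, and the values mirror the defaults read in from_crawler.

# Hypothetical settings.py fragment; the DOWNLOADER_MIDDLEWARES path is an assumption.
DOWNLOADER_MIDDLEWARES = {'rotating_proxies.middlewares.RotatingProxyMiddleware': 610}

ROTATING_PROXY_LIST = [
    'http://proxy1.example.com:8000',
    'http://proxy2.example.com:8031',
]
# ...or load the list from a file instead (one proxy per line):
# ROTATING_PROXY_LIST_PATH = '/path/to/proxies.txt'

ROTATING_PROXY_LOGSTATS_INTERVAL = 30   # seconds between stats log lines
ROTATING_PROXY_CLOSE_SPIDER = False     # stop the spider when no proxies are left
ROTATING_PROXY_PAGE_RETRY_TIMES = 5     # max proxies to try per request
ROTATING_PROXY_BACKOFF_BASE = 300       # seconds
ROTATING_PROXY_BACKOFF_CAP = 3600       # seconds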
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create your spiders.
    # s = cls()
    # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    # return s
    user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
    if not user_agents:
        raise NotConfigured("USER_AGENT_CHOICES not set or empty")
    o = cls(user_agents)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    return o
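One possible way to wire the user-agent middleware above; the middleware class path is a placeholder and the USER_AGENT_CHOICES entries are only examples.

# Hypothetical settings.py fragment; the middleware path is an assumption.
DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.RandomUserAgentMiddleware': 400}

USER_AGENT_CHOICES = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0',
]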
def from_crawler(cls, crawler):
    m = cls()
    if not crawler.settings.getbool('SELENIUM_ENABLED'):
        raise NotConfigured()
    crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
    return m
def from_crawler(cls, crawler):
    try:
        return cls(crawler)
    except Exception as e:
        raise NotConfigured('WEBDRIVER_BROWSER is misconfigured: %r (%r)'
                            % (crawler.settings.get('WEBDRIVER_BROWSER'), e))
def test_empty_feed(self):
    for partial_settings in itertools.chain.from_iterable(
            itertools.combinations(self.feed_settings.items(), r)
            for r in range(1, len(self.feed_settings))):
        partial_settings = dict(partial_settings)
        undefined_settings = [name.upper()
                              for name in set(self.feed_settings) - set(partial_settings)]
        with self.assertRaisesRegexp(
                NotConfigured,
                '({})'.format('|'.join(undefined_settings))
                if len(undefined_settings) > 1 else undefined_settings[0],
                msg='The feed file, title, link and description must be specified, '
                    'but the absence of {} is allowed'.format(undefined_settings)):
            with CrawlerContext(**partial_settings):
                pass

    with self.assertRaises(CloseSpider):
        feed_settings = dict(self.feed_settings)
        feed_settings['feed_file'] = 'non/existent/filepath'
        with CrawlerContext(**feed_settings):
            pass

    with CrawlerContext(**self.feed_settings):
        pass
    with open(self.feed_settings['feed_file']) as data, \
            open(os.path.join(os.path.dirname(__file__),
                              'expected_rss', 'empty_feed.rss')) as expected:
        self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
def __init__(self):
    # Open database connection
    self.db = mysql.connect(host=ROJAK_DB_HOST, port=ROJAK_DB_PORT,
                            user=ROJAK_DB_USER, passwd=ROJAK_DB_PASS,
                            db=ROJAK_DB_NAME)
    self.cursor = self.db.cursor()
    self.media = {}
    try:
        # Get media information from the database
        self.logger.info('Fetching media information')
        self.cursor.execute(sql_get_media, [self.name])
        row = self.cursor.fetchone()
        self.media['id'] = row[0]
        self.media['last_scraped_at'] = row[1]
    except mysql.Error as err:
        self.logger.error('Unable to fetch media data: %s', err)
        raise NotConfigured('Unable to fetch media data: %s' % err)

    if ROJAK_SLACK_TOKEN != '':
        self.is_slack = True
        self.slack = Slacker(ROJAK_SLACK_TOKEN)
    else:
        self.is_slack = False
        self.logger.info('Post error to #rojak-pantau-errors is disabled')

    # Capture the signal spider_opened and spider_closed
    # https://doc.scrapy.org/en/latest/topics/signals.html
def start_requests(self):
    if not self.page_clf and self.settings.get(
            'QUEUE_MAX_RELEVANT_DOMAINS'):
        raise NotConfigured('Pass page_clf to spider')
    for request in super().start_requests():
        request.priority = self.initial_priority
        if self.queue is not None:
            self.queue.push(request)
        else:
            yield request
def from_crawler(cls, crawler) -> 'RequestLogMiddleware':
    log_path = crawler.settings.get('RESPONSE_LOG_FILE')
    if not log_path:
        raise NotConfigured('RESPONSE_LOG_FILE not defined')
    jl_logger = get_jl_logger(log_path)
    threshold = crawler.settings.getfloat('PAGE_RELEVANCY_THRESHOLD', 0.5)
    return cls(jl_logger=jl_logger, relevancy_threshold=threshold)
def from_crawler(cls, crawler):
    if crawler.settings.getbool('DOMAIN_LIMIT'):
        log_path = crawler.settings.get('RESPONSE_LOG_FILE')
        if not log_path:
            raise NotConfigured('RESPONSE_LOG_FILE not defined')
        mw = cls(get_jl_logger(log_path))
        crawler.signals.connect(mw.on_queues_changed, queues_changed)
        return mw
def from_crawler(cls, crawler):
    path_segments = crawler.settings.getint('MAX_DUPLICATE_PATH_SEGMENTS')
    query_segments = crawler.settings.getint('MAX_DUPLICATE_QUERY_SEGMENTS')
    if not (path_segments or query_segments):
        raise NotConfigured()
    return cls(path_segments, query_segments, crawler.stats)
def from_crawler(cls, crawler):
    splash_base_url = crawler.settings.get('SPLASH_URL',
                                           cls.default_splash_url)
    log_400 = crawler.settings.getbool('SPLASH_LOG_400', True)
    slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY',
                                       cls.default_policy)
    if slot_policy not in SlotPolicy._known:
        raise NotConfigured("Incorrect slot policy: %r" % slot_policy)
    return cls(crawler, splash_base_url, slot_policy, log_400)
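A hedged settings sketch for the Splash middleware above, assuming a locally running Splash instance; the slot policy value must be one of SlotPolicy._known, and 'per_domain' is used here only as an assumed example.

# Hypothetical settings.py fragment for the Splash middleware above.
SPLASH_URL = 'http://localhost:8050'   # falls back to cls.default_splash_url if unset
SPLASH_LOG_400 = True                  # log Splash 400 responses
SPLASH_SLOT_POLICY = 'per_domain'      # must be known to SlotPolicy, else NotConfigured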