def process_options(self, args, opts):
    try:
        self.settings.setdict(arglist_to_dict(opts.set),
                              priority='cmdline')
    except ValueError:
        raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

    if opts.logfile:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

    if opts.loglevel:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

    if opts.nolog:
        self.settings.set('LOG_ENABLED', False, priority='cmdline')

    if opts.pidfile:
        with open(opts.pidfile, "w") as f:
            f.write(str(os.getpid()) + os.linesep)

    if opts.pdb:
        failure.startDebugMode()
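
The arglist_to_dict helper referenced above comes from scrapy.utils.conf and turns the repeated -s NAME=VALUE pairs into a plain dict before they are applied as command-line settings. A minimal sketch of the idea (illustrative only, not the exact Scrapy implementation):

def arglist_to_dict_sketch(arglist):
    """Turn ['NAME=VALUE', ...] into {'NAME': 'VALUE', ...}.

    Illustrative approximation of scrapy.utils.conf.arglist_to_dict.
    A pair without '=' makes dict() raise ValueError, which the command
    above turns into the 'Invalid -s value' UsageError.
    """
    return dict(x.split('=', 1) for x in arglist)

# Example:
# arglist_to_dict_sketch(['LOG_LEVEL=INFO', 'DOWNLOAD_DELAY=0.5'])
# -> {'LOG_LEVEL': 'INFO', 'DOWNLOAD_DELAY': '0.5'}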
def process_options(self, args, opts):
    ScrapyCommand.process_options(self, args, opts)
    try:
        opts.spargs = arglist_to_dict(opts.spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
    if opts.output:
        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')
        feed_exporters = without_none_values(
            self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()
        if not opts.output_format:
            opts.output_format = os.path.splitext(opts.output)[1].replace(".", "")
        if opts.output_format not in valid_output_formats:
            raise UsageError("Unrecognized output format '%s', set one"
                             " using the '-t' switch or as a file extension"
                             " from the supported list %s" % (opts.output_format,
                                                              tuple(valid_output_formats)))
        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
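
The without_none_values call above (from scrapy.utils.python) drops feed exporters that the settings disable by mapping them to None, so only usable formats count as valid -t values. A rough sketch of its behaviour for dict inputs:

def without_none_values_sketch(mapping):
    # Illustrative approximation for dict inputs: keep only entries whose
    # value is not None. The real helper is scrapy.utils.python.without_none_values,
    # which also handles non-dict iterables.
    return {k: v for k, v in mapping.items() if v is not None}

# Example: {'json': 'JsonItemExporter', 'marshal': None} -> {'json': 'JsonItemExporter'}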
def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    command = args[0]
    arguments = _parse_arguments(opts.args)
    spider = ExternalSpider('StreamingSpider', command, arguments)
    loader = ExternalSpiderLoader.from_settings(self.settings, load_spiders=False)
    loader.crawl(spider)
def run(self, args, opts):
    if len(args) != 4:
        raise UsageError()
    spider_name, url, login, password = args
    crawler = self.crawler_process.create_crawler(spider_name)
    scheduler = Scheduler.from_settings(self.settings)
    spider = crawler.spidercls.from_crawler(crawler)
    scheduler.open(spider)
    add_login(spider, url, login, password, queue=scheduler.queue)
def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    crawler = self.crawler_process.create_crawler(args[0])
    scheduler = Scheduler.from_settings(self.settings)
    spider = crawler.spidercls.from_crawler(crawler)
    scheduler.open(spider)
    stats = scheduler.queue.get_stats()
    print('\nQueue size: {len}, domains: {n_domains}\n'.format(**stats))
    print_top = 10
    printed_count = 0
    queues = stats['queues']
    print('{:<50}\tCount\tScore'.format('Domain'))
    for queue, score, count in queues[:print_top]:
        printed_count += count
        domain = queue.rsplit(':')[-1]
        print('{:<50}\t{}\t{:.0f}'.format(domain, count, score))
    others_count = sum(count for _, _, count in queues) - printed_count
    if others_count:
        print('...')
        print('{:<50}\t{}'.format(
            'other {}:'.format(len(queues) - print_top), others_count))
    print()
    if opts.output:
        with open(opts.output, 'w') as f:
            json.dump(stats, f,
                      ensure_ascii=False, indent=True, sort_keys=True)
        print('Stats dumped to {}'.format(opts.output))
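
For reference, the reporting code above assumes get_stats() returns a mapping with at least 'len', 'n_domains' and 'queues' keys, where 'queues' is a sequence of (queue_key, score, count) triples. A hypothetical value illustrating that shape (the real key layout depends on the queue implementation):

stats = {
    'len': 1200,        # total number of queued requests
    'n_domains': 3,     # number of per-domain queues
    'queues': [         # (queue key, score, request count)
        ('requests:example.com', 0.9, 700),
        ('requests:example.org', 0.5, 400),
        ('requests:example.net', 0.1, 100),
    ],
}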
def run(self, args, opts):
    if not args:
        raise UsageError()
    if len(args) == 1 and '*' in args[0]:
        # paths were not expanded (docker)
        filenames = glob.glob(args[0])
    else:
        filenames = args
    del args
    filtered_filenames = [
        f for f in filenames
        if re.match(r'[a-z0-9]{12}\.csv$', os.path.basename(f))]
    filenames = filtered_filenames or filenames
    if not filenames:
        raise UsageError()
    response_logs = []
    for filename in filenames:
        with json_lines.open(filename) as f:
            response_logs.append(pd.DataFrame(f))
    print('Read data from {} files'.format(len(filenames)))
    all_rpms = [rpms for rpms in (
        get_rpms(name, rlog, step=opts.step, smooth=opts.smooth)
        for name, rlog in zip(filenames, response_logs))
        if rpms is not None]
    if all_rpms:
        print_rpms(all_rpms, opts)
    print_scores(response_logs, opts)
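
The internals of get_rpms are not shown here; conceptually it turns a response log into a requests-per-minute series. A purely illustrative sketch, assuming each log row carries a numeric 'time' timestamp column (the actual helper used above may differ):

import pandas as pd  # assumed, matching the pd alias used above

def rpm_sketch(response_log, step=60):
    # Bucket response timestamps into windows of `step` seconds and scale
    # the per-window counts to a per-minute rate. Hypothetical stand-in
    # for get_rpms, not the project's implementation.
    t = response_log['time']
    buckets = ((t - t.min()) // step).astype(int)
    return buckets.value_counts().sort_index() * (60.0 / step)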
def set_pages(self, pages):
    if len(pages) == 0:
        begin_page = 1
        end_page = 999999
    else:
        begin_page = pages[0]
        end_page = pages[1]
    if begin_page <= 0:
        raise UsageError("The begin page number must be at least 1!")
    if begin_page > end_page:
        raise UsageError("The end page number must not be less than the begin page number!")
    self.settings.set('BEGIN_PAGE', begin_page, priority='cmdline')
    self.settings.set('END_PAGE', end_page, priority='cmdline')
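
As a usage note: with no pages given the command crawls from page 1 up to the 999999 sentinel, otherwise the pair is taken as an inclusive begin/end range and validated as above. Hypothetical calls, as they would appear inside run():

self.set_pages([])        # no range given: pages 1..999999
self.set_pages([2, 10])   # crawl pages 2 through 10
self.set_pages([0, 5])    # UsageError: begin page must be at least 1
self.set_pages([7, 3])    # UsageError: end page before begin page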
def run(self, args, opts):
    self.set_pages(opts.pages)
    self.settings.set('GOOD_ONLY', opts.good_only)
    self.settings.set('SEE_LZ', opts.see_lz)
    if opts.filter:
        try:
            opts.filter = eval('filter.' + opts.filter)
        except:
            raise UsageError("Invalid filter function name!")
    self.settings.set("FILTER", opts.filter)
    cfg = config.config()
    if len(args) >= 3:
        raise UsageError("Too many arguments!")
    self.settings.set('MYSQL_HOST', cfg.config['MYSQL_HOST'])
    self.settings.set('MYSQL_USER', cfg.config['MYSQL_USER'])
    self.settings.set('MYSQL_PASSWD', cfg.config['MYSQL_PASSWD'])
    tbname = cfg.config['DEFAULT_TIEBA']
    if len(args) >= 1:
        tbname = args[0]
    # Python 2: normalize the forum (tieba) name to a UTF-8 byte string
    if isinstance(tbname, unicode):
        tbname = tbname.encode('utf8')
    # look up the database configured for this forum name
    dbname = None
    for key in cfg.config['MYSQL_DBNAME'].keys():
        if key.encode('utf8') == tbname:
            dbname = cfg.config['MYSQL_DBNAME'][key]
    if len(args) >= 2:
        dbname = args[1]
        cfg.config['MYSQL_DBNAME'][tbname.decode('utf8')] = dbname
    if not dbname:
        raise UsageError("Please input the database name!")
    self.settings.set('TIEBA_NAME', tbname, priority='cmdline')
    self.settings.set('MYSQL_DBNAME', dbname, priority='cmdline')
    config.init_database(cfg.config['MYSQL_HOST'], cfg.config['MYSQL_USER'],
                         cfg.config['MYSQL_PASSWD'], dbname)
    log = config.log(tbname, dbname, self.settings['BEGIN_PAGE'],
                     opts.good_only, opts.see_lz)
    self.settings.set('SIMPLE_LOG', log)
    self.crawler_process.crawl('tieba', **opts.spargs)
    self.crawler_process.start()
    cfg.save()
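
For orientation, the command above expects cfg.config to carry MySQL credentials, a default forum name, and a mapping from forum (tieba) name to database name. A hypothetical shape, purely to illustrate the keys the code reads (the actual config module is project-specific and not shown here):

cfg_config_example = {
    'MYSQL_HOST': 'localhost',
    'MYSQL_USER': 'tieba',
    'MYSQL_PASSWD': 'secret',
    'DEFAULT_TIEBA': u'example_forum',               # hypothetical default forum name
    'MYSQL_DBNAME': {u'example_forum': 'tieba_db'},  # forum name -> database name
}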