def first_pass(config):
    config = copy.deepcopy(config)
    config['result_dir_prefix'] = config['result_dir_prefix'] + '/first-pass'

    if config['debug']:
        print('Removing fixed params')
    config["fixed_params"] = {}

    config['max_iter'] = 5 if config['debug'] else 150
    if config['debug']:
        print('Overriding max_iter params to %d' % config['max_iter'])

    dry_run = True if config['debug'] else False
    get_params = get_agent_class(config).get_random_config

    results = []
    futures = []
    with concurrent.futures.ProcessPoolExecutor(min(multiprocessing.cpu_count(), config['nb_process'])) as executor:
        nb_config = 5 if config['debug'] else 1000
        for i in range(nb_config):
            params = get_params(config["fixed_params"])
            config.update(params)
            futures.append(executor.submit(exec_first_pass, i, copy.deepcopy(config), params))
        concurrent.futures.wait(futures)

    results = []
    for future in futures:
        results.append(future.result())

    return {
        'results': sorted(results, key=lambda result: result['mean_score'], reverse=True)
    }
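# first_pass (above): samples random hyper-parameter configurations (5 in debug mode,
# 1000 otherwise), evaluates each one in a separate process, and returns the runs
# ranked by 'mean_score' in descending order.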
def second_pass(config, best_agent_config):
    config = copy.deepcopy(config)
    config.update(best_agent_config)
    config['result_dir_prefix'] = config['result_dir_prefix'] + '/second-pass'
    config['max_iter'] = 5 if config['debug'] else 500

    futures = []
    with concurrent.futures.ProcessPoolExecutor(min(multiprocessing.cpu_count(), config['nb_process'])) as executor:
        if config['debug']:
            lrs = [1e-4, 1e-2, 1]
        else:
            lrs = [
                1e-4, 2e-4, 3e-4, 4e-4, 5e-4, 6e-4, 7e-4, 8e-4, 9e-4,
                1e-3, 2e-3, 3e-3, 4e-3, 5e-3, 6e-3, 7e-3, 8e-3, 9e-3,
                1e-2, 2e-2, 3e-2, 4e-2, 5e-2, 6e-2, 7e-2, 8e-2, 9e-2,
                1e-1, 2e-1, 3e-1, 4e-1, 5e-1, 6e-1, 7e-1, 8e-1, 9e-1,
                1,
            ]
        for lr in lrs:
            config['lr'] = lr
            futures.append(executor.submit(exec_second_pass, copy.deepcopy(config)))
        concurrent.futures.wait(futures)

    results = []
    for future in futures:
        results.append(future.result())

    return {
        'best_agent_config': best_agent_config,
        'results': sorted(results, key=lambda result: result['mean_score'], reverse=True),
    }
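# second_pass (above): keeps the best agent configuration found in the first pass fixed
# and sweeps only the learning rate (3 values in debug mode, 37 values from 1e-4 to 1
# otherwise), again ranking runs by 'mean_score'.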
def search(config):
    get_params = get_agent_class(config).get_random_config
    params_keys = list(get_params().keys())
    nb_hp_params = len(params_keys)
    if config['debug']:
        print('*** Number of hyper-parameters: %d' % nb_hp_params)

    config['max_iter'] = 5 if config['debug'] else 500

    futures = []
    with concurrent.futures.ProcessPoolExecutor(min(multiprocessing.cpu_count(), config['nb_process'])) as executor:
        nb_config = 5 if config['debug'] else 200 * nb_hp_params
        for i in range(nb_config):
            params = get_params(config["fixed_params"])
            config.update(params)
            config['random_seed'] = 1
            futures.append(executor.submit(test_params, i, copy.deepcopy(config), copy.deepcopy(params)))
        concurrent.futures.wait(futures)

    results = [future.result() for future in futures]
    results = sorted(results, key=lambda result: result['mean_score'], reverse=True)
    best_params = results[0]['params']

    return {
        'best_params': best_params,
        'results': results,
    }
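# search (above): scales the number of sampled configurations with the number of
# hyper-parameters (200 per parameter) and pins 'random_seed' to 1 so that runs only
# differ by their sampled hyper-parameters.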
def run_all(self):
    try:
        self.api.connect()
    except URLError as e:
        self.application_logger.exception('Failed to connect to the server. Salt API URLError: %s',
                                          e.args[0].strerror)
        self.test_report_logger.debug(e)
        exit(1)

    # Run async tests
    started_counter = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for test in self.test_suite.test_cases_async:
            self.application_logger.info('Start test ' + test.name)
            futures.append(executor.submit(self._start_test_async, test))
        for x in concurrent.futures.as_completed(futures):
            if not x.result():
                self.application_logger.error('Error starting async test')
                executor.shutdown(wait=False)
                exit(1)
            started_counter += 1
            self.application_logger.info('Started test %s of %s', started_counter,
                                         len(self.test_suite.test_cases_async))

    test_counter = 0
    self.application_logger.info('----------------Started all tests-----------------')
    for test in self.test_suite.test_cases_async:
        self.application_logger.info('CollectResult of Test ' + test.name)
        self._collect_result(test)
        test_counter += 1
        self.application_logger.info('Collected results from %s of %s tests', test_counter,
                                     len(self.test_suite.test_cases_async))
    self.application_logger.info('--------------Collected all results---------------')

    # Run sync tests
    for test in self.test_suite.test_cases_sync:
        self.application_logger.info('Start Test ' + test.name)
        self._start_test_sync(test)
        self.application_logger.info('\n')
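# run_all (above): asynchronous test cases are started through a 5-worker thread pool and
# their results collected afterwards; synchronous test cases are then run one by one.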
def _find_filenames(directory, extensions, class_indices, follow_links=False):
    def _recursive_list(subpath):
        return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])

    classes = []
    filenames = []
    subdir = os.path.basename(directory)
    basedir = os.path.dirname(directory)
    for root, _, files in _recursive_list(directory):
        for fname in files:
            is_valid = False
            for extension in extensions:
                if fname.lower().endswith('.' + extension):
                    is_valid = True
                    break
            if is_valid:
                classes.append(class_indices[subdir])
                # add filename relative to directory
                absolute_path = os.path.join(root, fname)
                filenames.append(os.path.relpath(absolute_path, basedir))
    return classes, filenames
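# A minimal usage sketch (hypothetical 'data/train/cats' layout assumed): the class label is
# taken from the basename of 'directory', so the caller maps each class subdirectory to an
# index beforehand, e.g.:
#   class_indices = {'cats': 0, 'dogs': 1}
#   labels, paths = _find_filenames('data/train/cats', {'png', 'jpg', 'jpeg'}, class_indices)
#   # paths come back relative to 'data/train', e.g. 'cats/img_001.jpg'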
def add_handler(self, handler):
    """Adds a transaction family handler

    Args:
        handler (TransactionHandler): the handler to be added
    """
    self._handlers.append(handler)
def _split_batch_list(batch_list):
    new_list = []
    for batch in batch_list.batches:
        new_list.append(batch)
        if len(new_list) == 100:
            yield batch_pb2.BatchList(batches=new_list)
            new_list = []
    if new_list:
        yield batch_pb2.BatchList(batches=new_list)
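# _split_batch_list (above): re-packages a batch_pb2.BatchList into BatchList messages of at
# most 100 batches each, yielding the final (possibly smaller) remainder as well.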
def extract_source(self, source: DataWarehouseSchema,
                   relations: List[RelationDescription]) -> List[RelationDescription]:
    """
    For a given upstream source, iterate through given relations to extract the relations' data.
    """
    self.logger.info("Extracting %d relation(s) from source '%s'", len(relations), source.name)
    failed = []
    with Timer() as timer:
        for i, relation in enumerate(relations):
            try:
                def _monitored_table_extract(attempt_num):
                    with etl.monitor.Monitor(relation.identifier,
                                             "extract",
                                             options=self.options_info(),
                                             source=self.source_info(source, relation),
                                             destination={'bucket_name': relation.bucket_name,
                                                          'object_key': relation.manifest_file_name},
                                             index={"current": i + 1, "final": len(relations),
                                                    "name": source.name},
                                             dry_run=self.dry_run,
                                             attempt_num=attempt_num + 1):
                        self.extract_table(source, relation)

                retries = get_config_int("arthur_settings.extract_retries")
                retry(retries, _monitored_table_extract, self.logger)
            except ETLRuntimeError:
                self.failed_sources.add(source.name)
                failed.append(relation)
                if not relation.is_required:
                    self.logger.exception("Extract failed for non-required relation '%s':", relation.identifier)
                elif self.keep_going:
                    self.logger.exception("Ignoring failure of required relation '%s' and proceeding as requested:",
                                          relation.identifier)
                else:
                    self.logger.debug("Extract failed for required relation '%s'", relation.identifier)
                    raise
    self.logger.info("Finished extract from source '%s': %d succeeded, %d failed (%s)",
                     source.name, len(relations) - len(failed), len(failed), timer)
    return failed
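# extract_source (above): each table extract runs inside an etl.monitor.Monitor context and is
# retried according to the 'arthur_settings.extract_retries' setting; failures of required
# relations re-raise unless keep_going is set, while non-required failures are only logged.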
def extract_sources(self) -> None:
    """
    Iterate over sources to be extracted and parallelize extraction at the source level
    """
    self.logger.info("Starting to extract %d relation(s) in %d schema(s)", len(self.relations), len(self.schemas))
    self.failed_sources.clear()
    max_workers = len(self.schemas)
    # TODO With Python 3.6, we should pass in a thread_name_prefix
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for source_name, relation_group in groupby(self.relations, attrgetter("source_name")):
            future = executor.submit(self.extract_source, self.schemas[source_name], list(relation_group))
            futures.append(future)
        if self.keep_going:
            done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.ALL_COMPLETED)
        else:
            done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)

    if self.failed_sources:
        self.logger.error("Failed to extract from these source(s): %s", join_with_quotes(self.failed_sources))

    # Note that iterating over result of futures may raise an exception (which surfaces exceptions from threads)
    missing_tables = []  # type: List
    for future in done:
        missing_tables.extend(future.result())
    for table_name in missing_tables:
        self.logger.warning("Failed to extract: '%s'", table_name.identifier)
    if not_done:
        raise DataExtractError("Extract failed to complete for {:d} source(s)".format(len(not_done)))
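# extract_sources (above): one worker thread per source schema; with keep_going the code waits
# for all futures (ALL_COMPLETED), otherwise it stops waiting at the first exception
# (FIRST_EXCEPTION) and reports any sources that did not finish.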
def sync_with_s3(relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False) -> None:
    """
    Copy (validated) table design and SQL files from local directory to S3 bucket.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # typing: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # typing: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader.__call__, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1
    if not dry_run:
        logger.info("Uploaded %d of %d file(s) to 's3://%s/%s' (%s)",
                    len(files) - errors, len(files), bucket_name, prefix, timer)
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
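# sync_with_s3 (above): validates the table designs, pairs every design (and transformation SQL)
# file with its S3 key under 'prefix', then uploads the pairs through an 8-worker thread pool,
# counting per-file failures via Future.exception().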
def replace_vars(value, *, autonumify=False):
    if isinstance(value, dict):
        materialized = {}
        for k, v in value.items():
            materialized[k] = replace_vars(v)
        return materialized
    if isinstance(value, list):
        materialized = []
        for v in value:
            materialized.append(replace_vars(v))
        return materialized
    if type(value) is str:
        for var in re.findall("<<\\w+>>", value):
            k = var.replace('<<', '').replace('>>', '')
            nv = STORED_VARS[k] if k in STORED_VARS else None
            if nv is not None:
                value = value.replace(var, str(nv))
        if autonumify:
            try:
                return float(value)
            except Exception:
                pass
    return value
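# A minimal usage sketch, assuming the module-level STORED_VARS dict referenced above is a plain
# dict; <<name>> placeholders with a matching key are substituted, unknown ones are left
# untouched (the entry below is purely illustrative):
STORED_VARS['base_url'] = 'https://example.com'
print(replace_vars(['<<base_url>>/health', '<<missing>>/ping']))
# -> ['https://example.com/health', '<<missing>>/ping']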
def load_classes(kind: str):
    import pkgutil
    import importlib

    paths = []
    paths.append(os.path.join(os.path.dirname(__file__), "..", "plugins"))
    extra_path = os.getenv('PHAT_EXTRA_PLUGINS_DIR')
    if extra_path:
        extra_path = os.path.join(extra_path, "plugins")
        paths.append(extra_path)
    for path in paths:
        if path not in sys.path:
            sys.path.append(path)

    logger.debug("Loading {}s from {}: ".format(kind, paths))
    loaded_items = []
    for _, name, ispkg in pkgutil.walk_packages(path=paths):
        logger.debug("Loading {}: {}".format(kind, name))
        if ispkg:
            try:
                module = importlib.import_module('{}.{}'.format(name, kind))
            except ImportError as ex:
                logger.debug("Error importing module: {0} of kind: {1}".format(name, kind))
            else:
                loaded_items.append(module)
    # TODO: Return the classes instead of modules
    return loaded_items
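# load_classes (above): walks the bundled 'plugins' package plus an optional directory from the
# PHAT_EXTRA_PLUGINS_DIR environment variable, importing '<package>.<kind>' from every package
# it finds and returning the imported modules.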
def include_files(self, filename):
    materialized_tests = []
    with open(filename, encoding='utf-8') as f:
        try:
            settings = json.load(f)
        except JSONDecodeError as e:
            data = open(filename, 'r').read()
            print("{filename}:{line}:{col}: failed to decode json: {msg}".format(filename=filename, line=e.lineno,
                                                                                 col=e.colno, msg=e.msg))
            print("\tGave up here: {context} ?".format(
                context=repr(data[max(0, e.pos - 40):e.pos + 1].translate(str.maketrans("\t\n", "  ")))))
            exit(1)
        tests = settings["tests"]
        it = iter(tests)
        for item in it:
            if 'include' in item:
                inc_filename = item['include']
                if not os.path.isabs(inc_filename):
                    base_path = os.path.dirname(filename)
                    inc_filename = os.path.join(base_path, inc_filename)
                inc = self.include_files(inc_filename)
                materialized_tests.extend(inc)
            else:
                materialized_tests.append(item)
    return materialized_tests
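# include_files (above): expects a JSON document shaped roughly like
# {"tests": [{"include": "common.json"}, {...test definition...}]}; 'include' entries are
# resolved relative to the including file and expanded recursively, everything else is appended
# verbatim to the materialized test list.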
def fail(self, message, *, url=None):
    if url is None:
        url = self.url
    url = replace_vars(url)
    self.errors.append({"url": url, "error": message})
    print("\N{BALLOT X}", end="", flush=True)
def chunkify(iterable, chunksize=3000):
    i = 0
    chunk = []
    for item in iterable:
        chunk.append(item)
        i += 1
        if i == chunksize:
            yield chunk
            i = 0
            chunk = []
    if chunk:
        yield chunk
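# A minimal usage sketch: re-chunk any iterable into fixed-size lists, with the last chunk
# holding whatever remains.
for chunk in chunkify(range(7), chunksize=3):
    print(chunk)  # [0, 1, 2], then [3, 4, 5], then [6]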
async def get_blocks(self, block_nums):
    requests = (
        {
            'jsonrpc': '2.0', 'id': block_num, 'method': 'get_block',
            'params': [block_num]
        } for block_num in block_nums)
    batched_requests = chunkify(requests, self.batch_request_size)
    coros = (self.fetch(batch) for batch in batched_requests)
    first_coros = islice(coros, 0, self.concurrent_tasks_limit)
    futures = [asyncio.ensure_future(c) for c in first_coros]
    logger.debug(f'initial futures:{len(futures)}')
    start = time.perf_counter()
    while futures:
        await asyncio.sleep(0)
        # iterate over a snapshot, since futures is mutated inside the loop
        for f in list(futures):
            try:
                if f.done():
                    self._perf_history.append(time.perf_counter() - start)
                    result = f.result()
                    futures.remove(f)
                    logger.debug(f'futures:{len(futures)}')
                    try:
                        futures.append(asyncio.ensure_future(next(coros)))
                    except StopIteration:
                        logger.debug('StopIteration')
                    except concurrent.futures._base.CancelledError:
                        return
                    start = time.perf_counter()
                    yield result
            except KeyboardInterrupt:
                logger.debug('client.get_blocks kbi')
                for f in futures:
                    f.cancel()
                self.close()
                return
            except Exception as e:
                logger.exception(f'client.get_blocks error:{e}')
                continue
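# get_blocks (above): keeps at most 'concurrent_tasks_limit' batched get_block requests in
# flight, yields each batch result as its future completes, and immediately schedules the next
# batch until the request generator is exhausted.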
def detectValidExtensions(self, extensions, maxN, extList=None):
    self.logger.info("### Starting detection of valid extensions ...")
    n = 0
    if extList:
        tmpExtList = []
        for e in extList:
            tmpExtList.append((e, getMime(extensions, e)))
    else:
        tmpExtList = extensions
    validExtensions = []
    extensionsToTest = tmpExtList[0:maxN]
    with concurrent.futures.ThreadPoolExecutor(max_workers=self.threads) as executor:
        futures = []
        try:
            for ext in extensionsToTest:
                f = executor.submit(self.uploadFile, "." + ext[0], ext[1], os.urandom(self.size))
                f.ext = ext
                f.add_done_callback(self.detectValidExtension)
                futures.append(f)
            for future in concurrent.futures.as_completed(futures):
                a = future.result()
                n += 1
        except KeyboardInterrupt:
            self.shouldLog = False
            executor.shutdown(wait=False)
            self.stopThreads = True
            executor._threads.clear()
            concurrent.futures.thread._threads_queues.clear()
    return n
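# detectValidExtensions (above): uploads a random payload once per candidate extension from a
# thread pool, passing each result to the detectValidExtension done-callback; on Ctrl-C the
# executor's internal queues are cleared so pending uploads are abandoned immediately.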
# detects if code execution is gained, given a URL to request and a regex supposed to match the executed code output
def detectForms(html):
    soup = BeautifulSoup(html, 'html.parser')
    detectedForms = soup.find_all("form")
    returnForms = []
    if len(detectedForms) > 0:
        for f in detectedForms:
            fileInputs = f.findChildren("input", {"type": "file"})
            if len(fileInputs) > 0:
                returnForms.append((f, fileInputs))
    return returnForms
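# A minimal usage sketch, assuming BeautifulSoup is imported as in the function above:
sample_html = '<form action="/upload" method="post"><input type="file" name="doc"></form>'
for form, file_inputs in detectForms(sample_html):
    print(form.get('action'), [i.get('name') for i in file_inputs])  # /upload ['doc']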