def LoadFromHdfFile(InputDir, mode='train'):
    if mode == 'train':
        data = pd.read_hdf(path_or_buf='%s/train.hdf' % InputDir, key='train')
    elif mode == 'valid':
        data = pd.read_hdf(path_or_buf='%s/valid.hdf' % InputDir, key='valid')
    else:
        data = pd.read_hdf(path_or_buf='%s/test.hdf' % InputDir, key='test')

    return data
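## Hedged usage sketch (not from the original source): 'input' is a placeholder directory
## assumed to contain train.hdf / valid.hdf / test.hdf with matching HDF keys.
import pandas as pd   # LoadFromHdfFile above relies on pandas being imported as pd

train_data = LoadFromHdfFile('input', mode='train')
valid_data = LoadFromHdfFile('input', mode='valid')
test_data = LoadFromHdfFile('input', mode='test')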
## class method, load data with pkl format
def LoadFromPklFile(InputDir):
    # the with-blocks close the files on exit, so no explicit close() is needed
    with open('%s/train.pkl' % InputDir, 'rb') as i_file:
        TrainData = pickle.load(i_file)
    with open('%s/test.pkl' % InputDir, 'rb') as i_file:
        TestData = pickle.load(i_file)

    return TrainData, TestData
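## Hedged counterpart sketch (not from the original source): a hypothetical SaveToPklFile
## that writes the two pickles LoadFromPklFile above expects to find in InputDir.
import pickle

def SaveToPklFile(TrainData, TestData, OutputDir):
    with open('%s/train.pkl' % OutputDir, 'wb') as o_file:
        pickle.dump(TrainData, o_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open('%s/test.pkl' % OutputDir, 'wb') as o_file:
        pickle.dump(TestData, o_file, protocol=pickle.HIGHEST_PROTOCOL)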
## class method, load data with text format
def download_and_unzip_result(url, job_hash):
    r = requests.get(url, stream=True)
    status_check(r)

    total_size = int(r.headers.get('content-length', 0))
    with open('download.zip', 'wb') as f:
        pbar = tqdm(total=total_size, unit='B', unit_scale=True)
        chunk_size = 1024 * 32  # 32 KiB
        for data in r.iter_content(chunk_size):
            f.write(data)
            pbar.update(len(data))  # update by the actual bytes written; the last chunk may be shorter
        pbar.close()

    with open('download.zip', 'rb') as f:
        zip_content = f.read()
    z = ZipFile(io.BytesIO(zip_content))
    z.extractall()
    remove('download.zip')

    result = None  # output of the script
    new_files = None  # names of new files created by the script

    pickle_path = path.abspath(path.join(job_hash, job_hash + '.pkl'))
    if path.isfile(pickle_path):
        with open(pickle_path, 'rb') as f:
            # Hack: a workaround for dill's pickling problem
            # import_all()
            result = dill.load(f)
            # unimport_all()
        remove(pickle_path)

    if path.isdir(job_hash):
        new_files = listdir(job_hash)
        for name in new_files:
            rename(path.join(job_hash, name), name)
        rmtree(job_hash)

    return result, new_files
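## Hedged usage sketch (illustrative only): the URL and job hash are made-up placeholders;
## status_check() and the imports (requests, tqdm, dill, ZipFile, os/shutil helpers) come
## from the module this snippet was taken from.
result, new_files = download_and_unzip_result('https://example.com/jobs/abc123/result.zip',
                                              'abc123')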
def load(path, num_cpu=16):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)

        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))

    return ActWrapper(act, act_params)
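## Hedged usage sketch (illustrative only): "model.pkl" is a placeholder path, and the
## call pattern assumes the returned ActWrapper can be applied to a batch of observations.
act = load("model.pkl", num_cpu=4)
# action = act(obs[None])[0]   # obs would come from the environment; shown here as an assumption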
def test_verify_features_does_not_work_by_default():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()
    ml_predictor = utils.train_basic_binary_classifier(df_titanic_train)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)

    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except:
        pass

    with warnings.catch_warnings(record=True) as w:
        results = saved_ml_pipeline.named_steps['final_model'].verify_features(df_titanic_test)
        print('Here are the caught warnings:')
        print(w)

        assert len(w) == 1
        assert results is None
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
        , 'sex': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)

    os.remove(file_name)

    missing_features = saved_ml_pipeline.named_steps['final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))

    assert len(missing_features['prediction_not_training']) == 0
    assert len(missing_features['training_not_prediction']) == 0
def load_ml_model(file_name):
    with open(file_name, 'rb') as read_file:
        base_pipeline = dill.load(read_file)

    if isinstance(base_pipeline, utils_categorical_ensembling.CategoricalEnsembler):
        for step in base_pipeline.transformation_pipeline.named_steps:
            pipeline_step = base_pipeline.transformation_pipeline.named_steps[step]
            try:
                if pipeline_step.get('model_name', 'reallylongnonsensicalstring')[:12] == 'DeepLearning':
                    pipeline_step.model = insert_deep_learning_model(pipeline_step, file_name)
            except AttributeError:
                pass

        for step in base_pipeline.trained_models:
            pipeline_step = base_pipeline.trained_models[step]
            try:
                if pipeline_step.get('model_name', 'reallylongnonsensicalstring')[:12] == 'DeepLearning':
                    pipeline_step.model = insert_deep_learning_model(pipeline_step, file_name)
            except AttributeError:
                pass
    else:
        for step in base_pipeline.named_steps:
            pipeline_step = base_pipeline.named_steps[step]
            try:
                if pipeline_step.get('model_name', 'reallylongnonsensicalstring')[:12] == 'DeepLearning':
                    pipeline_step.model = insert_deep_learning_model(pipeline_step, file_name)
            except AttributeError:
                pass

    return base_pipeline
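## Hedged usage sketch (illustrative only): the file name is a placeholder, and df_new is a
## hypothetical DataFrame with the same columns the pipeline was trained on.
trained_pipeline = load_ml_model('trained_ml_pipeline.dill')
# predictions = trained_pipeline.predict(df_new)   # assumes the loaded pipeline exposes predict()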
# Keeping this here for legacy support
def load(path, num_cpu=16):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)

        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))

    return ActWrapper(act, act_params)
def Load(file):
    """ Loads a model from the specified file """
    with open(file, 'rb') as in_file:
        model = dill.load(in_file)
    return model
def update_session(fname=None):
    import dill as pickle
    if fname is None:
        fname = conf.session
    try:
        s = pickle.load(gzip.open(fname, "rb"))
    except IOError:
        s = pickle.load(open(fname, "rb"))
    scapy_session = builtins.__dict__["scapy_session"]
    scapy_session.update(s)
################
##### Main #####
################
def load_object(fname):
    import dill as pickle
    return pickle.load(gzip.open(fname, "rb"))
def load_csr(f, return_y=False):
    npz = np.load(f)
    X = csr_matrix((npz['data'], npz['indices'], npz['indptr']),
                   shape=npz['shape'])

    if return_y:
        return X, npz['y']
    else:
        return X
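## Hedged counterpart sketch (not from the original source): a hypothetical save_csr that
## writes the npz layout load_csr above expects (data/indices/indptr/shape, optional y).
import numpy as np
from scipy.sparse import csr_matrix

def save_csr(f, X, y=None):
    X = csr_matrix(X)
    arrays = dict(data=X.data, indices=X.indices, indptr=X.indptr, shape=X.shape)
    if y is not None:
        arrays['y'] = y
    np.savez(f, **arrays)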
def load_results(key, args):
    fn = cache_fname(key, args)
    with open(fn, "rb") as f:
        return dill.load(f)
def saga_score_struct_cache(*args):
    arghash = sha1(repr(("score_struct",) + args).encode('utf-8')).hexdigest()
    fn = "res/baseline_linear_{}.dill".format(arghash)
    try:
        with open(fn, 'rb') as f:
            out = dill.load(f)
            logging.info("Loaded cached version.")
    except FileNotFoundError:
        logging.info("Computing...")
        out = saga_score_struct(*args)
        with open(fn, 'wb') as f:
            dill.dump(out, f)
    return out
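## Hedged generalization (not from the original source): the same hash-and-dill caching
## pattern as saga_score_struct_cache above, written as a reusable decorator sketch.
import logging
from functools import wraps
from hashlib import sha1

import dill

def dill_disk_cache(prefix, cache_dir="res"):
    def decorator(func):
        @wraps(func)
        def wrapper(*args):
            arghash = sha1(repr((prefix,) + args).encode('utf-8')).hexdigest()
            fn = "{}/{}_{}.dill".format(cache_dir, prefix, arghash)
            try:
                with open(fn, 'rb') as f:
                    out = dill.load(f)
                logging.info("Loaded cached version.")
            except FileNotFoundError:
                logging.info("Computing...")
                out = func(*args)
                with open(fn, 'wb') as f:
                    dill.dump(out, f)
            return out
        return wrapper
    return decorator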
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    fn = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio,
                                         constraints))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)

        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)

        scores.append(bl._score(Y_true, Y_pred))

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(scores, f)

    return scores
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    fn = cache_fname("svmstruct_cv_score", (dataset, C, class_weight,
                                            constraints, compat_features,
                                            second_order_features))
    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    # which second-order factors are enabled depends on the dataset
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores = []
    all_Y_pred = []

    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        train_docs = list(load(ids[tr]))
        val_docs = list(load(ids[val]))

        clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
                                         class_weight,
                                         constraints, compat_features,
                                         second_order_features, grandparents,
                                         coparents, siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(fn, "wb") as f:
        dill.dump((scores, all_Y_pred), f)

    return scores, all_Y_pred
def read_dill(file_):
    """
    Deserialize a computation from a file or file-like object

    :param file_: If string, reads from a file at that path
    :type file_: File-like object, or string
    """
    if isinstance(file_, six.string_types):
        with open(file_, 'rb') as f:
            return dill.load(f)
    else:
        return dill.load(file_)
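## Hedged counterpart sketch (not from the original source): a hypothetical write_dill that
## mirrors read_dill above, serializing to either a path or a file-like object.
import dill
import six

def write_dill(computation, file_):
    """
    Serialize a computation to a file or file-like object

    :param file_: If string, writes to a file at that path
    :type file_: File-like object, or string
    """
    if isinstance(file_, six.string_types):
        with open(file_, 'wb') as f:
            dill.dump(computation, f)
    else:
        dill.dump(computation, file_)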
def load(self, pkl, filepath):
    entity_type = pkl.get_entity_type(self._entity_type_id)
    self.__dict__ = entity_type.__dict__

    # initialize index
    self._ann_obj = AnnoyIndex(pkl.get_nfactor(), entity_type._metric)
    # mmap the file
    self._ann_obj.load(filepath)
def load_entities(self, entities, file_getter):
    """Load underlying entities."""
    for k in entities:
        annoy_filepath = file_getter.get_file_path('{}.ann'.format(k))
        try:
            self._annoy_objects[k].load(self, annoy_filepath)
        except IOError as e:
            raise IOError(
                "Error: cannot load file {0}, which was built "
                "with the model. '{1}'".format(annoy_filepath, e)
            )