def test_models(models_dir, test_dir, out_dir):
    models_dir = utils.abs_path_dir(models_dir) + "/"
    test_dir = utils.abs_path_dir(test_dir) + "/"
    utils.create_dir(out_dir)
    test_files = os.listdir(test_dir)
    models = os.listdir(models_dir)
    for model in models:
        utils.print_success(model)
        pred_dir = out_dir + model + "/"
        utils.create_dir(pred_dir)
        clf = joblib.load(models_dir + model + "/" + model + ".pkl")
        for index, test_file in enumerate(test_files):
            print(str(index) + "\t" + test_file)
            sys.stdout.write("\033[F")
            sys.stdout.write("\033[K")
            test_features = read_test_file(test_dir + test_file)
            predictions = clf.predict_proba(test_features)
            with open(pred_dir + test_file, "w") as filep:
                for pred in predictions:
                    filep.write(str(pred[0]) + "\n")
        sys.stdout.write("\033[K")
def _caches_to_file(cache_path, start, end, name, cb, concat):
    start_time = time()
    if concat:
        all_data = []
        for i in range(start, end):
            data = load(os.path.join(cache_path, "{0}.jb".format(i)))
            all_data.extend(data)
        dump(all_data, name, 3)
    else:
        target_path = os.path.join(cache_path, name[:-3])
        if not os.path.exists(target_path):
            os.makedirs(target_path)
        for i in range(start, end):
            src_file_path = os.path.join(cache_path, "{0}.jb".format(i))
            basename = os.path.basename(src_file_path)
            target_file_path = os.path.join(target_path, basename)
            shutil.move(src_file_path, target_file_path)
        finished_flag = os.path.join(target_path, '.finished')
        with open(finished_flag, 'a'):
            os.utime(finished_flag, None)
    logging.debug("Finished saving data to {0}. Took {1}s".format(name, time() - start_time))
    cb()
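
# Hedged sketch (not from the source) of how the "{i}.jb" cache chunks consumed above
# could be produced, assuming the bare load/dump names are joblib.load/joblib.dump;
# write_caches, the chunk size and the paths are illustrative.
import os
from joblib import dump

def write_caches(records, cache_path, chunk_size=1000):
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    for i, start in enumerate(range(0, len(records), chunk_size)):
        # compress=3 mirrors the third positional argument in dump(all_data, name, 3)
        dump(records[start:start + chunk_size],
             os.path.join(cache_path, "{0}.jb".format(i)), compress=3)
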
def create_ds_from_splits(mat_file_prefix, db, splits):
    """Create datasets from splits of data.

    This function exists because larger datasets are shipped as multiple splits.
    """
    for batch_idx in xrange(1, splits+1):
        # load data
        temp_data = load_mat_data(mat_file_prefix+str(batch_idx)+".mat")
        print ("[MESSAGE] Loaded %d-th split" % batch_idx)
        group_name = "grid_data_split_"+str(batch_idx)
        # save data within splits
        for key in data_dict:
            add_h5_group(group_name, db)
            add_h5_ds(temp_data[key], key, db, group_name)
            print ("[MESSAGE] %s: Saved %s" % (group_name, key))
def plot_all_policy_at0(path_experiment, color, num_iter=100, fig_dir=None):
    mean_at_0 = []
    var_at_0 = []
    for itr in range(num_iter):
        data_bimodal_1d = joblib.load(os.path.join(path_experiment, 'itr_{}.pkl'.format(itr)))
        poli = data_bimodal_1d['policy']
        action_at_0 = poli.get_action(np.array((0,)))
        mean_at_0.append(action_at_0[1]['mean'])
        var_at_0.append(action_at_0[1]['log_std'])
        # print "sampled action in iter {}: {}. Reward should be: {}".format(itr, action_at_0[0], reward(action_at_0[0]))
    itr = list(range(num_iter))
    plt.plot(itr, mean_at_0, color=color, label='mean at 0')
    plt.plot(itr, var_at_0, color=color * 0.7, label='logstd at 0')
    plt.title('How the policy varies across iterations')
    plt.xlabel('iteration')
    plt.ylabel('mean and log-std at 0')
    plt.legend(loc=3)
    if fig_dir:
        plt.savefig(os.path.join(fig_dir, 'policy_progress'))
    else:
        print("No directory for saving plots")

## estimate by MC the policy at 0!
def plot_all_policy_at0(path_experiment, color, num_iter=100, fig_dir=None):
    mean_at_0 = []
    var_at_0 = []
    for itr in range(num_iter):
        data_bimodal_1d = joblib.load(os.path.join(path_experiment, 'itr_{}.pkl'.format(itr)))
        poli = data_bimodal_1d['policy']
        action_at_0 = poli.get_action(np.array((0,)))
        mean_at_0.append(action_at_0[1]['mean'])
        var_at_0.append(action_at_0[1]['log_std'])
    itr = list(range(num_iter))
    plt.plot(itr, mean_at_0, color=color, label='mean at 0')
    plt.plot(itr, var_at_0, color=color * 0.7, label='logstd at 0')
    plt.title('How the policy varies across iterations')
    plt.xlabel('iteration')
    plt.ylabel('mean and log-std at 0')
    plt.legend(loc=3)
    if fig_dir:
        plt.savefig(os.path.join(fig_dir, 'policy_at_0'))
    else:
        print("No directory for saving plots")

## plot for all the experiments
def __init__(self):
    et = EntityTracker()
    self.bow_enc = BoW_encoder()
    self.emb = UtteranceEmbed()
    at = ActionTracker(et)
    self.dataset, dialog_indices = Data(et, at).trainset
    train_indices = joblib.load('data/train_test_list/train_indices_759')
    test_indices = joblib.load('data/train_test_list/test_indices_759_949')
    self.dialog_indices_tr = train_indices
    self.dialog_indices_dev = test_indices
    obs_size = self.emb.dim + self.bow_enc.vocab_size + et.num_features
    self.action_templates = at.get_action_templates()
    action_size = at.action_size
    nb_hidden = 128
    self.net = LSTM_net(obs_size=obs_size,
                        action_size=action_size,
                        nb_hidden=nb_hidden)
def train(filePath):
    try:
        if not filePath.lower().endswith('json'):
            return {'success': False, 'message': 'Training file should be in json format'}
        with open(filePath) as file:
            ent_data = json.load(file)
        dataset = [jsonToCrf(q, nlp) for q in ent_data['entity_examples']]
        X_train = [sent2features(s) for s in dataset]
        y_train = [sent2labels(s) for s in dataset]
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)
        if not os.path.exists("crfModel"):
            os.mkdir("crfModel")
        if os.path.isfile("crfModel/classifier.pkl"):
            os.remove("crfModel/classifier.pkl")
        joblib.dump(crf, "crfModel/classifier.pkl")
        return {'success': True, 'message': 'Model Trained Successfully'}
    except Exception as ex:
        return {'success': False, 'message': 'Error while Training the model - ' + str(ex)}
def predict(utterance):
    try:
        tagged = []
        finallist = []
        parsed = nlp(utterance)
        for i in range(len(parsed)):
            tagged.append((str(parsed[i]), parsed[i].tag_))
        finallist.append(tagged)
        test = [sent2features(s) for s in finallist]
        if os.path.isfile("crfModel/classifier.pkl"):
            crf = joblib.load("crfModel/classifier.pkl")
        else:
            return {'success': False, 'message': 'Please Train the model first'}
        predicted = crf.predict(test)
        entityList = extractEntities(predicted[0], tagged)
        return {'success': True, 'entitiesPredicted': entityList}
    except Exception as ex:
        return {'success': False, 'message': 'Error while prediction - ' + str(ex)}
def setNetwork(self, fname, nEpoch=-1):
    """ Set values into the network

    Parameters
    -----------
    fname : string
        Name of the file where the values are
    nEpoch : int
        Epoch number (Optional)
    """
    basename = "nnets/" + fname
    if nEpoch >= 0:
        all_params = joblib.load(basename + ".epoch={}".format(nEpoch))
    else:
        all_params = joblib.load(basename)
    self._network.setAllParams(all_params)
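
# Hedged counterpart sketch (not from the source): saving parameters under the same
# "nnets/<fname>.epoch=<n>" naming convention that setNetwork() reads back.
# The dumpNetwork name and getAllParams() are assumptions mirroring setAllParams().
import os
import joblib

def dumpNetwork(self, fname, nEpoch=-1):
    basename = "nnets/" + fname
    if not os.path.exists("nnets"):
        os.makedirs("nnets")
    all_params = self._network.getAllParams()
    if nEpoch >= 0:
        joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
    else:
        joblib.dump(all_params, basename)
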
def __init__(self, *args, **kwargs):
    super(score_simple, self).__init__(*args, **kwargs)
    f_db = os.path.join(
        kwargs['output_data_directory'],
        kwargs['term_frequency']['f_db']
    )
    if not os.path.exists(f_db):
        msg = "{} not computed yet, needed for TF methods!"
        raise ValueError(msg.format(f_db))
    score_config = simple_config.load()["score"]
    f_csv = os.path.join(
        score_config["output_data_directory"],
        score_config["term_document_frequency"]["f_db"],
    )
    IDF = pd.read_csv(f_csv)
    IDF = dict(zip(IDF["word"].values, IDF["count"].values))
    self.corpus_N = IDF.pop("__pipeline_document_counter")
    # Compute the IDF
    for key in IDF:
        IDF[key] = np.log(float(self.corpus_N) / (IDF[key] + 1))
    self.IDF = IDF
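
# Tiny worked example of the smoothed IDF used above, idf(w) = log(N / (df(w) + 1));
# the corpus size and document frequencies below are made up.
import numpy as np

corpus_N = 1000
doc_freq = {"cell": 120, "protein": 45}
idf = {w: np.log(float(corpus_N) / (df + 1)) for w, df in doc_freq.items()}
print(idf)  # e.g. idf["cell"] = log(1000/121) ~ 2.11
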
def restore(self, checkpoint_dir=None):
    if checkpoint_dir is None:
        checkpoint_dir = logger.get_snapshot_dir()
    checkpoint_file = os.path.join(checkpoint_dir, 'params.chk')
    if os.path.isfile(checkpoint_file + '.meta'):
        sess = tf.get_default_session()
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_file)
        tabular_chk_file = os.path.join(checkpoint_dir, 'progress.csv.chk')
        if os.path.isfile(tabular_chk_file):
            tabular_file = os.path.join(checkpoint_dir, 'progress.csv')
            logger.remove_tabular_output(tabular_file)
            shutil.copy(tabular_chk_file, tabular_file)
            logger.add_tabular_output(tabular_file)
        pool_file = os.path.join(checkpoint_dir, 'pool.chk')
        if self.save_format == 'pickle':
            pickle_load(pool_file)
        elif self.save_format == 'joblib':
            self.pool = joblib.load(pool_file)
        else:
            raise NotImplementedError
        logger.log('Restored from checkpoint %s' % checkpoint_file)
    else:
        logger.log('No checkpoint %s' % checkpoint_file)
def read_data_from_pkl(datafile):
    """
    Read a pkl file written by joblib.dump.
    :param datafile: filename of the pkl
    :return: (inputs, labels) arrays
    """
    datas = joblib.load(datafile)
    for i in range(10):
        datas = np.random.permutation(datas)
    inputs, labels = [], []
    for data in datas:
        inputs.append(data["input"])
        labels.append(data["label"])
    inputs = np.array(inputs).reshape(-1, 15, 101, 101, 3).astype(np.float32)
    inputs -= np.mean(inputs, axis=(2, 3), keepdims=True)
    inputs /= np.std(inputs, axis=(2, 3), keepdims=True)
    labels = np.array(labels).reshape(-1, 1).astype(np.float32)
    return inputs, labels
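
# Hedged companion sketch: writing a pkl in the layout read_data_from_pkl() expects,
# i.e. a list of dicts with "input" (15x101x101x3) and "label" entries. The dummy
# data and file name below are purely illustrative.
import numpy as np
import joblib

samples = [{"input": np.random.rand(15, 101, 101, 3).astype(np.float32), "label": 0.0}
           for _ in range(8)]
joblib.dump(samples, "train_samples.pkl")
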
def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
    if embedding_type == 'google':
        embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
        embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
    elif embedding_type == 'twitter':
        estimator = Pipeline([
            ('tokenize', MapCorporas(tokenize_)),
            ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
            ), self.memory))),
        ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
        embeddings_ = estimator.named_steps['word2vec'].estimator
        embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
    else:
        embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})
    estimator = Pipeline([
        ('tokenize', MapCorporas(tokenize_)),
        # 0.25 is chosen so the unknown vectors have approximately the same variance as the Google pre-trained ones
        ('embeddings', MapCorporas(Embeddings(
            embeddings_, rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
            include_zero=True
        ))),
    ])
    estimator.fit(construct_docs)
    return estimator.named_steps['embeddings'].estimator
def __init__(self, num_rnn_hidden, num_classes, utter_embed,
             sequence_length, filter_sizes, num_filters, config, l2_reg_lambda=0.0):
    self.num_hidden = num_rnn_hidden
    self.num_classes = num_classes
    self.utter_embed = utter_embed
    self.logger = config.logger
    self.sequence_length = sequence_length
    self.embedding_size = utter_embed.get_vector_size()
    self.filter_sizes = filter_sizes
    self.num_filters = num_filters
    self.l2_reg_lambda = l2_reg_lambda
    self.config = config
    self.cate_mapping_dict = joblib.load('./dbdc3/data/cate_mapping_dict')
def read(self, unpickler):
    """Reconstruct the array."""
    filename = os.path.join(unpickler._dirname, self.filename)
    # Load the array from the disk
    # use getattr instead of self.allow_mmap to ensure backward compat
    # with NDArrayWrapper instances pickled with joblib < 0.9.0
    allow_mmap = getattr(self, 'allow_mmap', True)
    memmap_kwargs = ({} if not allow_mmap
                     else {'mmap_mode': unpickler.mmap_mode})
    array = unpickler.np.load(filename, **memmap_kwargs)
    # Reconstruct subclasses. This does not work with old
    # versions of numpy
    if (hasattr(array, '__array_prepare__') and
            self.subclass not in (unpickler.np.ndarray,
                                  unpickler.np.memmap)):
        # We need to reconstruct another subclass
        new_array = unpickler.np.core.multiarray._reconstruct(
            self.subclass, (0,), 'b')
        return new_array.__array_prepare__(array)
    else:
        return array
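
# Hedged illustration of the user-facing side of the memmap handling above: passing
# mmap_mode to joblib.load memory-maps the stored array instead of reading it into RAM.
# The file name is hypothetical.
import numpy as np
import joblib

joblib.dump(np.arange(10**6, dtype=np.float64), "big_array.pkl")
arr = joblib.load("big_array.pkl", mmap_mode="r")  # read-only numpy.memmap
print(type(arr), arr[:5])
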
def get_vocab_index_embedding_weights(self, embedding_dim, embedding_weights_masking, load_embeddings_pickled=False, load_vocab_pickled=False):
    embedding_weights = []
    # Suffix that selects between the masked and the non-masked pickled vocabulary files
    if embedding_weights_masking:
        masking_value = "masked"
    else:
        masking_value = "non_masked"
    # Load data from files
    if load_vocab_pickled:
        vocab_index_dict = joblib.load(self.conf.vocab_index_file.format(masking_value))
        vocab_set = joblib.load(self.conf.vocab_set_file.format(masking_value))
    else:
        vocab_set, vocab_index_dict = self.generate_vocabulary_set(masking=embedding_weights_masking)
    if self.conf.feature_level == "word":
        embedding_weights = self.load_word_embeddings_compact(embedding_dim, vocab_set,
                                                              masking=embedding_weights_masking,
                                                              use_pickled=load_embeddings_pickled)
    return embedding_weights, vocab_index_dict
def add_file(prediction, create, value, *args, **kwargs):
    train_featureset = prediction.model.featureset
    fset_data, data = featurize.load_featureset(train_featureset.file_uri)
    if 'class' in prediction.dataset.name or 'regr' in prediction.dataset.name:
        labels = data['labels']
    else:
        labels = []
    model_data = joblib.load(prediction.model.file_uri)
    if hasattr(model_data, 'best_estimator_'):
        model_data = model_data.best_estimator_
    preds = model_data.predict(fset_data)
    pred_probs = (pd.DataFrame(model_data.predict_proba(fset_data),
                               index=fset_data.index.astype(str),
                               columns=model_data.classes_)
                  if hasattr(model_data, 'predict_proba') else [])
    all_classes = model_data.classes_ if hasattr(model_data, 'classes_') else []
    pred_path = pjoin(TMP_DIR, '{}.npz'.format(str(uuid.uuid4())))
    featurize.save_featureset(fset_data, pred_path, labels=labels,
                              preds=preds, pred_probs=pred_probs)
    prediction.file_uri = pred_path
    DBSession().commit()
def train_model_group(model_info, function='logistic'):
    model = joblib.load(getTheFile('models/{}.m').format(function))
    base_info = model_config.model_dim
    user_id = model_info['user_id']
    del model_info['user_id']
    for key, value in base_info.items():
        if key not in model_info.columns:
            model_info[key] = value
    # model_info = transform_data.transform_with_woe(model_info)
    model_info = model_info[sorted(model_info.columns)]
    x = np.matrix(model_info)
    yt = model.predict(x)
    ytp = model.predict_proba(x)
    return user_id, yt, ytp
def process_map_data(path):
    data = joblib.load(path)
    im_data = data['im']
    value_data = data['value']
    state_data = data['state']
    label_data = np.array([np.eye(1, 8, l)[0] for l in data['label']])
    num = im_data.shape[0]
    num_train = num - num // 5  # integer split so it can be used as a slice index
    im_train = np.concatenate((np.expand_dims(im_data[:num_train], 1),
                               np.expand_dims(value_data[:num_train], 1)), axis=1).astype(dtype=np.float32)
    state_train = state_data[:num_train]
    label_train = label_data[:num_train]
    im_test = np.concatenate((np.expand_dims(im_data[num_train:], 1),
                              np.expand_dims(value_data[num_train:], 1)), axis=1).astype(dtype=np.float32)
    state_test = state_data[num_train:]
    label_test = label_data[num_train:]
    return (im_train, state_train, label_train), (im_test, state_test, label_test)
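
# Side note (illustrative, not from the source): np.eye(1, 8, l)[0] used above builds a
# length-8 one-hot vector with the 1 at column l; it is equivalent to np.eye(8)[l].
import numpy as np

l = 3
assert np.array_equal(np.eye(1, 8, l)[0], np.eye(8)[l])
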
def load_previous_runs(self):
    if not os.path.isfile(self.prev_runs_file_):
        self.found_matching_run = False
        return
    with open(self.prev_runs_file_, 'r') as file:
        self.all_prev_runs_ = yaml.load(file)
    for run in self.all_prev_runs_:
        self.max_data_version_ = max(self.max_data_version_, run['input_data_version'])
        if run['input_data_hash'] == self.data_.df_hash:
            self.max_config_version_ = max(self.max_config_version_, run['config_version'])
            if run['config_hash'] == self.config_hash_:
                if self.found_matching_run:
                    raise Exception('Duplicate previous run entries found.')
                self.found_matching_run = True
                self.run_ = run
                self.data_folder_ = self.get_versioned_folder(run['input_data_version'],
                                                              run['config_version'])
    if self.found_matching_run is None:
        self.found_matching_run = False
def load(self, obj):
    """Load model from file."""
    self.__dict__.update(joblib.load(obj).__dict__)
def load_grid_selection(path):
    """Load a selected grid from pickle."""
    if not os.path.isfile(path):
        raise ValueError("The file does not exist.")
    with open(path, "rb") as f:
        data = pickle.load(f)
    return data['environment'], data['gt'], data['po'], data['goal']
def process_map_data(path, return_full=False):
    data = joblib.load(path)
    im_data = data['im']
    value_data = data['value']
    state_data = data['state']
    if return_full:
        im_full = np.concatenate((np.expand_dims(im_data, 1),
                                  np.expand_dims(value_data, 1)),
                                 axis=1).astype(dtype=np.uint8)
        return im_full, state_data, data['label'], data['sample_idx']
    label_data = np.array([np.eye(1, 8, l)[0] for l in data['label']])
    num = im_data.shape[0]
    num_train = num - num // 5  # integer split so it can be used as a slice index
    im_train = np.concatenate((np.expand_dims(im_data[:num_train], 1),
                               np.expand_dims(value_data[:num_train], 1)),
                              axis=1).astype(dtype=np.float32)
    state_train = state_data[:num_train]
    label_train = label_data[:num_train]
    im_test = np.concatenate((np.expand_dims(im_data[num_train:], 1),
                              np.expand_dims(value_data[num_train:], 1)),
                             axis=1).astype(dtype=np.float32)
    state_test = state_data[num_train:]
    label_test = label_data[num_train:]
    return (im_train, state_train, label_train), \
           (im_test, state_test, label_test), data['sample_idx']
def create_grid_8_dataset(mat_file_path, db_name, save_dir):
    """Convert grid 8x8 dataset from mat file to hdf5 format.

    Parameters
    ----------
    mat_file_path : str
        the path to the mat file
    db_name : str
        the name of the dataset
    save_dir : str
        the directory of the output path (must exist)
    """
    # load matlab data
    mat_data = load_mat_data(mat_file_path)
    print ("[MESSAGE] The Matlab data is loaded.")
    # init HDF5 database
    db = init_h5_db(db_name, save_dir)
    # save dataset
    for key in data_dict:
        add_h5_ds(mat_data[key], key, db)
        print ("[MESSAGE] Saved %s" % (key))
    db.flush()
    db.close()
    print ("[MESSAGE] The grid 8x8 dataset is saved at %s" % (save_dir))
def load(file_name):
    if file_name.endswith('.json'):
        with open(file_name, 'r') as f:
            return jsonpickle.loads(f.read())
    if file_name.endswith('.npy'):
        return np.load(file_name)
    return joblib.load(file_name)
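
# Hedged counterpart sketch for the extension-based loader above; the save() name is
# illustrative, chosen so the two functions round-trip through the same formats.
import jsonpickle
import numpy as np
import joblib

def save(obj, file_name):
    if file_name.endswith('.json'):
        with open(file_name, 'w') as f:
            f.write(jsonpickle.encode(obj))
    elif file_name.endswith('.npy'):
        np.save(file_name, obj)
    else:
        joblib.dump(obj, file_name)
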
def _load_experiments(self, data_folder, name_or_patterns):
    if not isinstance(name_or_patterns, (list, tuple)):
        name_or_patterns = [name_or_patterns]
    files = []
    for name_or_pattern in name_or_patterns:
        matched_files = glob(
            osp.join(data_folder, name_or_pattern))  # glob gives a list of all files satisfying the pattern
        files += matched_files  # this will include the same file twice if it satisfies 2 patterns
    experiments = []
    progress_f = None
    params_f = None
    pkl_data = None
    for f in files:
        if os.path.isdir(f):
            try:
                progress = self._read_data(osp.join(f, "progress.csv"))
                params = self._read_params(osp.join(f, "params.json"))
                params["exp_name"] = osp.basename(f)
                if os.path.isfile(osp.join(f, "params.pkl")):
                    pkl_data = joblib.load(osp.join(f, "params.pkl"))
                    experiments.append(Experiment(progress, params, pkl_data))
                else:
                    experiments.append(Experiment(progress, params))
            except Exception as e:
                print(e)
        elif 'progress.csv' in f:  # in case data_folder itself is the dir that contains the files!
            progress_f = self._read_data(f)
        elif 'params.json' in f:
            params_f = self._read_params(f)
        elif 'params.pkl' in f:
            print('about to load', f)
            pkl_data = joblib.load(f)
    if params_f and progress_f:
        if pkl_data:
            experiments.append(Experiment(progress_f, params_f, pkl_data))
        else:
            experiments.append(Experiment(progress_f, params_f))
    self._experiments = experiments
def symbolize_signal(self, signal, parallel=None, n_jobs=-1):
    """
    Symbolize a whole time-series signal into a sentence (a vector of words);
    parallel can be {None, "ipython", "joblib"}
    """
    window_index = self.sliding_window_index(len(signal))
    if parallel == None:
        return map(lambda wi: self.symbolize_window(signal[wi]), window_index)
    elif parallel == "ipython":
        ## too slow
        raise NotImplementedError("parallel parameter %s not supported" % parallel)
        #return self.iparallel_symbolize_signal(signal)
    elif parallel == "joblib":
        with tempfile.NamedTemporaryFile(delete=False) as f:
            tf = f.name
            print "save temp file at %s" % tf
        tfiles = joblib.dump(signal, tf)
        xs = joblib.load(tf, "r")
        n_jobs = joblib.cpu_count() if n_jobs == -1 else n_jobs
        window_index = list(window_index)
        batch_size = len(window_index) / n_jobs
        batches = chunk(window_index, batch_size)
        symbols = Parallel(n_jobs)(delayed(joblib_symbolize_window)(self, xs, batch) for batch in batches)
        for f in tfiles:
            os.unlink(f)
        return sum(symbols, [])
    else:
        raise NotImplementedError("parallel parameter %s not supported" % parallel)
def signal_to_paa_vector(self, signal, n_jobs=-1):
    window_index = self.sliding_window_index(len(signal))
    with tempfile.NamedTemporaryFile(delete=False) as f:
        tf = f.name
        print "save temp file at %s" % tf
    tfiles = joblib.dump(signal, tf)
    xs = joblib.load(tf, "r")
    n_jobs = joblib.cpu_count() if n_jobs == -1 else n_jobs
    window_index = list(window_index)
    batch_size = len(window_index) / n_jobs
    batches = chunk(window_index, batch_size)
    vecs = Parallel(n_jobs)(delayed(joblib_paa_window)(self, xs, batch) for batch in batches)
    for f in tfiles:
        os.unlink(f)
    return np.vstack(vecs)
def save_single(self):
    assert(self.V is not None)
    assert(self._ref is not None)
    # Set the size explicitly as a sanity check
    size_n, dim_V = self.V.shape
    config_score = simple_config.load()["score"]
    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"]
    )
    h5 = touch_h5(f_db)
    g = h5.require_group(self.method)
    gx = g.require_group(self.current_filename)
    # Save the data array
    msg = "Saving {} {} ({})"
    print(msg.format(self.method, self.current_filename, size_n))
    for col in ["V", "_ref", "VX",
                "VX_explained_variance_ratio_",
                "VX_components_"]:
        if col in gx:
            # print " Clearing", self.method, self.current_filename, col
            del gx[col]
    gx.create_dataset("V", data=self.V, **self.h5py_args)
    gx.create_dataset("_ref", data=self._ref, **self.h5py_args)
def compute_reduced_representation(self):
    if not self.compute_reduced:
        return None
    config_score = simple_config.load()["score"]
    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"]
    )
    h5 = touch_h5(f_db)
    g = h5[self.method]
    keys = g.keys()
    V = np.vstack([g[x]["V"][:] for x in keys])
    sizes = [g[x]["_ref"].shape[0] for x in keys]
    nc = self.reduced_n_components
    clf = IncrementalPCA(n_components=nc)
    msg = "Performing PCA on {}, ({})->({})"
    print(msg.format(self.method, V.shape[1], nc))
    VX = clf.fit_transform(V)
    EVR = clf.explained_variance_ratio_
    COMPONENTS = clf.components_
    for key, size in zip(keys, sizes):
        # Take slices equal to the size
        vx, VX = VX[:size, :], VX[size:, :]
        evr, EVR = EVR[:size], EVR[size:]
        com, COMPONENTS = COMPONENTS[:size, :], COMPONENTS[size:, :]
        g[key].create_dataset("VX", data=vx, **self.h5py_args)
        g[key].create_dataset("VX_explained_variance_ratio_", data=evr)
        g[key].create_dataset("VX_components_", data=com)
    h5.close()