def symbolize_signal(self, signal, parallel=None, n_jobs=-1):
    """
    Symbolize a whole time-series signal into a sentence (vector of words).
    parallel can be one of {None, "joblib"}; "ipython" is not supported.
    """
    window_index = self.sliding_window_index(len(signal))
    if parallel is None:
        return [self.symbolize_window(signal[wi]) for wi in window_index]
    elif parallel == "ipython":
        ## too slow
        raise NotImplementedError("parallel parameter %s not supported" % parallel)
        #return self.iparallel_symbolize_signal(signal)
    elif parallel == "joblib":
        with tempfile.NamedTemporaryFile(delete=False) as f:
            tf = f.name
        print("save temp file at %s" % tf)
        tfiles = joblib.dump(signal, tf)
        xs = joblib.load(tf, "r")  # memory-map the dumped signal for the workers
        n_jobs = joblib.cpu_count() if n_jobs == -1 else n_jobs
        window_index = list(window_index)
        batch_size = len(window_index) // n_jobs
        batches = chunk(window_index, batch_size)
        symbols = Parallel(n_jobs)(delayed(joblib_symbolize_window)(self, xs, batch) for batch in batches)
        for f in tfiles:
            os.unlink(f)
        return sum(symbols, [])
    else:
        raise NotImplementedError("parallel parameter %s not supported" % parallel)
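# `chunk` and `joblib_symbolize_window` are referenced above but not shown in
# this snippet. The following is a minimal, hypothetical sketch of what they
# could look like; the original project's implementations may differ.
def chunk(seq, size):
    # split a sequence into consecutive batches of at most `size` elements
    return [seq[i:i + size] for i in range(0, len(seq), size)]

def joblib_symbolize_window(sax, xs, batch):
    # symbolize one batch of window indices against the memory-mapped signal
    return [sax.symbolize_window(xs[wi]) for wi in batch]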
def signal_to_paa_vector(self, signal, n_jobs=-1):
    window_index = self.sliding_window_index(len(signal))
    with tempfile.NamedTemporaryFile(delete=False) as f:
        tf = f.name
    print("save temp file at %s" % tf)
    tfiles = joblib.dump(signal, tf)
    xs = joblib.load(tf, "r")  # memory-map the dumped signal for the workers
    n_jobs = joblib.cpu_count() if n_jobs == -1 else n_jobs
    window_index = list(window_index)
    batch_size = len(window_index) // n_jobs
    batches = chunk(window_index, batch_size)
    vecs = Parallel(n_jobs)(delayed(joblib_paa_window)(self, xs, batch) for batch in batches)
    for f in tfiles:
        os.unlink(f)
    return np.vstack(vecs)
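# `joblib_paa_window` is likewise referenced but not defined here; a hedged
# sketch, assuming the class exposes a per-window PAA method (the method name
# below is an assumption):
def joblib_paa_window(sax, xs, batch):
    return [sax.paa_window(xs[wi]) for wi in batch]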
def save_itr_params(itr, params):
global _logger_info
if _snapshot_dir:
if _snapshot_mode == 'all':
file_name = osp.join(_snapshot_dir, 'itr_%d.pkl' % itr)
joblib.dump(params, file_name, compress=3)
elif _snapshot_mode == 'last':
            # overwrite previous params
file_name = osp.join(_snapshot_dir, 'params.pkl')
joblib.dump(params, file_name, compress=3)
elif _snapshot_mode == 'last_best':
# saves best and last params
last_file_name = osp.join(_snapshot_dir, 'params.pkl')
joblib.dump(params, last_file_name, compress=3)
_logger_info["lastReward"] = get_last_tabular("AverageReturn")
_logger_info["lastItr"] = get_last_tabular("Iteration")
if "bestReward" not in _logger_info or _logger_info["bestReward"] < _logger_info["lastReward"]:
best_file_name = osp.join(_snapshot_dir, 'params_best.pkl')
shutil.copy(last_file_name, best_file_name)
_logger_info["bestReward"] = _logger_info["lastReward"]
_logger_info["bestItr"] = _logger_info["lastItr"]
elif _snapshot_mode == 'last_all_best':
# saves last and all best params
last_file_name = osp.join(_snapshot_dir, 'params.pkl')
joblib.dump(params, last_file_name, compress=3)
_logger_info["lastReward"] = get_last_tabular("AverageReturn")
_logger_info["lastItr"] = get_last_tabular("Iteration")
if "bestReward" not in _logger_info or _logger_info["bestReward"] < _logger_info["lastReward"]:
best_file_name = osp.join(_snapshot_dir, 'params_best_%08d.pkl' % itr)
shutil.copy(last_file_name, best_file_name)
_logger_info["bestReward"] = _logger_info["lastReward"]
_logger_info["bestItr"] = _logger_info["lastItr"]
elif _snapshot_mode == "gap":
if itr % _snapshot_gap == 0:
file_name = osp.join(_snapshot_dir, 'itr_%d.pkl' % itr)
joblib.dump(params, file_name, compress=3)
elif _snapshot_mode == 'none':
pass
else:
raise NotImplementedError
def log_variant(log_file, variant_data):
mkdir_p(os.path.dirname(log_file))
if hasattr(variant_data, "dump"):
variant_data = variant_data.dump()
variant_json = stub_to_json(variant_data)
with open(log_file, "w") as f:
json.dump(variant_json, f, indent=2, sort_keys=True, cls=MyEncoder)
def _split_and_dump(self, X, y, valid_X, valid_y):
if not hasattr(self, '_dm'):
raise ValueError("It should be called after the dumpmanager _dm is set")
if self.resampling == 'cv':
pass
elif self.resampling == 'holdout':
if not self._has_valid_data:
data_size = y.shape[0]
if data_size >= 100000:
valid_ratio = 0.3
elif 15000 <= data_size < 100000:
valid_ratio = 0.2
else:
valid_ratio = 0.15
valid_size = int(data_size * valid_ratio)
X, valid_X = X[valid_size:], X[:valid_size]
y, valid_y = y[valid_size:], y[:valid_size]
else:
raise NotImplementedError()
pkl = {"resampling": self.resampling,
"X": X, "y": y,
"valid_X": valid_X, "valid_y": valid_y}
datafile = os.path.join(self._dm.dir, "data.pkl")
joblib.dump(pkl, datafile, protocol=-1)
self._datafile = datafile
return datafile
def dump_object(data):
    # serialize an arbitrary object to an in-memory bytes payload via joblib
    s = BytesIO()
    joblib.dump(data, s)
    return s.getvalue()
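# A minimal round-trip sketch for dump_object: joblib.load also accepts a
# file-like object, so the returned bytes can be deserialized in memory.
# `load_object` is a hypothetical counterpart (assumes the same BytesIO and
# joblib imports as dump_object above), not part of the original code.
def load_object(payload):
    return joblib.load(BytesIO(payload))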
def process_study(study_id, annotations, out_dir, nstack):
volumes_metadata = isotropic_volumes_metadata[study_id]
isometric_volume = np.load('../data_proc/stage1/isotropic_volumes_1mm/{}.npy'.format(study_id))
mean = np.mean(isometric_volume).astype(np.float32)
std = np.std(isometric_volume).astype(np.float32)
resize_factor = np.divide(volumes_metadata['volume_resampled_shape'], volumes_metadata['volume_shape'])
coords_list = []
for a in annotations:
d = a['data']
z = int(round(resize_factor[0] * a['sliceNum']))
y0 = resize_factor[1] * d['y']
y1 = resize_factor[1] * (d['y'] + d['height'])
x0 = resize_factor[2] * d['x']
x1 = resize_factor[2] * (d['x'] + d['width'])
coords_list.append((z, y0, y1, x0, x1))
samples = []
for coords in coords_list:
z, y0, y1, x0, x1 = coords
for i in range(40):
sample_id = uuid4()
rand_y0 = max(0, int(round(y0 - random.randint(0, 32))))
rand_y1 = min(isometric_volume.shape[1], int(round(y1 + random.randint(0, 32))))
rand_x0 = max(0, int(round(x0 - random.randint(0, 32))))
rand_x1 = min(isometric_volume.shape[2], int(round(x1 + random.randint(0, 32))))
patch = []
for zi in range(nstack):
patch.append(resize(isometric_volume[z+zi, rand_y0:rand_y1, rand_x0:rand_x1], [32, 32],
mode='edge', clip=True, preserve_range=True))
patch = np.array(patch, dtype=np.float32)
patch = (patch - mean) / (std + 1e-7)
patch = np.moveaxis(patch, 0, 2)
bb_x = (x0 - rand_x0) / (rand_x1 - rand_x0)
bb_y = (y0 - rand_y0) / (rand_y1 - rand_y0)
bb_w = (x1 - x0) / (rand_x1 - rand_x0)
bb_h = (y1 - y0) / (rand_y1 - rand_y0)
samples.append((patch, bb_x, bb_y, bb_w, bb_h))
joblib.dump(samples, os.path.join(out_dir, 'samples', '{}.pkl'.format(study_id)))
return len(samples)
def agg(file_name,store_file):
datas = joblib.load(file_name)
new_datas = []
for data in datas:
new_datas.append(data)
new_datas.append({"input":np.flip(data["input"],axis=2),"label":data["label"]})
new_datas.append({"input":np.flip(data["input"],axis=3),"label":data["label"]})
#new_datas.append({"input":np.rot90(m=data["input"],k=1,axes=(2,3)),"label":data["label"]})
#new_datas.append({"input":np.rot90(m=data["input"],k=2,axes=(2,3)),"label":data["label"]})
#new_datas.append({"input":np.rot90(m=data["input"],k=3,axes=(2,3)),"label":data["label"]})
joblib.dump(value=new_datas,filename=store_file,compress=3)
def slice_data(filename):
data = joblib.load(filename=filename)
for idx, i in enumerate(data):
data[idx]["input"] = np.delete(data[idx]["input"],[3],axis=1)
data[idx]["input"] = data[idx]["input"][:,:,46:55,46:55]
name, suf = os.path.splitext(filename)
outputfilename = name + "del_height_no.4_slice_7x7.pkl"
joblib.dump(value=data, filename=outputfilename)
def main():
# Get the data.
trains = joblib.load("../data/CIKM2017_train/train_Imp_3x3.pkl")
#testa_set = joblib.load("../data/CIKM2017_testA/testA_Imp_3x3_del_height_no.4.pkl")
#testa_x = []
#for item in testa_set:
# testa_x.append(item["input"])
#testa_x = np.asarray(testa_x, dtype=np.int16).transpose((0,1,3,4,2))
train_x, train_y, train_class = sample(trains)
'''
for i in range(10):
np.random.shuffle(data_set)
valid_data_num = int(len(data_set) / 10) #get 10% data for validation
for i in range(10):
valid_set = data_set[i * valid_data_num : (i+1) * valid_data_num ]
train_set = data_set[0: i*valid_data_num]
train_set.extend(data_set[(i+1)*valid_data_num:])
train_out, train_mean, train_std = preprocessing(train_set, 0, 0, True )
valid_out = preprocessing(valid_set, train_mean, train_std)
testa_out = preprocessing(testa_set, train_mean, train_std)
convert_to(train_out, "train_Imp_3x3_resample_normalization_"+str(i)+"_fold", is_test=False)
convert_to(valid_out, "valid_Imp_3x3_resample_normalization_"+str(i)+"_fold", is_test=False)
convert_to(testa_out, "testA_Imp_3x3_normalization_"+str(i)+"_fold", is_test=True)
#joblib.dump(value=data_set, filename="../data/CIKM2017_train/train_Imp_3x3_classified_del_height_no.4.pkl",compress=3)
'''
h5fname = "../data/CIKM2017_train/train_Imp_3x3.h5"
    import h5py
    # write the HDF5 file
    with h5py.File(h5fname, "w") as f:
#f.create_dataset(name="testa_set_x", shape=testa_x.shape, data=testa_x, dtype=testa_x.dtype, compression="lzf", chunks=True)
f.create_dataset(name="train_set_x", shape=train_x.shape, data=train_x, dtype=train_x.dtype, compression="lzf", chunks=True)
f.create_dataset(name="train_set_y", shape=train_y.shape, data=train_y, dtype=train_y.dtype, compression="lzf", chunks=True)
f.create_dataset(name="train_set_class", shape=train_class.shape, data=train_class, dtype=train_class.dtype, compression="lzf", chunks=True)
return
def test_pickle(self):
joblib.dump(CachedIterable(self.iterator(), 3), 'output')
self.assertListEqual(list(joblib.load('output')), list(range(20)))
def main():
os.chdir('data/google')
model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, norm_only=False)
for v in model.vocab.values():
v.sample_int = 0
ts = list(model.vocab.items())
ts.sort(key=lambda t: t[1].index)
model.vocab = OrderedDict(ts)
joblib.dump(model, 'GoogleNews-vectors-negative300.pickle')
def dump_caffemodel_weights():
net = caffe.Net(args.prototxt_path, args.caffemodel_path, caffe.TEST)
weights = {}
n_layers = len(net.layers)
for i in range(n_layers):
layer_name = net._layer_names[i]
layer = net.layers[i]
layer_blobs = [o.data for o in layer.blobs]
weights[layer_name] = layer_blobs
joblib.dump(weights, args.caffe_weights_path)
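# Hedged companion sketch: the weights dumped above can be reloaded with
# joblib and inspected layer by layer. `load_caffemodel_weights` is
# hypothetical; `args.caffe_weights_path` is the same assumption as above.
def load_caffemodel_weights():
    weights = joblib.load(args.caffe_weights_path)
    for layer_name, layer_blobs in weights.items():
        print(layer_name, [blob.shape for blob in layer_blobs])
    return weights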
def save(self, filepath):
    joblib.dump(self, filepath, 3)  # third positional argument is joblib's compress level
def save(self, dumpfile='soinn.dump'):
import joblib
joblib.dump(self, dumpfile, compress=True, protocol=0)
def dump(path, content, method=None, py_prefix='', py_suffix='', text_mode='w'):
if method is None:
method = _infer_method(path)
assert_instance(method, IOMethod)
path_origin = path
if method != IOMethod.TEXT or text_mode == 'w':
path += '.tmp'
if method == IOMethod.PICKLE:
with open(path, 'wb') as f:
pickle.dump(content, f, protocol=pickle.HIGHEST_PROTOCOL)
elif method == IOMethod.PICKLE_GZ:
with gzip.open(path, 'wb') as f:
pickle.dump(content, f, protocol=pickle.HIGHEST_PROTOCOL)
elif method == IOMethod.NUMPY:
joblib.dump(content, path)
elif method == IOMethod.NUMPY_RAW:
with open(path, 'wb') as f:
content.dump(f)
elif method == IOMethod.TEXT:
with open(path, text_mode) as f:
if type(content) in (list, tuple):
f.writelines(content)
else:
f.write(str(content))
elif method == IOMethod.BINARY:
with open(path, 'wb') as f:
f.write(content)
else:
        raise ValueError('Unsupported dumping method: {}'.format(method))
if method != IOMethod.TEXT or text_mode == 'w':
os.rename(path, path_origin)
return path_origin
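# `IOMethod` and `_infer_method` are assumed to live elsewhere in the project;
# below is a minimal sketch consistent with the members referenced in dump()
# above. The actual definitions may differ.
from enum import Enum

class IOMethod(Enum):
    PICKLE = 'pickle'
    PICKLE_GZ = 'pickle_gz'
    NUMPY = 'numpy'
    NUMPY_RAW = 'numpy_raw'
    TEXT = 'text'
    BINARY = 'binary'

def _infer_method(path):
    # hypothetical extension-based dispatch; adjust to the project's conventions
    if path.endswith('.pkl.gz'):
        return IOMethod.PICKLE_GZ
    if path.endswith('.pkl'):
        return IOMethod.PICKLE
    if path.endswith('.txt') or path.endswith('.json'):
        return IOMethod.TEXT
    return IOMethod.BINARY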
def save(self, path):
"""
Use joblib to pickle the object.
Arguments:
path: an open file object or string holding the path to where the
object should be saved
"""
jl.dump(self, path)
def txt2jd(f):
"""
Dump the np.ndarray using joblib.dump for fast access.
"""
data = []
for line in open(f):
line = line.split("\n")[0].split("\t")
        data.append([int(x) for x in line])
data = np.array(data)
joblib.dump(data, f.replace(".txt", ".jd"))
os.remove(f)
return f.replace(".txt", ".jd")
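# Illustrative use of txt2jd (the file name here is made up; the .txt file is
# expected to hold tab-separated integers, one row per line):
#   jd_path = txt2jd("matrix.txt")   # writes matrix.jd and removes matrix.txt
#   data = joblib.load(jd_path)      # fast reload of the ndarray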