def _load_data(self):
"""
Load the data only if it has not been checkpointed; otherwise, just load the checkpointed data
:return: None
"""
self.mapper = Mapper()
self.mapper.generate_vocabulary(self.review_summary_file)
self.X_fwd, self.X_bwd, self.Y = self.mapper.get_tensor(reverseflag=True)
# Store all the mapper values in a dict for later recovery
self.mapper_dict = dict()
self.mapper_dict['seq_length'] = self.mapper.get_seq_length()
self.mapper_dict['vocab_size'] = self.mapper.get_vocabulary_size()
self.mapper_dict['rev_map'] = self.mapper.get_reverse_map()
# Split into test and train data
self._split_train_tst()
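# The docstring above mentions checkpointing, but the snippet only builds the
# mapper state. A minimal sketch of how mapper_dict could be pickled and
# restored; the file name and the two helper functions are illustrative, not
# part of the original class.
import pickle

def save_mapper_dict(mapper_dict, path='mapper_dict.pkl'):
    # Persist seq_length, vocab_size and rev_map for later recovery
    with open(path, 'wb') as f:
        pickle.dump(mapper_dict, f)

def load_mapper_dict(path='mapper_dict.pkl'):
    # Recover the previously checkpointed mapper values
    with open(path, 'rb') as f:
        return pickle.load(f)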
def _load_data(self):
"""
Load the data only if it has not been checkpointed; otherwise, just load the checkpointed data
:return: None
"""
self.mapper = Mapper()
self.mapper.generate_vocabulary(self.review_summary_file)
self.X, self.Y = self.mapper.get_tensor()
# Store all the mapper values in a dict for later recovery
self.mapper_dict = dict()
self.mapper_dict['seq_length'] = self.mapper.get_seq_length()
self.mapper_dict['vocab_size'] = self.mapper.get_vocabulary_size()
self.mapper_dict['rev_map'] = self.mapper.get_reverse_map()
# Split into test and train data
self._split_train_tst()
def load_from_corpus(cls, reader, remake=False, src_or_tgt="src"):
vocab_fname = reader.fname+".vocab-"+reader.mode+"-"+src_or_tgt
if not remake and os.path.isfile(vocab_fname):
return Vocab.load(vocab_fname)
else:
v = Vocab()
count = 0 # count of sentences
for item in reader:
toklist = item
for token in toklist:
v.add(token)
count += 1
if count % 10000 == 0:
print("...", count, end="")
print("\nSaving " + src_or_tgt + " vocab of size", v.size)
v.START_TOK = v[reader.begin] if reader.begin is not None else None
v.END_TOK = v[reader.end] if reader.end is not None else None
v.save(vocab_fname)
return v
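# A hedged usage sketch for load_from_corpus: it assumes a reader that is
# iterable over token lists and exposes fname, mode, begin and end attributes,
# as the method relies on. CorpusReader and its arguments are hypothetical.
reader = CorpusReader("train.en", mode="tok", begin="<s>", end="</s>")

# The first call builds and saves train.en.vocab-tok-src; later calls reload
# that file unless remake=True is passed.
src_vocab = Vocab.load_from_corpus(reader, src_or_tgt="src")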
#### reader class
def _load_builder(path):
# lifted straight from /usr/bin/swift-ring-builder
from swift.common.ring import RingBuilder
try:
builder = pickle.load(open(path, 'rb'))
if not hasattr(builder, 'devs'):
builder_dict = builder
builder = RingBuilder(1, 1, 1)
builder.copy_from(builder_dict)
except ImportError: # Happens with really old builder pickles
builder = RingBuilder(1, 1, 1)
builder.copy_from(pickle.load(open(path, 'rb')))
for dev in builder.devs:
if dev and 'meta' not in dev:
dev['meta'] = ''
return builder
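# A brief usage sketch for _load_builder; the builder path is illustrative.
builder = _load_builder('/etc/swift/object.builder')
for dev in builder.devs:
    if dev:
        print(dev.get('id'), dev.get('weight'), dev.get('meta'))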
def _load_accumulators(self, main_loop):
"""Nasty method, use carefully"""
for cg_name, model in main_loop.models.iteritems():
source = numpy.load(self.path_to_accumulators.format(cg_name))
accums_dict = {name.replace("-", "/"): value
for name, value in source.items()}
source.close()
algo = main_loop.algorithm.algorithms[cg_name]
model_params = model.get_params()
steps = algo.steps.items()
for pidx in xrange(len(steps)):
# Get parameter name and its accumulators
p = steps[pidx][0]
name = [k for k, v in model_params.iteritems() if v == p][0]
accums = accums_dict[name]
# This is num_accums_per_param
col = len(accums)
for aidx in xrange(col):
algo.step_rule_updates[pidx*col+aidx][0].set_value(
accums[aidx])
def _load_accumulators(self, main_loop):
"""Load accumulators with some checks."""
for cg_name, model in main_loop.models.iteritems():
# Load accumulators
accum_filename = self.path_to_accumulators.format(cg_name)
if not os.path.isfile(accum_filename):
logger.error(" Accumulators file does not exist [{}]"
.format(accum_filename))
continue
source = numpy.load(accum_filename)
accums_to_load = {k: v for k, v in source.items()}
source.close()
algo = main_loop.algorithm.algorithms[cg_name]
accums = algo.step_rule_updates
# Set accumulators
for acc in accums:
try:
acc.set_value(accums_to_load[acc.name])
except Exception:
logger.error(" Could not load {}".format(acc.name))
def get_dev_streams(config):
"""Setup development set stream if necessary."""
dev_streams = {}
for cg in config['cgs']:
if 'val_sets' in config and cg in config['val_sets']:
logger.info('Building development stream for cg:[{}]'.format(cg))
eid = p_(cg)[0]
dev_file = config['val_sets'][cg]
# Get dictionary and fix EOS
dictionary = cPickle.load(open(config['src_vocabs'][eid]))
dictionary['<S>'] = 0
dictionary['<UNK>'] = config['unk_id']
dictionary['</S>'] = config['src_eos_idxs'][eid]
# Get as a text file and convert it into a stream
dev_dataset = TextFile([dev_file], dictionary, None)
dev_streams[cg] = DataStream(dev_dataset)
return dev_streams
def save_training_info(values, path):
"""
Gets a set of values as a dictionary and appends them to a log file
stored in <path>/train_log.pkl
"""
file_name = os.path.join(path, __train_log_file_name)
try:
with open(file_name, "rb") as f:
log = pickle.load(f)
except IOError: # first time
log = {}
for k in values.keys():
log[k] = []
for k, v in values.items():
log[k].append(v)
with open(file_name, "wb") as f:
pickle.dump(log, f)
def plot_traing_info(x, ylist, path):
"""
Loads the log file and plots x and y values as provided by the input.
Saves as <path>/train_log.png
"""
file_name = os.path.join(path, __train_log_file_name)
try:
with open(file_name, "rb") as f:
log = pickle.load(f)
except IOError: # first time
warnings.warn("There is no {} file here!!!".format(file_name))
return
plt.figure()
x_vals = log[x]
for y in ylist:
y_vals = log[y]
if len(y_vals) != len(x_vals):
warnings.warn("One of y's: {} does not have the same length as x: {}".format(y, x))
plt.plot(x_vals, y_vals, label=y)
# assert len(y_vals) == len(x_vals), "not the same len"
plt.xlabel(x)
plt.legend()
#plt.show()
plt.savefig(file_name[:-3]+'png', bbox_inches='tight')
plt.close('all')
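# A short usage sketch for the two helpers above, assuming the module-level
# __train_log_file_name is defined as in the original file. The log directory
# and metric keys are illustrative; save_training_info appends one value per
# key and plot_traing_info plots them against the chosen x key.
import os

log_dir = "./logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
for epoch in range(10):
    save_training_info({"epoch": epoch, "loss": 1.0 / (epoch + 1)}, log_dir)
plot_traing_info("epoch", ["loss"], log_dir)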
def from_snapshot(self, sess, sfile, nfile):
print('Restoring model snapshots from {:s}'.format(sfile))
self.saver.restore(sess, sfile)
print('Restored.')
# Also needs to restore the other hyper-parameters/states for training.
# (TODO xinlei) I have tried my best to restore the random states so that
# training can be recovered exactly; however, the TensorFlow random state
# is currently not available.
with open(nfile, 'rb') as fid:
st0 = pickle.load(fid)
cur = pickle.load(fid)
perm = pickle.load(fid)
cur_val = pickle.load(fid)
perm_val = pickle.load(fid)
last_snapshot_iter = pickle.load(fid)
np.random.set_state(st0)
self.data_layer._cur = cur
self.data_layer._perm = perm
self.data_layer_val._cur = cur_val
self.data_layer_val._perm = perm_val
return last_snapshot_iter
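# For from_snapshot to work, the companion snapshot writer must pickle the
# same objects in the same order the reader loads them. A minimal sketch of
# that writing side; the method name and attributes mirror the reader, and
# anything not shown in the original is an assumption.
def snapshot_state(self, nfile, last_snapshot_iter):
    with open(nfile, 'wb') as fid:
        # Order must match the pickle.load() calls in from_snapshot
        pickle.dump(np.random.get_state(), fid, pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.data_layer._cur, fid, pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.data_layer._perm, fid, pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.data_layer_val._cur, fid, pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.data_layer_val._perm, fid, pickle.HIGHEST_PROTOCOL)
        pickle.dump(last_snapshot_iter, fid, pickle.HIGHEST_PROTOCOL)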
def build_data_dict(self, layer_features, k = 5):
with open(self.pkl_dir + self.data_file_name, 'rb') as data_file:
data = cPickle.load(data_file)
with open(self.pkl_dir + self.feature_file_name, 'rb') as feature_file:
features = cPickle.load(feature_file)
data_dict = {}
for d,f in zip(data, features):
pid = d['id']
data_dict[pid] = {'label':d['label'], 'spacing':d['spacing']}
# add the features
for i in range(k):
data_dict[pid]['loc_{}'.format(i)] = f['loc_{}'.format(i)]
data_dict[pid]['p_{}'.format(i)] = f['p_{}'.format(i)]
for layer in layer_features:
data_dict[pid]['out_{}_{}'.format(i, layer)] = f['out_{}_{}'.format(i, layer)]
return data_dict
def build_data_dict(self, layer_features, k=5):
"""
Builds dict[id] = {label, spacing, 1={loc, p, layer1_feature, layer2_feature, ...}, 2={...}, ...}
:param layer_features: layers to take features from, e.g. 67, 77
:param k: number of nodules considered as inputs
:return: a combined dictionary
"""
with open(self.pkl_dir + self.data_file_name, 'rb') as data_file:
data = cPickle.load(data_file)
with open(self.pkl_dir + self.feature_file_name, 'rb') as feature_file:
features = cPickle.load(feature_file)
data_dict = {}
for d,f in zip(data, features):
pid = d['id']
data_dict[pid] = {'label':d['label'], 'spacing':d['spacing']}
# add the features
for i in range(k):
data_dict[pid][i] = {'loc': f['loc_{}'.format(i)], 'p': f['p_{}'.format(i)]}
for layer in layer_features:
data_dict[pid][i][layer] = f['out_{}_{}'.format(i, layer)]
return data_dict
def read_pklc(lcfile):
'''
This just reads a pickle.
'''
try:
with open(lcfile,'rb') as infd:
lcdict = pickle.load(infd)
except UnicodeDecodeError:
with open(lcfile,'rb') as infd:
lcdict = pickle.load(infd, encoding='latin1')
return lcdict
# these translate filter operators given as strings to Python operators
def register(self, name, serializer):
"""Register ``serializer`` object under ``name``.
Raises :class:`AttributeError` if ``serializer`` is invalid.
.. note::
``name`` will be used as the file extension of the saved files.
:param name: Name to register ``serializer`` under
:type name: ``unicode`` or ``str``
:param serializer: object with ``load()`` and ``dump()``
methods
"""
# Basic validation
getattr(serializer, 'load')
getattr(serializer, 'dump')
self._serializers[name] = serializer
def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} gt roidb loaded from {}'.format(self.name, cache_file)
return roidb
# gt_roidb = [self._load_pascal_annotation(index)
gt_roidb = [self._load_pascal_labels(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote gt roidb to {}'.format(cache_file)
return gt_roidb
def selective_search_IJCV_roidb(self):
"""
Return the database of selective search regions of interest.
Ground-truth ROIs are also included.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path,
'{:s}_selective_search_IJCV_top_{:d}_roidb.pkl'.
format(self.name, self.config['top_k']))
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} ss roidb loaded from {}'.format(self.name, cache_file)
return roidb
gt_roidb = self.gt_roidb()
ss_roidb = self._load_selective_search_IJCV_roidb(gt_roidb)
roidb = datasets.imdb.merge_roidbs(gt_roidb, ss_roidb)
with open(cache_file, 'wb') as fid:
cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote ss roidb to {}'.format(cache_file)
return roidb
def load(batch_size, test_batch_size, n_labelled=None):
filepath = '/tmp/mnist.pkl.gz'
url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
if not os.path.isfile(filepath):
print "Couldn't find MNIST dataset in /tmp, downloading..."
urllib.urlretrieve(url, filepath)
with gzip.open('/tmp/mnist.pkl.gz', 'rb') as f:
train_data, dev_data, test_data = pickle.load(f)
return (
mnist_generator(train_data, batch_size, n_labelled),
mnist_generator(dev_data, test_batch_size, n_labelled),
mnist_generator(test_data, test_batch_size, n_labelled)
)
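# A hedged usage sketch for load(): the batch sizes are illustrative, and the
# exact iteration protocol of the returned generators depends on
# mnist_generator, which is not shown here.
train_gen, dev_gen, test_gen = load(batch_size=128, test_batch_size=256)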
def parse_standard_tfmeta(paths):
meta_list = []
for path in paths:
if isstring(path):
if path.startswith('meta') and path.endswith('.pkl'):
mpaths = [path]
else:
assert os.path.isdir(path)
mpaths = filter(lambda x: x.startswith('meta') and x.endswith('.pkl'),
os.listdir(path))
mpaths = [os.path.join(path, mp) for mp in mpaths]
else:
# in this case, it's a list
assert isinstance(path, list)
mpaths = path
d = {}
for mpath in mpaths:
d.update(cPickle.load(open(mpath)))
meta_list.append(d)
return meta_list
def loadDepthMap(self,filename):
"""
Read a depth-map
:param filename: file name to load
:return: image data of depth image
"""
img = Image.open(filename)
# top 8 bits of depth are packed into green channel and lower 8 bits into blue
assert len(img.getbands()) == 3
r, g, b = img.split()
r = np.asarray(r, np.int32)
g = np.asarray(g, np.int32)
b = np.asarray(b, np.int32)
dpt = np.bitwise_or(np.left_shift(g, 8), b)
imgdata = np.asarray(dpt, np.float32)
return imgdata
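# The comment above describes the packing convention: the top 8 bits of the
# 16-bit depth value live in the green channel and the low 8 bits in blue.
# A minimal sketch of the inverse operation, packing a depth array back into
# an RGB image; the function name is illustrative.
import numpy as np
from PIL import Image

def packDepthMap(dpt):
    dpt = np.asarray(dpt, np.uint16)
    g = (dpt >> 8).astype(np.uint8)    # top 8 bits -> green
    b = (dpt & 0xFF).astype(np.uint8)  # low 8 bits -> blue
    r = np.zeros_like(g)               # red channel unused
    return Image.fromarray(np.dstack([r, g, b]), 'RGB')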
def read_pickle(self,filename):
try:
import cPickle as pickle
except ImportError:
import pickle
in_f = open(filename,"rb")
tabversion = pickle.load(in_f)
if tabversion != __tabversion__:
raise VersionError("yacc table file version is out of date")
self.lr_method = pickle.load(in_f)
signature = pickle.load(in_f)
self.lr_action = pickle.load(in_f)
self.lr_goto = pickle.load(in_f)
productions = pickle.load(in_f)
self.lr_productions = []
for p in productions:
self.lr_productions.append(MiniProduction(*p))
in_f.close()
return signature
# Bind all production function names to callable objects in pdict
def copy_images_for_classification():
ground_truth_dates = pickle.load(open(data_dir + 'ground_truth_dates.pickle', "rb"))
ground_truth_dates = sorted(ground_truth_dates, key=lambda x: x[3], reverse=False)
if not os.path.exists(classify_dir):
os.mkdir(classify_dir)
for seed_id, coin_id, result, labeled_date, bad_angle, bad_image in ground_truth_dates:
if labeled_date < 1900:
continue
dir = crop_dir + str(coin_id / 100) + '/'
new_dir = classify_dir + str(labeled_date) + '/'
if not os.path.exists(new_dir):
os.mkdir(new_dir)
for image_id in range(0,57):
filename = str(coin_id).zfill(5) + str(image_id).zfill(2) + '.png'
old_filename = dir + filename
new_filename = new_dir + filename
shutil.copyfile(old_filename,new_filename)
def get_single_lmdb_filedata(seed_id, max_value_cutoff):
seeds = pickle.load(open(data_dir + 'seed_data.pickle', "rb"))
filedata = []
values = seeds[seed_id]
# this is handy for large groups (heads,tails)
# values.sort(key=lambda x: x[0], reverse=True)
# best_results_by_angle_group = {}
# for max_value, angle, image_id in values:
# rounded_angle = int(round(angle / 5) * 5)
# if not rounded_angle in best_results_by_angle_group.keys():
# best_results_by_angle_group[rounded_angle] = [max_value, angle, image_id]
# else:
# if max_value > best_results_by_angle_group[rounded_angle][0]:
# best_results_by_angle_group[rounded_angle] = [max_value, angle, image_id]
# values = best_results_by_angle_group.values()
filedata.append([seed_id, crop_dir + str(seed_id) + '.png', 0])
for image_id, test_values in values.iteritems():
max_value, angle = test_values
if max_value > max_value_cutoff:
filedata.append([image_id, crop_dir + str(image_id) + '.png', angle])
return filedata
def create_new_indexes(total_new_seed_imgs, total_new_test_imgs):
seeds = pickle.load(open(data_dir + 'seed_data.pickle', "rb"))
seed_image_ids = []
test_image_ids = []
count = 0
for seed_image_id, values in seeds.iteritems():
values.sort(key=lambda x: x[0], reverse=False)
# seed_image_ids.append(values[0:total_new_seed_imgs][2])
# test_image_ids.append(values[total_new_seed_imgs:total_new_seed_imgs+total_new_test_imgs][2])
for max_value, angle, image_id in values:
count += 1
if count < total_new_seed_imgs:
seed_image_ids.append(image_id)
else:
if count < total_new_seed_imgs + total_new_test_imgs:
test_image_ids.append(image_id)
count = 0
pickle.dump(seed_image_ids, open(data_dir + 'seed_image_ids.pickle', "wb"))
pickle.dump(test_image_ids, open(data_dir + 'test_image_ids.pickle', "wb"))
def get_ground_truth_dates(total_coin_results):
#ground_truth_dates = pickle.load(open(data_dir + 'get_ground_truth_dates.pickle', "rb"))
ground_truth_date_dict = {}
for seed_id, values in total_coin_results.iteritems():
for coin_id, result in values.iteritems():
if coin_id not in ground_truth_date_dict:
ground_truth_date_dict[coin_id] = [seed_id, 0]
if result > ground_truth_date_dict[coin_id][1]:
ground_truth_date_dict[coin_id] = [seed_id, result]
#it bugs me I am not using a more pythonic way here:
ground_truth_date_array = []
for coin_id, values in ground_truth_date_dict.iteritems():
seed_id = values[0]
result = values[1]
ground_truth_date_array.append([seed_id,coin_id, result,0,False,False])
ground_truth_date_array = sorted(ground_truth_date_array, key=lambda x: x[2],reverse = True)
ground_truth_date_array = sorted(ground_truth_date_array, key=lambda x: x[0])
pickle.dump(ground_truth_date_array, open(data_dir + 'ground_truth_dates.pickle', "wb"))
return ground_truth_date_array
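# The comment in get_ground_truth_dates notes the dict-to-list conversion
# could be more pythonic; a hedged alternative producing the same rows with a
# list comprehension, assuming the same [seed_id, result] value layout.
ground_truth_date_array = [
    [seed_id, coin_id, result, 0, False, False]
    for coin_id, (seed_id, result) in ground_truth_date_dict.items()
]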