def pickle_load(path, compression=False):
"""Unpickle a possible compressed pickle.
Parameters
----------
path: str
path to the output file
compression: bool
if true assumes that pickle was compressed when created and attempts decompression.
Returns
-------
obj: object
the unpickled object
"""
if compression:
with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip:
with myzip.open("data") as f:
return pickle.load(f)
else:
with open(path, "rb") as f:
return pickle.load(f)
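
pickle_load expects a compressed pickle to be a deflated ZIP archive holding a single member named "data". A matching writer could look like the sketch below; pickle_dump is not part of the original snippet and is only an assumed counterpart.

import pickle
import zipfile

def pickle_dump(obj, path, compression=False):
    """Pickle ``obj`` to ``path``; with compression=True, store the bytes as a
    "data" member of a deflated zip archive, i.e. the layout pickle_load reads."""
    if compression:
        with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
            # serialize in memory, then store under the member name "data"
            myzip.writestr("data", pickle.dumps(obj, pickle.HIGHEST_PROTOCOL))
    else:
        with open(path, "wb") as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)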
def directory_has_smart_contract(location):
    # Returns True if the given directory contains a compilable tsol contract and its JSON example.
    # (It probably makes more sense to put this check inside the tsol package.)
code_path = glob.glob(os.path.join(location, '*.tsol'))
example = glob.glob(os.path.join(location, '*.json'))
assert len(code_path) > 0 and len(example) > 0, 'Could not find *.tsol and *.json files in provided directory.'
# pop off the first file name and turn the code into a file object
code = open(code_path[0])
# turn the example into a dict
with open(example[0]) as e:
example = json.load(e)
try:
tsol.compile(code, example)
except Exception as e:
print(e)
return False
return True
def main():
'''
    Run code specified by data received over the pipe
'''
assert is_forking(sys.argv)
handle = int(sys.argv[-1])
fd = msvcrt.open_osfhandle(handle, os.O_RDONLY)
from_parent = os.fdopen(fd, 'rb')
process.current_process()._inheriting = True
preparation_data = load(from_parent)
prepare(preparation_data)
self = load(from_parent)
process.current_process()._inheriting = False
from_parent.close()
exitcode = self._bootstrap()
exit(exitcode)
def get_item_history(self, prior_or_train, reconstruct = False, none_idx = 49689):
filepath = self.cache_dir + './item_history_' + prior_or_train + '.pkl'
if (not reconstruct) and os.path.exists(filepath):
with open(filepath, 'rb') as f:
item_history = pickle.load(f)
else:
up = self.get_users_orders(prior_or_train).sort_values(['user_id', 'order_number', 'product_id'], ascending = True)
item_history = up.groupby(['user_id', 'order_number'])['product_id'].apply(list).reset_index()
item_history.loc[item_history.order_number == 1, 'product_id'] = item_history.loc[item_history.order_number == 1, 'product_id'] + [none_idx]
item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True)
# accumulate
item_history['product_id'] = item_history.groupby(['user_id'])['product_id'].transform(pd.Series.cumsum)
# get unique item list
item_history['product_id'] = item_history['product_id'].apply(set).apply(list)
item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True)
# shift each group to make it history
item_history['product_id'] = item_history.groupby(['user_id'])['product_id'].shift(1)
for row in item_history.loc[item_history.product_id.isnull(), 'product_id'].index:
item_history.at[row, 'product_id'] = [none_idx]
item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True).groupby(['user_id'])['product_id'].apply(list).reset_index()
item_history.columns = ['user_id', 'history_items']
with open(filepath, 'wb') as f:
pickle.dump(item_history, f, pickle.HIGHEST_PROTOCOL)
return item_history
def get_taxi_stats(data_path=data_path):
file_name = 'taxi_data_stats.p'
path = data_path + file_name
if not os.path.isfile(path):
download(file_name, data_path=data_path)
import pickle
    with open(path, 'rb') as f:
        stats = pickle.load(f)
sum_X = stats['sum_X']
sum_X2 = stats['sum_X2']
n = float(stats['n'])
X_mean = sum_X / n
X_std = ((sum_X2 - (sum_X**2)/n)/(n-1))**0.5
X_mean = np.reshape(X_mean, [1, -1])
X_std = np.reshape(X_std, [1, -1])
return X_mean, X_std
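
The statistics come back with shape [1, n_features] so they broadcast row-wise. A hedged usage sketch (the random X below is only a placeholder for real taxi features):

import numpy as np

X_mean, X_std = get_taxi_stats(data_path=data_path)
X = np.random.rand(100, X_mean.shape[1])       # placeholder feature matrix
X_standardized = (X - X_mean) / X_std          # broadcasts over the 100 rows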
def __init__(self, counts=None, calledfuncs=None, infile=None,
callers=None, outfile=None):
self.counts = counts
if self.counts is None:
self.counts = {}
self.counter = self.counts.copy() # map (filename, lineno) to count
self.calledfuncs = calledfuncs
if self.calledfuncs is None:
self.calledfuncs = {}
self.calledfuncs = self.calledfuncs.copy()
self.callers = callers
if self.callers is None:
self.callers = {}
self.callers = self.callers.copy()
self.infile = infile
self.outfile = outfile
if self.infile:
# Try to merge existing counts file.
try:
counts, calledfuncs, callers = \
pickle.load(open(self.infile, 'rb'))
self.update(self.__class__(counts, calledfuncs, callers))
        except (IOError, EOFError, ValueError) as err:
            print("Skipping counts file %r: %s"
                  % (self.infile, err), file=sys.stderr)
def from_snapshot(self, sess, sfile, nfile):
print('Restoring model snapshots from {:s}'.format(sfile))
self.saver.restore(sess, sfile)
print('Restored.')
    # Also restore the other hyper-parameters/states needed for training. (TODO xinlei) I have
    # tried my best to find the random states so that training can be recovered exactly;
    # however, the TensorFlow-internal random state is currently not available.
with open(nfile, 'rb') as fid:
st0 = pickle.load(fid)
cur = pickle.load(fid)
perm = pickle.load(fid)
cur_val = pickle.load(fid)
perm_val = pickle.load(fid)
last_snapshot_iter = pickle.load(fid)
np.random.set_state(st0)
self.data_layer._cur = cur
self.data_layer._perm = perm
self.data_layer_val._cur = cur_val
self.data_layer_val._perm = perm_val
return last_snapshot_iter
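
The loads above imply that the snapshot writer dumped six objects in exactly this order. A sketch of such a counterpart is shown below; the method name and the iter_count argument are assumptions, since the writer is not part of this excerpt.

def snapshot_states(self, nfile, iter_count):
    # Dump the random/iterator states in the same order from_snapshot reads them back.
    with open(nfile, 'wb') as fid:
        pickle.dump(np.random.get_state(), fid, pickle.HIGHEST_PROTOCOL)     # st0
        pickle.dump(self.data_layer._cur, fid, pickle.HIGHEST_PROTOCOL)      # cur
        pickle.dump(self.data_layer._perm, fid, pickle.HIGHEST_PROTOCOL)     # perm
        pickle.dump(self.data_layer_val._cur, fid, pickle.HIGHEST_PROTOCOL)  # cur_val
        pickle.dump(self.data_layer_val._perm, fid, pickle.HIGHEST_PROTOCOL) # perm_val
        pickle.dump(iter_count, fid, pickle.HIGHEST_PROTOCOL)                # last_snapshot_iter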
def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
try:
roidb = pickle.load(fid)
except:
roidb = pickle.load(fid, encoding='bytes')
print('{} gt roidb loaded from {}'.format(self.name, cache_file))
return roidb
gt_roidb = [self._load_pascal_annotation(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
print('wrote gt roidb to {}'.format(cache_file))
return gt_roidb
def iter_keys_values(self, keys, inds=None, verbose=False):
for key in keys:
if key not in self.keys_:
raise RuntimeError('Key %s not found in dataset. keys: %s' % (key, self.keys_))
idx, ii = 0, 0
total_chunks = len(self.meta_file_.chunks)
inds = np.sort(inds) if inds is not None else None
for chunk_idx, chunk in enumerate(progressbar(self.meta_file_.chunks, size=total_chunks, verbose=verbose)):
data = AttrDict.load(self.get_chunk_filename(chunk_idx))
# if inds is None:
items = (data[key] for key in keys)
for item in izip(*items):
yield item
# else:
# for i, item in enumerate(data[key]):
# if inds[ii] == idx + i:
# yield item
# ii += 1
# if ii >= len(inds): break
# idx += len(data[key])
def iterchunks(self, key, batch_size=10, verbose=False):
if key not in self.keys_:
raise RuntimeError('Key %s not found in dataset. keys: %s' % (key, self.keys_))
idx, ii = 0, 0
total_chunks = len(self.meta_file_.chunks)
batch_chunks = grouper(range(len(self.meta_file_.chunks)), batch_size)
for chunk_group in progressbar(batch_chunks, size=total_chunks / batch_size, verbose=verbose):
items = []
# print key, chunk_group
for chunk_idx in chunk_group:
# grouper will fill chunks with default none values
if chunk_idx is None: continue
# Load chunk
data = AttrDict.load(self.get_chunk_filename(chunk_idx))
for item in data[key]:
items.append(item)
yield items
def district_hash_map(data_frame):
district_map_f = "cluster_map.pickle"
district_map_f_path = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR,
district_map_f)
if not os.path.exists(district_map_f_path):
create_hash_district_map_dict()
# load the needed map file
with open(district_map_f_path, "rb") as f:
map_rule = pickle.load(f)
# map the needed cols..
for i in range(len(data_frame.columns)):
if "district_hash" in data_frame.columns[i]:
# map the hash according to the map rule
district_hash_col = data_frame.columns[i]
data_frame[district_hash_col] = data_frame[district_hash_col].replace(map_rule)
# after mapping, delete its hash str
new_name = re.sub("_hash","",district_hash_col)
data_frame.rename(columns={district_hash_col: new_name}, inplace = True)
return data_frame
## Pass in the directory whose district hashes you want to map
def setup(self, config):
"""
Load existing data for given worker.
:param config: Configuration object.
:type config: ``dict``
"""
self.path = os.path.join(
config[helper.DATA_ROOT], '%s_buckets-%s.pickle' %
(self.NAME, config[helper.WORKER_ID]))
with open(self.path, 'a') as _:
pass
with open(self.path, 'rb') as inp:
try:
self.buckets = pickle.load(inp)
except Exception:
self.buckets = {}
config_related = config.get(helper.RELATED, {}).get(self.NAME, {})
self.min_score = config_related.get(helper.MIN_SCORE, 0.4)
self.min_shared = config_related.get(helper.MIN_SHARED, 5)
self.max_results = config_related.get(helper.MAX_RESULTS, 100)
def load_all(self, config):
"""
Load all existing data.
:param config: Configuration object.
:type config: ``dict``
"""
self.buckets = {}
for path in glob.glob(os.path.join(
config[helper.DATA_ROOT], '%s_buckets-*.pickle' % self.NAME)):
with open(path, 'rb') as inp:
try:
for key, value in pickle.load(inp).items():
if key in self.buckets:
self.buckets[key]['bins'].update(value['bins'])
else:
self.buckets[key] = value
except:
logging.warning('could not load related_%s data', self.NAME)
def build_data_dict(self, layer_features, k=5):
"""
    This builds dict[id] = {label, spacing, 1={loc, p, layer1_feature, layer2_feature...}, 2={}...}
    :param layer_features: features from the given layers, e.g. 67, 77
    :param k: number of nodules considered as inputs
:return: a combined dictionary
"""
with open(self.pkl_dir + self.data_file_name, 'rb') as data_file:
data = cPickle.load(data_file)
with open(self.pkl_dir + self.feature_file_name, 'rb') as feature_file:
features = cPickle.load(feature_file)
data_dict = {}
for d,f in zip(data, features):
pid = d['id']
data_dict[pid] = {'label':d['label'], 'spacing':d['spacing']}
# add the features
for i in range(k):
data_dict[pid][i] = {'loc': f['loc_{}'.format(i)], 'p': f['p_{}'.format(i)]}
for layer in layer_features:
data_dict[pid][i][layer] = f['out_{}_{}'.format(i, layer)]
return data_dict
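
A hedged access pattern for the combined dictionary (the builder instance and the patient id 'xyz' are placeholders, and 67/77 echo the layer example from the docstring):

data_dict = builder.build_data_dict(layer_features=[67, 77], k=5)
label = data_dict['xyz']['label']        # ground-truth label for patient 'xyz'
top_loc = data_dict['xyz'][0]['loc']     # location of the top-ranked nodule
feat_67 = data_dict['xyz'][0][67]        # its features taken from layer 67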
# From luna_preprocessed_load_data.py (project: lung-cancer-detector, author: YichenGong)
def next_batch(self, batch_size):
assert self.train_mode or self.validation_mode, "Please set mode, train, validation or test. e.g. DataLoad.train()"
    idx_next_batch = [(self.current_idx + i) % len(self.p_imgs) for i in range(batch_size)]
patient_img_next_batch = [ self.p_imgs[idx] for idx in idx_next_batch]
batch_image = []
batch_mask = []
for image in patient_img_next_batch:
fi = gzip.open(self.data_path + image, 'rb')
img = pickle.load(fi)
img = np.expand_dims(img, axis=2)
batch_image.append(img)
fi.close()
fm = gzip.open(self.mask_path + image, 'rb')
mask = pickle.load(fm)
fm.close()
mask_binary_class = np.zeros([mask.shape[0],mask.shape[1],2])
mask_binary_class[:,:,0][mask == 0] = 1
mask_binary_class[:,:,1][mask == 1] = 1
batch_mask.append(mask_binary_class)
self.current_idx = (self.current_idx + batch_size) % len(self.p_imgs)
batched_image = np.stack(batch_image)
batched_mask = np.stack(batch_mask)
return batched_image, batched_mask
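
A hypothetical loop driving next_batch (the loader construction, the step count, and what the model does with the arrays are all assumptions):

loader.train()                                   # select training mode, as the assert requires
for step in range(100):                          # assumed number of iterations
    images, masks = loader.next_batch(batch_size=8)
    # images: (N, H, W, 1) gzip-pickled slices; masks: (N, H, W, 2) one-hot labels
    # feed both arrays to the segmentation model here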
def sent_tokenize(text, lang='english'):
"""
Punkt sentence tokenizer from NLTK.
"""
global _nltk_sent_tokenizer
try:
_nltk_sent_tokenizer
except NameError:
# If the sentence tokenizer wasn't previously initialized.
available_languages = ['czech', 'danish', 'dutch', 'english',
'estonian', 'finnish', 'french', 'german',
'greek', 'italian', 'norwegian', 'polish',
'portuguese', 'slovene', 'spanish', 'swedish',
'turkish']
assert lang in available_languages, "Punkt Tokenizer for {} not available".format(lang)
# Checks that the punkt tokenizer model was previously downloaded.
download('punkt', quiet=True)
path_to_punkt = _nltk_downloader._download_dir + '/tokenizers/punkt/{}.pickle'.format(lang)
with open(path_to_punkt, 'rb') as fin:
_nltk_sent_tokenizer = pickle.load(fin)
# Actual tokenization using the Punkt Model.
return _nltk_sent_tokenizer.tokenize(text)
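
A quick usage sketch; the sample text is arbitrary and the first call triggers the one-off punkt download:

sentences = sent_tokenize("This is the first sentence. Here is another one.")
print(sentences)
# expected: ['This is the first sentence.', 'Here is another one.']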
def read_fakelc(fakelcfile):
'''
This just reads a pickled fake LC.
'''
try:
        with open(fakelcfile, 'rb') as infd:
            lcdict = pickle.load(infd)
    except UnicodeDecodeError:
        with open(fakelcfile, 'rb') as infd:
            lcdict = pickle.load(infd, encoding='latin1')
return lcdict
#######################
## UTILITY FUNCTIONS ##
#######################
def read_pklc(lcfile):
'''
This just reads a pickle.
'''
try:
with open(lcfile,'rb') as infd:
lcdict = pickle.load(infd)
except UnicodeDecodeError:
with open(lcfile,'rb') as infd:
lcdict = pickle.load(infd, encoding='latin1')
return lcdict
# LC format -> [default fileglob, function to read LC format]
# these translate filter operators given as strings to Python operators
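
The dictionary the comment refers to is not included in this excerpt; a minimal sketch of such a string-to-operator mapping, with assumed names and keys, could be:

import operator

FILTEROPS = {
    'eq': operator.eq,   # equal to
    'ne': operator.ne,   # not equal to
    'gt': operator.gt,   # greater than
    'lt': operator.lt,   # less than
    'ge': operator.ge,   # greater than or equal to
    'le': operator.le,   # less than or equal to
}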
def register(self, name, serializer):
"""Register ``serializer`` object under ``name``.
    Raises :class:`AttributeError` if ``serializer`` is invalid.
.. note::
``name`` will be used as the file extension of the saved files.
:param name: Name to register ``serializer`` under
:type name: ``unicode`` or ``str``
:param serializer: object with ``load()`` and ``dump()``
methods
"""
# Basic validation
getattr(serializer, 'load')
getattr(serializer, 'dump')
self._serializers[name] = serializer
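
register() only checks that the object exposes load() and dump(). A hedged usage example with a small JSON-backed serializer (the class and the registry variable are illustrative assumptions):

import json

class JSONSerializer(object):
    """Anything with file-based load()/dump() methods satisfies register()."""
    @staticmethod
    def load(file_obj):
        return json.load(file_obj)

    @staticmethod
    def dump(obj, file_obj):
        json.dump(obj, file_obj, indent=2)

registry.register('json', JSONSerializer())   # 'json' also becomes the file extension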
def get(self, sid):
if not self.is_valid_key(sid):
return self.new()
try:
f = open(self.get_session_filename(sid), 'rb')
except IOError:
if self.renew_missing:
return self.new()
data = {}
else:
try:
try:
data = load(f)
except Exception:
data = {}
finally:
f.close()
return self.session_class(data, sid, False)
def find_neighbor():
h2c = pickle.load(open("comps/mobike/sol_carl/data/h2c.p","rb"))
c2h = pickle.load(open("comps/mobike/sol_carl/data/c2h.p","rb"))
print(len(h2c),len(c2h))
lc = [len(c2h[i]) for i in c2h]
#distribution(lc)
#point = list(h2c.keys())[0]
point = "wx4snhx"
print("hash", point, h2c[point])
lat,lon = h2c[point]
#lat,lon = int(lat+0.5),int(lon+0.5)
points = c2h[(lat,lon)]
for la in [lat-0.01,lat,lat+0.01]:
for lo in [lon-0.01,lon,lon+0.01]:
coord = (la,lo)
points = c2h.get(coord,[])
for p in points:
d = geo_distance(h2c[p],(lat,lon))
print(coord,p,d)
def get_per_sample_tf(self, texts, field, silent=0):
"""
Each sample is a document.
Input:
texts: ["train","text"]
"""
if self.sample_tf is not None:
return
self.sample_tf = {}
self.get_per_sample_words_count(texts, field, 1)
for text in texts:
name = "{}/{}_sample_tf_{}.p".format(self.flags.data_path,self.name,text)
if os.path.exists(name):
self.sample_tf[text] = pickle.load(open(name,'rb'))
else:
print("gen",name)
tf_list = tf(self.sample_words_count[text],0)
pickle.dump(tf_list,open(name,'wb'))
self.sample_tf[text] = tf_list
if silent==0:
print("\n{} sample tf done".format(text))
def mean_target_rate(name,out,idcol,ycol):
if os.path.exists(out):
return pickle.load(open(out,'rb'))
yc,cc = defaultdict(float),defaultdict(float)
for c,row in enumerate(csv.DictReader(open(name))):
y = float(row[ycol])
for i in row:
if i in [idcol,ycol]:
continue
v = "%s-%s"%(i,row[i])
yc[v] += y
cc[v] += 1.0
if c>0 and c%100000 == 0:
print("rows %d len_cc %d"%(c,len(cc)))
for i in yc:
yc[i] = yc[i]/cc[i]
pickle.dump(yc,open(out,'wb'))
return yc
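
A hypothetical call (the file and column names are placeholders): the returned dict maps "column-value" strings to the mean of the target over rows carrying that value, memoised in the pickle at out.

rates = mean_target_rate('train.csv', 'target_rates.pkl', idcol='id', ycol='target')
# e.g. rates['device-mobile'] -> average target over rows where device == 'mobile'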
def tutor_fpout():
pklout = os.path.join(RESDIR, TUTORPKL)
if os.path.exists(pklout):
with open(pklout, 'rb') as f:
fpout = pickle.load(f)
else:
print('re-creating fp results ... this could take a few minutes')
zip_archive = os.path.join(DATADIR, ZIPFILE)
with zipfile.ZipFile(zip_archive, 'r') as zfile:
zfile.extractall(DATADIR)
fpout = tutor_example()
make_clean_dat()
os.makedirs(RESDIR, exist_ok=True)
with open(pklout, 'wb') as f:
pickle.dump(fpout, f)
return fpout
def load_egg(filepath):
"""
Loads pickled egg
Parameters
----------
filepath : str
Location of pickled egg
Returns
----------
egg : Egg data object
A loaded unpickled egg
"""
with open(filepath, 'rb') as f:
egg = pickle.load(f)
return egg
def unpickle_cookies(args, alias=None):
"""
Unpickles the cookies file for the given alias and
returns the original object.
If no file exists, then an empty cookies object is
returned.
"""
if alias is None:
alias = args.alias
cookie_file = os.path.join(
get_working_dir(args),
alias + ".cookies")
try:
with open(cookie_file, "rb") as cookie_jar:
cookies = pickle.load(cookie_jar)
except BaseException:
cookies = requests.cookies.RequestsCookieJar()
return cookies
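
A matching writer for the cookie file, mirroring the path construction above (pickle_cookies is an assumed counterpart, not part of the original snippet; os and pickle are imported at module level as in unpickle_cookies):

def pickle_cookies(args, cookies, alias=None):
    """Pickle a RequestsCookieJar to '<working_dir>/<alias>.cookies'."""
    if alias is None:
        alias = args.alias
    cookie_file = os.path.join(
        get_working_dir(args),
        alias + ".cookies")
    with open(cookie_file, "wb") as cookie_jar:
        pickle.dump(cookies, cookie_jar, pickle.HIGHEST_PROTOCOL)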
def loadData (self, filename, verbose=True, replace_missing=True):
''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse'''
if verbose: print("========= Reading " + filename)
start = time.time()
if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")):
        with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "rb") as pickle_file:
vprint (verbose, "Loading pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
return pickle.load(pickle_file)
if 'format' not in self.info.keys():
self.getFormatData(filename)
if 'feat_num' not in self.info.keys():
self.getNbrFeatures(filename)
data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse}
data = data_func[self.info['format']](filename, self.info['feat_num'])
    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info['format']=='dense' and replace_missing and np.any(np.isnan(data)):
vprint (verbose, "Replace missing values by 0 (slow, sorry)")
data = data_converter.replace_missing(data)
if self.use_pickle:
with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"))
p = pickle.Pickler(pickle_file)
p.fast = True
p.dump(data)
end = time.time()
if verbose: print( "[+] Success in %5.2f sec" % (end - start))
return data
def loadLabel (self, filename, verbose=True):
''' Get the solution/truth values'''
if verbose: print("========= Reading " + filename)
start = time.time()
if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")):
        with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "rb") as pickle_file:
vprint (verbose, "Loading pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"))
return pickle.load(pickle_file)
if 'task' not in self.info.keys():
self.getTypeProblem(filename)
# IG: Here change to accommodate the new multiclass label format
if self.info['task'] == 'multilabel.classification':
label = data_io.data(filename)
elif self.info['task'] == 'multiclass.classification':
label = data_converter.convert_to_num(data_io.data(filename))
else:
label = np.ravel(data_io.data(filename)) # get a column vector
#label = np.array([np.ravel(data_io.data(filename))]).transpose() # get a column vector
if self.use_pickle:
with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"))
p = pickle.Pickler(pickle_file)
p.fast = True
p.dump(label)
end = time.time()
if verbose: print( "[+] Success in %5.2f sec" % (end - start))
return label