def allocate(self, shape, data_dtype=None):
if data_dtype is None:
data_dtype = self.data_dtype
if self._parallel_write:
self.my_file = h5py.File(self.file_name, mode='w', driver='mpio', comm=comm)
self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape)
else:
self.my_file = h5py.File(self.file_name, mode='w')
if self.is_master:
if self.compression != '':
self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape, compression=self.compression, chunks=True)
else:
self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape, chunks=True)
self.my_file.close()
self._read_from_header()
def test_patch_for_similarities(params, extension):
file_out_suff = params.get('data', 'file_out_suff')
template_file = file_out_suff + '.templates%s.hdf5' %extension
if os.path.exists(template_file):
try:
myfile = h5py.File(template_file, 'r', libver='latest')
version = myfile.get('version')[0].decode('ascii')
myfile.close()
except Exception:
version = None
else:
raise Exception('No templates found! Check suffix?')
if version is not None:
if (StrictVersion(version) >= StrictVersion('0.6.0')):
return True
else:
print_and_log(["Version is below 0.6.0"], 'debug', logger)
return False
def test_validating(self):
#mpi_launch('fitting', self.file_name, 2, 0, 'False')
a, b = os.path.splitext(os.path.basename(self.file_name))
file_name, ext = os.path.splitext(self.file_name)
file_out = os.path.join(os.path.abspath(file_name), a)
result_name = os.path.join(file_name, 'injected')
spikes = {}
result = h5py.File(os.path.join(result_name, '%s.result.hdf5' %a), 'r')
for key in result.get('spiketimes').keys():
spikes[key] = result.get('spiketimes/%s' %key)[:]
juxta_file = file_out + '.juxta.dat'
f = numpy.memmap(juxta_file, shape=(self.length,1), dtype=self.parser.get('validating', 'juxta_dtype'), mode='w+')
f[spikes['temp_9']] = 100
del f
mpi_launch('validating', self.file_name, 2, 0, 'False')
def report(self, summary_json_paths, barcode_summary_h5_path, recovered_cells, cell_bc_seqs):
assert len(cell_bc_seqs) == len(self.matrices)
barcode_summary_h5 = h5.File(barcode_summary_h5_path, 'r')
d = {}
d.update(self._report_genome_agnostic_metrics(
summary_json_paths, barcode_summary_h5, recovered_cells, cell_bc_seqs))
# Compute genome-specific metrics
for i, (genome, matrix) in enumerate(self.matrices.iteritems()):
for key, value in matrix.report(genome,
barcode_summary_h5,
recovered_cells,
cell_bc_seqs=cell_bc_seqs[i],
).iteritems():
key = '_'.join([genome, key])
d[key] = value
return d
def write_data_frame(fn, df):
''' Write the pandas dataframe object to an HDF5 file. Each column is written as a single 1D dataset at the top
level of the HDF5 file, using the native pandas datatype'''
# Always write a fresh file -- the 'w' argument to h5py.File is supposed to truncate an existing file, but it doesn't appear to work correctly
if os.path.exists(fn):
os.remove(fn)
f = h5py.File(fn, "w")
# To preserve column order, write columns to an attribute
column_names = np.array(list(df.columns))
f.attrs.create("column_names", column_names)
for col in df.columns:
write_data_column(f, df[col])
f.close()
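# A minimal sketch of the write_data_column helper called above (it is not shown
# in this snippet). It assumes plain numeric numpy-dtype columns and writes each
# pandas Series as a 1D dataset named after the column; the categorical "levels"
# handling expected by read_data_frame below is omitted.
def write_data_column(f, col):
    # col is a pandas Series; col.name becomes the top-level dataset name
    f.create_dataset(str(col.name), data=col.values)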
def read_data_frame(fn, query_cols=[]):
''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns '''
with h5py.File(fn, 'r') as f:
column_names = f.attrs.get("column_names")
column_names = get_column_intersection(column_names, query_cols)
df = p.DataFrame()
# Add the columns progressively to save memory
for name in column_names:
ds = f[name]
if has_levels(ds):
indices = ds[:]
uniques = get_levels(ds)
# This method of constructing of Categorical avoids copying the indices array
# which saves memory for big datasets
df[name] = p.Categorical(indices, categories=uniques, ordered=False, fastpath=True)
else:
df[name] = p.Series(ds[:])
return df
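# Usage sketch for the two helpers above: write a small DataFrame and read one
# column back. 'example.h5' and the column names are placeholders.
import numpy as np
import pandas as p
df_in = p.DataFrame({'pos': np.arange(5), 'depth': np.linspace(0.0, 1.0, 5)})
write_data_frame('example.h5', df_in)
df_out = read_data_frame('example.h5', query_cols=['pos'])
print(df_out)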
def read_data_frame_indexed_no_concat(fn, tabix_queries, query_cols = [], coords = True):
''' Read rows from the HDF5 data frame that match each tabix query in the
queries list. A tabix query is in the form ('chr1', 100, 200). query_cols
is a list of columns you want to return. If coords is True, then it will
return coordinates regardless of query_cols. If coords is False, it will
only return the columns specified in query_cols. Returns a list of pandas
DataFrames, one for each query. '''
f = h5py.File(fn, 'r')
# read the index
tabix_index = read_tabix_index(f)
dfs = []
for q in tabix_queries:
r = _read_data_frame_indexed_sub(f, tabix_index, q, query_cols = query_cols, coords = coords)
dfs.append(r)
f.close()
# Return the union of the queries
return dfs
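# Usage sketch for the indexed reader above: each tabix query is a
# (chrom, start, end) tuple and one DataFrame comes back per query.
# 'variants.h5' and the column name are placeholders.
queries = [('chr1', 100, 200), ('chr2', 500, 1500)]
dfs = read_data_frame_indexed_no_concat('variants.h5', queries,
                                         query_cols=['qual'], coords=True)
for q, df in zip(queries, dfs):
    print('%s: %d rows' % (str(q), len(df)))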
def check_filters(fast5_file, min_length, min_mean_qual, min_qual_window, window_size):
try:
hdf5_file = h5py.File(fast5_file, 'r')
names = get_hdf5_names(hdf5_file)
basecall_location = get_best_fastq_hdf5_location(hdf5_file, names)
if basecall_location:
fastq_str = hdf5_file[basecall_location].value
try:
parts = fastq_str.split(b'\n')
seq, quals = parts[1], parts[3]
except IndexError:
fastq_str, seq, quals = '', '', ''
if not fastq_str or not seq:
return False, 0
if min_mean_qual and get_mean_qscore(quals) < min_mean_qual:
return False, 0
if min_length and len(seq) < min_length:
return False, 0
if min_qual_window and get_min_window_qscore(quals, window_size) < min_qual_window:
return False, 0
return True, len(seq)
except (IOError, RuntimeError):
pass
return False, 0
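# Usage sketch: run check_filters over every .fast5 read in a directory and keep
# the ones that pass. The thresholds and directory name are placeholders.
import os
def filter_fast5_dir(fast5_dir='reads'):
    kept = []
    for name in os.listdir(fast5_dir):
        if name.endswith('.fast5'):
            passed, length = check_filters(os.path.join(fast5_dir, name),
                                           min_length=1000, min_mean_qual=9.0,
                                           min_qual_window=5.0, window_size=50)
            if passed:
                kept.append((name, length))
    return kept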
def min_window_qual_and_length(fast5_file, window_size):
try:
hdf5_file = h5py.File(fast5_file, 'r')
names = get_hdf5_names(hdf5_file)
basecall_location = get_best_fastq_hdf5_location(hdf5_file, names)
if basecall_location:
fastq_str = hdf5_file[basecall_location].value
try:
parts = fastq_str.split(b'\n')
seq, quals = parts[1], parts[3]
return get_min_window_qscore(quals, window_size), len(seq), fast5_file
except IndexError:
pass
except (IOError, RuntimeError):
pass
return 0.0, 0, fast5_file
def save_h5_data_label_normal(h5_filename, data, label, normal,
data_dtype='float32', label_dtype='uint8', normal_dtype='float32'):
h5_fout = h5py.File(h5_filename, 'w')  # explicit write mode; recent h5py no longer defaults to append
h5_fout.create_dataset(
'data', data=data,
compression='gzip', compression_opts=4,
dtype=data_dtype)
h5_fout.create_dataset(
'normal', data=normal,
compression='gzip', compression_opts=4,
dtype=normal_dtype)
h5_fout.create_dataset(
'label', data=label,
compression='gzip', compression_opts=1,
dtype=label_dtype)
h5_fout.close()
# Write numpy array data and label to h5_filename
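# Usage sketch for save_h5_data_label_normal above: dummy point clouds with
# per-point normals and per-cloud labels. Shapes and the file name are placeholders.
import numpy as np
pts = np.random.rand(32, 1024, 3).astype('float32')         # 32 clouds of 1024 points
nrm = np.random.rand(32, 1024, 3).astype('float32')         # matching normals
lbl = np.random.randint(0, 40, size=(32,)).astype('uint8')  # one class id per cloud
save_h5_data_label_normal('train_0.h5', pts, lbl, nrm)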
def main():
parser = generate_parser()
args = parser.parse_args()
infile1 = h5py.File(args.input1, 'r')
infile2 = h5py.File(args.input2, 'r')
resolutions = numpy.intersect1d(infile1['resolutions'][...], infile2['resolutions'][...])
chroms = numpy.intersect1d(infile1['chromosomes'][...], infile2['chromosomes'][...])
results = {}
data1 = load_data(infile1, chroms, resolutions)
data2 = load_data(infile2, chroms, resolutions)
infile1.close()
infile2.close()
# str.strip('.quasar') removes characters, not the suffix; use replace() to drop the extension
key1 = args.input1.split('/')[-1].replace('.quasar', '')
key2 = args.input2.split('/')[-1].replace('.quasar', '')
results[(key1, key2)] = correlate_samples(data1, data2)
for resolution in data1.keys():
for chromo in chroms:
plt.scatter(data1[resolution][chromo][1].flatten(),data2[resolution][chromo][1].flatten(),alpha=0.1,color='red')
# save before show(); otherwise savefig() writes an empty canvas
plt.savefig(args.output+'.res'+str(resolution)+'.chr'+chromo+'.pdf')
plt.show()
def fill_hdf5_with_sparse_by_chunk(mym1,mym2,fname,chunksize):
start1=0
end1=0
n=mym1.shape[0]
f=h5py.File(fname,'w')
m1hdf5=f.create_dataset('m1',shape=(n,n),dtype='float')
m2hdf5=f.create_dataset('m2',shape=(n,n),dtype='float')
while end1<n:
end1=np.min([n,(start1+chunksize)])
print 'start1: '+str(start1)
if (end1-start1)==1:
m1hdf5[start1,:]=mym1[start1,:].toarray()
m2hdf5[start1,:]=mym2[start1,:].toarray()
else:
m1hdf5[start1:end1,:]=mym1[start1:end1,:].toarray()
m2hdf5[start1:end1,:]=mym2[start1:end1,:].toarray()
start1=end1
print 'sum of 1'
print m1hdf5[:,:].sum()
print m2hdf5[:,:].sum()
f.close()
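# Usage sketch for the chunked writer above: stream two random sparse matrices of
# the same shape into 'pair.hdf5' in row blocks of 1000. scipy is assumed.
import scipy.sparse as sps
n = 5000
mym1 = sps.random(n, n, density=0.001, format='csr')
mym2 = sps.random(n, n, density=0.001, format='csr')
fill_hdf5_with_sparse_by_chunk(mym1, mym2, 'pair.hdf5', 1000)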
def __init__(self, data=None, info=None, dtype=None, file=None, copy=False, **kwargs):
object.__init__(self)
#self._infoOwned = False
self._isHDF = False
if file is not None:
self._data = None
self.readFile(file, **kwargs)
if kwargs.get("readAllData", True) and self._data is None:
raise Exception("File read failed: %s" % file)
else:
self._info = info
if (hasattr(data, 'implements') and data.implements('MetaArray')):
self._info = data._info
self._data = data.asarray()
elif isinstance(data, tuple): ## create empty array with specified shape
self._data = np.empty(data, dtype=dtype)
else:
self._data = np.array(data, dtype=dtype, copy=copy)
## run sanity checks on info structure
self.checkInfo()
def transpose(self, *args):
if len(args) == 1 and hasattr(args[0], '__iter__'):
order = args[0]
else:
order = args
order = [self._interpretAxis(ax) for ax in order]
infoOrder = order + list(range(len(order), len(self._info)))
info = [self._info[i] for i in infoOrder]
order = order + list(range(len(order), self.ndim))
try:
if self._isHDF:
return MetaArray(np.array(self._data).transpose(order), info=info)
else:
return MetaArray(self._data.transpose(order), info=info)
except:
print(order)
raise
#### File I/O Routines
def export(self, fileName=None):
if not HAVE_HDF5:
raise RuntimeError("This exporter requires the h5py package, "
"but it was not importable.")
if not isinstance(self.item, PlotItem):
raise Exception("Must have a PlotItem selected for HDF5 export.")
if fileName is None:
self.fileSaveDialog(filter=["*.h5", "*.hdf", "*.hd5"])
return
dsname = self.params['Name']
fd = h5py.File(fileName, 'a') # forces append to file... 'w' doesn't seem to "delete/overwrite"
data = []
appendAllX = self.params['columnMode'] == '(x,y) per plot'
for i,c in enumerate(self.item.curves):
d = c.getData()
if appendAllX or i == 0:
data.append(d[0])
data.append(d[1])
fdata = numpy.array(data).astype('double')
dset = fd.create_dataset(dsname, data=fdata)
fd.close()
def __load_page_data(self):
self.__clearRows()
if hasattr(self,"selectChan"):
with hp.File(self.file_name,"r") as f:
sampling_rate = f["analogs"][self.selectChan]["sampling_rate"].value
start_time = f["analogs"][self.selectChan]["start_time"].value
start_point = sampling_rate*self.row_num*self.current_page
end_point = sampling_rate*self.row_num*(self.current_page+1)
self.page_data = f["analogs"][self.selectChan]["data"][start_point:end_point]
self.sigma = np.median(np.abs(self.page_data)/0.6745)
Thr = self.thresholds[self.selectChan] * self.sigma
self.sampling_rate = sampling_rate
self.row_wins_rois = [0]*self.row_num
for i in range(self.row_num):
start_point = i*sampling_rate
end_point = (i+1)*sampling_rate
if self.page_data[start_point:end_point].size:
ys = self.page_data[start_point:end_point]
xs = np.arange(ys.size)
line = MultiLine(np.array([xs]),np.array([ys]),"w")
self.row_wins[i].addItem(line)
self.row_wins_rois[i] = pg.InfiniteLine(pos=Thr,angle=0,movable=False)
self.row_wins_rois[i].setZValue(10)
self.row_wins[i].addItem(self.row_wins_rois[i])
def __load_waveforms(self,selectChan,file_name):
spk_startswith = "spike_{0}".format(selectChan)
with hp.File(file_name,"r") as f:
times = list()
waveforms = list()
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
tep_time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(tep_time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
return waveforms
else:
return None
def h5_io(filename, spike_to_load, analog_to_load):
spikes = dict()
analogs = dict()
events = dict()
comments = dict()
with hp.File(filename,'r') as f:
for key in f.keys():
if key=='events':
events['times'] = f[key]['times'].value
events['labels'] = f[key]['labels'].value
elif key=='comments':
comments['times'] = f[key]['times'].value
comments['labels'] = f[key]['labels'].value
elif key=='spikes':
for tem_key in f[key].keys():
if tem_key in spike_to_load:
spikes[tem_key] = f[key][tem_key]['times'].value
elif key=='analogs':
for tem_key in f[key].keys():
if tem_key in analog_to_load:
analogs[tem_key] = dict()
analogs[tem_key]['data'] = f[key][tem_key]['data'].value
analogs[tem_key]['sampling_rate'] = f[key][tem_key]['sampling_rate'].value
analogs[tem_key]['start_time'] = f[key][tem_key]['start_time'].value
return events,comments,spikes,analogs
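# Usage sketch for h5_io above: load one spike train and one analog channel from
# a file laid out with 'events'/'comments'/'spikes'/'analogs' groups as expected
# by the reader. The file name and channel keys are placeholders.
events, comments, spikes, analogs = h5_io('recording.h5',
                                          spike_to_load=['spike_1_0'],
                                          analog_to_load=['analog_1'])
for key in spikes:
    print('%s: %d spikes' % (key, len(spikes[key])))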
def gen_tracking_db(database, tracking_stats):
"""Generate TrackingDataset structure.
Parameters
----------
database : h5py.File
HDF5 file object
tracking_stats : dictionary
the dictionary that contains TrackingDataset's stats
Returns
-------
database : h5py.File
HDF5 file object with multiple groups
"""
primary_list = tracking_stats["primary_list"]
for pc in primary_list:
if pc not in database:
database.create_group(pc)
print "[MESSAGE] Primary group %s is created" % (pc)
print "[MESSAGE] TrackingDataset HDF5 structure is generated."
def gen_caltech256_db(database, caltech256_stats):
"""Generate Caltech-256 structure.
Parameters
----------
database : h5py.File
HDF5 file object
caltech256_stats : dictionary
the dictionary that contains Caltech-256's stats
Returns
-------
database : h5py.File
HDF5 file object with multiple groups
"""
caltech256_list = caltech256_stats["caltech256_list"]
for class_name in caltech256_list:
if class_name not in database:
database.create_group(class_name)
print "[MESSAGE] Class %s is created" % (class_name)
print "[MESSAGE] Caltech-256 HDF5 structure is generated."
def gen_ucf50_db(database, ucf50_stats):
"""Generate UCF50 structure.
Parameters
----------
database : h5py.File
HDF5 file object
ucf50_stats : dictionary
the dictionary that contains UCF50's stats
Returns
-------
database : h5py.File
HDF5 file object with multiple groups
"""
ucf50_list = ucf50_stats["ucf50_list"]
for category in ucf50_list:
if category not in database:
database.create_group(category)
print "[MESSAGE] Category %s is created" % (category)
print "[MESSAGE] UCF-50 HDF5 structure is generated."
def time_hdf5():
data_path = create_hdf5(BATCH_SIZE * NSTEPS)
f = h5py.File(data_path, 'r')
durs = []
for step in tqdm.trange(NSTEPS, desc='running hdf5'):
start_time = time.time()
arr = f['data'][BATCH_SIZE * step: BATCH_SIZE * (step+1)]
read_time = time.time()
arr = copy.deepcopy(arr)
copy_time = time.time()
durs.append(['hdf5 read', step, read_time - start_time])
durs.append(['hdf5 copy', step, copy_time - read_time])
f.close()
os.remove(data_path)
durs = pandas.DataFrame(durs, columns=['kind', 'stepno', 'dur'])
return durs
def mean_variance_normalisation(h5f, mvn_h5f, vad=None):
"""Do mean variance normlization. Optionnaly use a vad.
Parameters:
----------
h5f: str. h5features file name
mvn_h5f: str, h5features output name
"""
dset = h5py.File(h5f, 'r').keys()[0]
if vad is not None:
raise NotImplementedError
else:
data = h5py.File(h5f, 'r')[dset]['features'][:]
features = data
epsilon = np.finfo(data.dtype).eps
mean = np.mean(data)
std = np.std(data)
mvn_features = (features - mean) / (std + epsilon)
shutil.copy(h5f, mvn_h5f)
h5py.File(mvn_h5f, 'r+')[dset]['features'][:] = mvn_features
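# Usage sketch: normalise an h5features file and check the result. The file names
# are placeholders; after the call the copy holds (x - mean) / (std + eps) features.
mean_variance_normalisation('fbanks.h5', 'fbanks_mvn.h5')
with h5py.File('fbanks_mvn.h5', 'r') as f:
    feats = f[list(f.keys())[0]]['features'][:]
    print('mean %.3f, std %.3f' % (feats.mean(), feats.std()))  # close to 0 and 1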
def h5features_feats2stackedfeats(fb_h5f, stackedfb_h5f, nframes=7):
"""Create stacked features version of h5features file
Parameters:
----------
fb_h5f: str. h5features file name
stackedfb_h5f: str, h5features output name
"""
dset_name = h5py.File(fb_h5f, 'r').keys()[0]
files = h5py.File(fb_h5f, 'r')[dset_name]['items']
def aux(f):
return stack_fbanks(h5features.read(fb_h5f, from_item=f)[1][f],
nframes=nframes)
def time_f(f):
return h5features.read(fb_h5f, from_item=f)[0][f]
h5features_compute(files, stackedfb_h5f, featfunc=aux,
timefunc=time_f)
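# A minimal sketch of the stack_fbanks helper used above (not defined in this
# snippet): each output frame is the concatenation of an nframes-wide context
# window, with edge frames repeated as padding. nframes is assumed to be odd.
import numpy as np
def stack_fbanks(features, nframes=7):
    assert nframes % 2 == 1, 'nframes must be odd'
    n, dim = features.shape
    pad = nframes // 2
    padded = np.pad(features, ((pad, pad), (0, 0)), mode='edge')
    stacked = np.zeros((n, dim * nframes), dtype=features.dtype)
    for i in range(nframes):
        stacked[:, i * dim:(i + 1) * dim] = padded[i:i + n, :]
    return stacked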
def load_data(name='ac3', N=-1, prefix=None, gold=False):
'''Load data
'''
if 'mri' not in name:
if gold: filename = '~/compresso/data/' + name + '/gold/' + name + '_gold.h5'
else: filename = '~/compresso/data/' + name + '/rhoana/' + name + '_rhoana.h5'
with h5py.File(os.path.expanduser(filename), 'r') as hf:
output = np.array(hf['main'], dtype=np.uint64)
else:
filename = '~/compresso/data/MRI/' + name + '.h5'
with h5py.File(os.path.expanduser(filename), 'r') as hf:
output = np.array(hf['main'], dtype=np.uint64)
if N != -1:
output = output[0:N,:,:]
return output
def write_hdf5(file, data, label_class, label_bbox, label_landmarks):
# transform to np array
data_arr = np.array(data, dtype = np.float32)
# print data_arr.shape
# if no swapaxes, transpose to num * channel * width * height ???
# data_arr = data_arr.transpose(0, 3, 2, 1)
label_class_arr = np.array(label_class, dtype = np.float32)
label_bbox_arr = np.array(label_bbox, dtype = np.float32)
label_landmarks_arr = np.array(label_landmarks, dtype = np.float32)
with h5py.File(file, 'w') as f:
f['data'] = data_arr
f['label_class'] = label_class_arr
f['label_bbox'] = label_bbox_arr
f['label_landmarks'] = label_landmarks_arr
# list_file format:
# image_path | label_class | label_boundingbox(4) | label_landmarks(10)
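# A hedged sketch of a reader for the list_file format described above: one sample
# per line, whitespace separated, with the image path followed by 1 class label,
# 4 bounding-box values and 10 landmark values. The separator and field order are
# assumptions based on the comment; adjust if the real list uses another layout.
def read_list_file(list_file):
    paths, label_class, label_bbox, label_landmarks = [], [], [], []
    with open(list_file) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 16:    # path + 1 + 4 + 10
                continue
            paths.append(fields[0])
            label_class.append(float(fields[1]))
            label_bbox.append([float(v) for v in fields[2:6]])
            label_landmarks.append([float(v) for v in fields[6:16]])
    return paths, label_class, label_bbox, label_landmarks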