def create_levels(ds, levels):
# Create a dataset in the LEVEL_GROUP
# and store as native numpy / h5py types
level_grp = ds.file.get(LEVEL_GROUP)
if level_grp is None:
# Create a LEVEL_GROUP
level_grp = ds.file.create_group(LEVEL_GROUP)
ds_name = ds.name.split("/")[-1]
dt = h5py.special_dtype(vlen=str)
level_grp.create_dataset(ds_name,
shape = [len(levels)],
maxshape = (None,),
dtype = dt,
data = levels,
compression = COMPRESSION,
chunks = (CHUNK_SIZE,))
def text_to_h5py_dataset(text_path, dst_path):
# The simplest is to load everything to memory first.
# If memory becomes an issue, this code can be optimized.
words = []
with open(text_path, 'r') as src:
for line in src:
words.extend(line.strip().split())
with h5py.File(dst_path, 'w') as dst:
dtype = h5py.special_dtype(vlen=bytes)
table = dst.create_dataset('words', (len(words),), dtype=dtype)
table[:] = words
dst.attrs['split'] = H5PYDataset.create_split_array({
'train' : {
'words' : (0, len(words))
}
})
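# A hedged read-back sketch (not part of the original snippets): it decodes the
# variable-length byte strings written by text_to_h5py_dataset above. The file
# path 'words.h5' is hypothetical.
import h5py

def read_words_h5(path='words.h5'):
    with h5py.File(path, 'r') as src:
        # entries stored with vlen=bytes come back as Python bytes objects
        return [w.decode('utf-8') for w in src['words'][:]]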
def add_words_ids_to_squad(h5_file, vocab):
"""Digitizes test with a vocabulary.
Also saves the vocabulary into the hdf5 file.
"""
with h5py.File(h5_file, 'a') as dst:
unicode_dtype = h5py.special_dtype(vlen=unicode)
dst.create_dataset('text_ids', (dst['text'].shape[0],), 'int64')
dst.create_dataset('vocab_words', (vocab.size(),), unicode_dtype)
dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
dst['text_ids'][:] = map(vocab.word_to_id, dst['text'][:])
dst['vocab_words'][:] = vocab.words
dst['vocab_freqs'][:] = vocab.frequencies
def add_text_h5(fid, path, data):
"""Add text data (UTF-8) to the given path in the HDF5 file
with handle fid.
Arguments:
- fid is the file handle to the HDF5 file
- path is the base path inside the HDF5 file
- data is the text data as a string
"""
dset = fid.create_dataset(name=path,
shape=(1,),
dtype=h5py.special_dtype(vlen=str),
data=data,
compression="gzip")
def load_pretrained():
#glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
glove_vec = ["glove_wiki_300"]
#glove_vec = ["glove_wiki_50"]
filename = 'glove_pretrained.h5'
#import tensorflow as tf
#sess = tf.InteractiveSession()
features, words = load_h5py('glove_wiki_300',filename=root + glove_vec_fold + filename)
filename = 'glove.h5'
features = normalize(np.array(features), axis=1, norm='l2')
with h5py.File(root + glove_vec_fold + filename, "w") as hf:
hf.create_dataset(glove_vec[0], data=features)
string_dt = h5py.special_dtype(vlen=str)
hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)
for vec in glove_vec:
data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
print(data.shape, words.shape)
time.sleep(5)
def create_partition_function(self, f_w2v, f_h5):
print("Building the partition function")
# Load the model from disk
M = load_w2vec()
words = M.wv.index2word
ZT = []
INPUT_ITR = tqdm.tqdm(words)
# Compute the partition function for each word
for w in INPUT_ITR:
UE = self.energy(M.wv.syn0, M[w])
z = compute_partition_stats(UE)
ZT.append(z)
# Save the partition function to disk
# (special care needed for h5py unicode strings)
dt = h5py.special_dtype(vlen=unicode)
with h5py.File(f_h5, 'w') as h5:
h5.create_dataset("words", (len(words),),
dtype=dt,
data=[w.encode('utf8') for w in words])
h5.attrs['vocab_N'] = len(words)
h5['Z'] = ZT
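# A minimal read-back sketch (an assumption, not shown in the original project):
# it reopens the file written above, decoding the UTF-8 encoded vocabulary and
# loading the per-word partition values Z. The path 'partition.h5' is hypothetical.
import h5py

def read_partition_function(path='partition.h5'):
    with h5py.File(path, 'r') as h5:
        words = [w.decode('utf8') if isinstance(w, bytes) else w
                 for w in h5['words'][:]]
        Z = h5['Z'][:]
        assert len(words) == h5.attrs['vocab_N']
    return words, Z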
def _precompute(self, Xy_generator, cache):
with h5py.File(cache, mode='w') as fp:
# initialize with a fixed number of sequences
n_sequences = 1000
y = fp.create_dataset(
'y', shape=(n_sequences, ),
dtype=h5py.special_dtype(vlen=bytes),
maxshape=(None, ))
for i, (X_, y_) in enumerate(Xy_generator):
if i == 0:
_, n_samples, n_features = X_.shape
X = fp.create_dataset(
'X', dtype=X_.dtype, compression='gzip',
shape=(n_sequences, n_samples, n_features),
chunks=(1, n_samples, n_features),
maxshape=(None, n_samples, n_features))
# increase number of sequences on demand
if i == n_sequences:
n_sequences = int(n_sequences * 1.1)
y.resize(n_sequences, axis=0)
X.resize(n_sequences, axis=0)
# store current X, y in file
y[i] = y_
X[i] = X_
# resize datasets to exactly match the number of sequences seen (i is the last index)
y.resize(i + 1, axis=0)
X.resize(i + 1, axis=0)
def save_h5py(arrays, string_arrs, names, filename="glove.h5"):
with h5py.File(filename, "w") as hf:
for i in range(len(arrays)):
hf.create_dataset(names[i], data=arrays[i])
string_dt = h5py.special_dtype(vlen=str)
hf.create_dataset(names[i] + "_words", data=string_arrs[i], dtype=string_dt)
return True
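# Hedged counterpart to save_h5py (the load_h5py used in load_pretrained above is
# not shown in these snippets, so this is an assumed reader, not the project's own):
# it returns one numeric dataset together with its companion '<name>_words' strings.
import h5py
import numpy as np

def load_h5py_pair(name, filename="glove.h5"):
    with h5py.File(filename, "r") as hf:
        features = np.array(hf[name])
        words = np.array([w.decode('utf-8') if isinstance(w, bytes) else w
                          for w in hf[name + "_words"][:]])
    return features, words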
def to_hdf5(self, hf, df, **kwargs):
"""
Add datasets to a group for an HDF5 file handler
"""
if self.dielectronic:
grp_name = '/'.join([self.element, self.ion_name, 'dielectronic', self.filetype])
else:
grp_name = '/'.join([self.element, self.ion_name, self.filetype])
if grp_name not in hf:
grp = hf.create_group(grp_name)
grp.attrs['chianti_version'] = df.meta['chianti_version']
grp.attrs['footer'] = df.meta['footer']
else:
grp = hf[grp_name]
hf['/'.join([self.element, self.ion_name])].attrs['element'] = self.element
hf['/'.join([self.element, self.ion_name])].attrs['ion'] = self.ion_name
for name in df.colnames:
col = df[name]
if type(col) == u.Quantity:
data = col.value
else:
data = col.data
if '<U' in data.dtype.str:
numchar = data.dtype.str[2:]
data = data.astype('|S{}'.format(numchar))
if name in grp:
ds = grp[name]
else:
if data.dtype == np.dtype('O'):
ragged_dtype = h5py.special_dtype(vlen=np.dtype('float64'))
ds = grp.create_dataset(name, data=data, dtype=ragged_dtype)
else:
ds = grp.create_dataset(name, data=data, dtype=data.dtype)
if col.unit is None:
ds.attrs['unit'] = 'SKIP'
else:
ds.attrs['unit'] = col.unit.to_string()
ds.attrs['description'] = df.meta['descriptions'][name]
def export_data_h5(vocabulary, embedding_matrix, output='embedding.h5'):
f = h5py.File(output, "w")
compress_option = dict(compression="gzip", compression_opts=9, shuffle=True)
words_flatten = '\n'.join(vocabulary)
f.attrs['vocab_len'] = len(vocabulary)
print(len(vocabulary))
dt = h5py.special_dtype(vlen=str)
_dset_vocab = f.create_dataset('words_flatten', (1, ), dtype=dt, **compress_option)
_dset_vocab[...] = [words_flatten]
_dset = f.create_dataset('embedding', embedding_matrix.shape, dtype=embedding_matrix.dtype, **compress_option)
_dset[...] = embedding_matrix
f.flush()
f.close()
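# A hedged read-back sketch for export_data_h5 (not part of the original source):
# the vocabulary was stored as a single newline-joined variable-length string, so
# it is split on '\n' after decoding. The default path mirrors the writer above.
import h5py

def load_embedding_h5(path='embedding.h5'):
    with h5py.File(path, 'r') as f:
        flat = f['words_flatten'][0]
        if isinstance(flat, bytes):
            flat = flat.decode('utf-8')
        vocabulary = flat.split('\n')
        assert len(vocabulary) == f.attrs['vocab_len']
        embedding_matrix = f['embedding'][:]
    return vocabulary, embedding_matrix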
def cli_render(input, output, size):
'''Render a JSONlines dataset to numpy arrays, saved in an HDF5 file.
'''
chars = []
images = []
for line in input:
datum = json.loads(line)
chars.append(datum['target'])
images.append(render(
[np.array(s) for s in datum['strokes']],
size))
vocab = list(sorted(set(chars)))
char_to_index = {ch: y for y, ch in enumerate(vocab)}
with h5py.File(output, 'a') as f:
str_dt = h5py.special_dtype(vlen=str)
f.require_dataset(
'vocab', (len(vocab),), dtype=str_dt
)[...] = vocab
f.require_dataset(
'x', shape=(len(images), size, size), dtype=np.float32
)[...] = np.array(images)
f.require_dataset(
'y', shape=(len(chars),), dtype=np.int64  # np.int is removed in newer NumPy; use an explicit integer dtype
)[...] = np.array([char_to_index[ch] for ch in chars])
def _save_hdf5(self, buffer_list):
"""
:param buffer_list:
:return:
"""
file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
output_path = os.path.join(self.data_store_path, file_name)
h5file = h5py.File(output_path, 'w')  # chunking is a per-dataset setting, not a File() option
dt_vlen = h5py.special_dtype(vlen=str)
dt_arr = np.dtype((dt_vlen, (self.sent_max_len,)))
h5raw = h5file.create_dataset('rawdata', (len(buffer_list),), dtype=dt_arr)
for i in range(len(buffer_list)):
h5raw[i] = np.array(buffer_list[i], dtype=object)
h5file.flush()
h5file.close()
def hdf_create(self, output_path, filecnt, channel, image_arr, shape_arr, label_arr, name_arr):
h5file = h5py.File(output_path, mode='w')
dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
hdf_features = h5file.create_dataset('image_features', (filecnt,), dtype=dtype)
hdf_shapes = h5file.create_dataset('image_features_shapes', (filecnt, channel),dtype='int32')
hdf_labels = h5file.create_dataset('targets', (filecnt,), dtype='S240')
hdf_names = h5file.create_dataset('names', (filecnt,), dtype='S240')
# Attach shape annotations and scales
hdf_features.dims.create_scale(hdf_shapes, 'shapes')
hdf_features.dims[0].attach_scale(hdf_shapes)
hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels', (3,), dtype='S7')
hdf_shapes_labels[...] = ['channel'.encode('utf8'),
'height'.encode('utf8'),
'width'.encode('utf8')]
hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
hdf_features.dims[0].attach_scale(hdf_shapes_labels)
# Add axis annotations
hdf_features.dims[0].label = 'batch'
for i in range(len(image_arr)):
hdf_features[i] = image_arr[i]
hdf_shapes[i] = shape_arr[i]
hdf_labels[i] = label_arr[i]
hdf_names[i] = name_arr[i]
h5file.flush()
h5file.close()
def write_psites(tpsites,psites_number,filename):
with h5py.File(filename,"w") as fout:
ds = h5py.special_dtype(vlen=str)
dt = h5py.special_dtype(vlen=np.dtype("int32"))
fout.create_dataset("transcript_ids",data=tpsites.keys(),dtype=ds)
fout.create_dataset("p_sites",data=tpsites.values(),dtype=dt, compression="gzip")
fout.create_dataset("psites_number",data=psites_number,dtype="int32")
return None
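# Hedged read-back sketch for write_psites (not in the original source): it
# rebuilds the transcript_id -> p_sites mapping from the ragged int32 dataset.
# The file name 'psites.h5' is hypothetical.
import h5py

def read_psites(filename='psites.h5'):
    with h5py.File(filename, 'r') as fin:
        ids = [t.decode('utf-8') if isinstance(t, bytes) else t
               for t in fin['transcript_ids'][:]]
        tpsites = dict(zip(ids, fin['p_sites'][:]))
        psites_number = fin['psites_number'][:]
    return tpsites, psites_number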
def dump_h5_var(filename, prefix, prefix_shape, data):
''' Dumps variable-length data to a new dataset or
appends to an existing dataset.
'''
h5f = h5py.File(filename, 'a')
ds = h5f.get(prefix)
ds_shp = h5f.get(prefix_shape)
if not ds:
var_dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
ds = h5f.create_dataset(prefix, shape=(len(data),), maxshape=(None,), dtype=var_dt)
dim = len(data[0].shape)
ds_shp = h5f.create_dataset(prefix_shape, shape=(len(data),dim), maxshape=(None,dim), dtype=np.int64)
offset = 0
offset_shp = 0
else:
offset = len(ds)
offset_shp = len(ds_shp)
ds.resize(len(ds) + len(data), axis=0)
ds_shp.resize(len(ds_shp) + len(data), axis=0)
for i in range(len(data)):
ds[offset+i] = data[i].flatten()
ds_shp[offset_shp+i] = data[i].shape
h5f.close()
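# Hedged read-back sketch for dump_h5_var (not from the original source): each
# entry was flattened before storage, so it is reshaped with the companion shape
# dataset. Argument names mirror the writer above.
import h5py
import numpy as np

def load_h5_var(filename, prefix, prefix_shape):
    with h5py.File(filename, 'r') as h5f:
        flat = h5f[prefix][:]
        shapes = h5f[prefix_shape][:]
    return [np.asarray(row).reshape(shape) for row, shape in zip(flat, shapes)]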
def add_word_ids_to_snli(h5_file, vocab):
with h5py.File(h5_file, 'a') as dst:
N = len(dst['sentence1'])
assert len(dst['sentence2']) == N
dst.create_dataset('vocab_words', (vocab.size(),), h5py.special_dtype(vlen=unicode))
dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
dst['vocab_words'][:] = vocab.words
dst['vocab_freqs'][:] = vocab.frequencies
dtype = h5py.special_dtype(vlen=np.dtype('int32'))
sentence1_ds = dst.create_dataset('sentence1_ids', (N, ), dtype=dtype)
sentence2_ds = dst.create_dataset('sentence2_ids', (N, ), dtype=dtype)
### h5py nonsense ###
sentence1_ds_shapes = dst.create_dataset('sentence1_ids_shapes', (N, 1), dtype=("int"))
sentence2_ds_shapes = dst.create_dataset('sentence2_ids_shapes', (N, 1), dtype=("int"))
ds_shape_labels = dst.create_dataset('ds_ids_shape_labels', (1, ), dtype=("S20"))
### h5py nonsense ###
sentence1_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence1'][:]])
sentence2_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence2'][:]])
### h5py nonsense ###
sentence1_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence1'][:]])
sentence2_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence2'][:]])
ds_shape_labels[:] = np.array(['sentence_len'])
sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes')
sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes)
sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
sentence1_ds.dims[0].attach_scale(ds_shape_labels)
sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes')
sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes)
sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
sentence2_ds.dims[0].attach_scale(ds_shape_labels)
### h5py nonsense ###
dst.attrs['split'] = H5PYDataset.create_split_array({
'all': {
'sentence1': (0, N),
'sentence2': (0, N),
'sentence1_ids': (0, N),
'sentence2_ids': (0, N),
'label': (0, N),
'text': (0, len(dst['text']))
}
})
def processNMostCommon(N=3, wavdirpath=PATH_TRAIN_IN_16KWAVS, xmlpicklepath=PATH_TRAIN_OUT_XMLPICKLEFILE, todirrootpath=PATH_TRAIN_OUT_HDF5):
global spectrogramWindowLength
if not os.path.exists(todirrootpath):
os.makedirs(todirrootpath)
spectrogramHeight = 200
f = h5py.File(os.path.join(todirrootpath,"data_top{}_nozero.hdf5".format(N)), "w")
dsetX = f.create_dataset('X', (0,1,spectrogramHeight,spectrogramWindowLength), maxshape=(None, 1,spectrogramHeight,spectrogramWindowLength))
dsety = f.create_dataset('y', (0,N), maxshape=(None,N))
dsetMediaId = f.create_dataset('MediaId', (0,1), maxshape=(None,1))
dsetClassId = f.create_dataset('ClassId', (0,1), maxshape=(None,1), dtype=h5py.special_dtype(vlen=unicode))
import pickle
df = pd.read_pickle(xmlpicklepath) # read the metadata
# to keep only recordings with a given quality, we can filter here by uncommenting the next line
#df = filterByQuality(df, 0, 3)
df["OFGS"] = df.apply(mergeOFGS, axis=1) # merge Order, Family, Genus, Species
df_mc = getMostCommon(df, N) # get N most common classes from the dataset
df = None # let GC free up some memory
print("Metadata loaded")
# Shuffle rows
df_mc = df_mc.iloc[np.random.permutation(len(df_mc))]
df_mc.reset_index(drop=True, inplace=True)
(lb,binaryLabels) = getOneHotClassId(df_mc) # generate one-hot labels
pickle.dump(lb, open(os.path.join(todirrootpath,"labelBinarizer_top{}.pickle".format(N)), 'wb'))
# process the selected files of top N classes and save the data into HDF5
fileRanges = np.hstack((np.arange(0, len(df_mc), 30), len(df_mc)))
for i in range(len(fileRanges)-1):
tempSG = wavsToSpectrogramByList(wavdirpath, df_mc.FileName[fileRanges[i]: fileRanges[i+1]], dontFilter=False)
X, y, fn, cIds = spectrogramListToT4(tempSG, \
binaryLabels[fileRanges[i]: fileRanges[i+1]], \
filenames = df_mc.MediaId[fileRanges[i]: fileRanges[i+1]].values, N=spectrogramWindowLength, \
classIds = df_mc.ClassId[fileRanges[i]: fileRanges[i+1]].values) #convert to t4
pre_len = dsetX.shape[0]
add_len = X.shape[0]
dsetX.resize(pre_len+add_len, axis=0)
dsety.resize(pre_len+add_len, axis=0)
dsetMediaId.resize(pre_len + add_len, axis=0)
dsetClassId.resize(pre_len + add_len, axis=0)
dsetX[pre_len:pre_len+add_len,:,:,:] = X
dsety[pre_len:pre_len+add_len,:] = y
dsetMediaId[pre_len:pre_len+add_len,:] = np.transpose([[int(i) for i in fn]])
dsetClassId[pre_len:pre_len+add_len,:] = np.transpose([[s.encode('utf8') for s in cIds]])
f.flush()
f.close()
return (X,y,fn) # return last batch for debug purposes
def get_mat_test_metadata():
test_f = h5py.File(test_mat_metadata_file, 'w')
f = h5py.File(train_mat_metadata_file, 'r')
refs, ds = f['#refs#'], f['digitStruct']
t_ds = test_f.create_group('digitStruct')
ref_dtype = h5py.special_dtype(ref=h5py.Reference)
t_refs = test_f.create_group('#refs#')
data_idx = 0
def create_t_real_data(ref):
nonlocal data_idx
real = refs[ref]
if isinstance(real, h5py.Group):
created_group = t_refs.create_group('data_%s' % data_idx)
data_idx += 1
attrs = 'label top left width height'.split()
for attr in attrs:
reshaped = real[attr].value.reshape(-1)
data_count = reshaped.shape[0]
if isinstance(reshaped[0], h5py.Reference):
t_real_attr = created_group.create_dataset(attr, shape=(data_count, 1), dtype=ref_dtype)
for i in range(data_count):
t_real_attr[i, 0] = create_t_real_data(reshaped[i])
else:
created_group.create_dataset(attr, data=real[attr].value)
data_idx += 1
return created_group.ref
else:
t_real = t_refs.create_dataset('data_%s' % data_idx, data=real.value)
data_idx += 1
return t_real.ref
def create_t_element(t_group, name, ref_group, data_count):
reshaped = ref_group[name].value.reshape(-1)
data_count = reshaped.shape[0] if data_count is None else data_count
created_dataset = t_group.create_dataset(name, (data_count, 1), dtype=ref_dtype)
for i in range(data_count):
created_dataset[i, 0] = create_t_real_data(reshaped[i])
create_t_element(t_ds, 'name', ds, test_data_count)
create_t_element(t_ds, 'bbox', ds, test_data_count)
test_f.close()
return test_mat_metadata_file
def _main(args):
voc_path = os.path.expanduser(args.path_to_voc)
train_ids = get_ids(voc_path, train_set)
val_ids = get_ids(voc_path, val_set)
test_ids = get_ids(voc_path, test_set)
train_ids_2007 = get_ids(voc_path, sets_from_2007)
total_train_ids = len(train_ids) + len(train_ids_2007)
# Create HDF5 dataset structure
print('Creating HDF5 dataset structure.')
fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5')
voc_h5file = h5py.File(fname, 'w')
uint8_dt = h5py.special_dtype(
vlen=np.dtype('uint8')) # variable length uint8
vlen_int_dt = h5py.special_dtype(
vlen=np.dtype(int)) # variable length default int
train_group = voc_h5file.create_group('train')
val_group = voc_h5file.create_group('val')
test_group = voc_h5file.create_group('test')
# store class list for reference class ids as csv fixed-length numpy string
voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))
# store images as variable length uint8 arrays
train_images = train_group.create_dataset(
'images', shape=(total_train_ids, ), dtype=uint8_dt)
val_images = val_group.create_dataset(
'images', shape=(len(val_ids), ), dtype=uint8_dt)
test_images = test_group.create_dataset(
'images', shape=(len(test_ids), ), dtype=uint8_dt)
# store boxes as class_id, xmin, ymin, xmax, ymax
train_boxes = train_group.create_dataset(
'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt)
val_boxes = val_group.create_dataset(
'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt)
test_boxes = test_group.create_dataset(
'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt)
# process all ids and add to datasets
print('Processing Pascal VOC 2007 datasets for training set.')
last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images,
train_boxes)
print('Processing Pascal VOC 2012 training set.')
add_to_dataset(
voc_path,
'2012',
train_ids,
train_images,
train_boxes,
start=last_2007 + 1)
print('Processing Pascal VOC 2012 val set.')
add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes)
print('Processing Pascal VOC 2007 test set.')
add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
print('Closing HDF5 file.')
voc_h5file.close()
print('Done.')
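# A minimal read-back sketch (an assumption, not part of the original script):
# boxes were flattened as (class_id, xmin, ymin, xmax, ymax) per object, so each
# record is reshaped to (-1, 5); images come back as variable-length uint8 arrays.
import h5py
import numpy as np

def read_voc_example(h5_path, split='train', index=0):
    with h5py.File(h5_path, 'r') as voc:
        image_bytes = np.asarray(voc[split]['images'][index], dtype=np.uint8)
        boxes = np.asarray(voc[split]['boxes'][index]).reshape(-1, 5)
        classes_csv = voc.attrs['classes']  # comma-separated class names
    return image_bytes, boxes, classes_csv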
def convert_dtype(srcdt, ctx):
""" Return a dtype based on input dtype, converting any Reference types from
h5py style to h5pyd and vice-versa.
"""
msg = "convert dtype: {}, type: {}, len: {}".format(srcdt, type(srcdt), len(srcdt))
logging.info(msg)
if ctx["verbose"]:
print(msg)
if len(srcdt) > 0:
fields = []
for name in srcdt.fields:
item = srcdt.fields[name]
# item is a tuple of dtype and integer offset
field_dt = convert_dtype(item[0], ctx)
fields.append((name, field_dt))
tgt_dt = np.dtype(fields)
else:
# check if this a "special dtype"
if srcdt.metadata and 'ref' in srcdt.metadata:
ref = srcdt.metadata['ref']
if is_reference(ref):
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(ref=h5py.Reference)
else:
tgt_dt = h5pyd.special_dtype(ref=h5pyd.Reference)
elif is_regionreference(ref):
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(ref=h5py.RegionReference)
else:
tgt_dt = h5pyd.special_dtype(ref=h5pyd.RegionReference)
else:
msg = "Unexpected ref type: {}".format(srcdt)
logging.error(msg)
raise TypeError(msg)
elif srcdt.metadata and 'vlen' in srcdt.metadata:
src_vlen = srcdt.metadata['vlen']
tgt_base = convert_dtype(src_vlen, ctx)
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(vlen=tgt_base)
else:
tgt_dt = h5pyd.special_dtype(vlen=tgt_base)
else:
tgt_dt = srcdt
return tgt_dt
#----------------------------------------------------------------------------------