def create_levels(ds, levels):
# Create a dataset in the LEVEL_GROUP
# and store as native numpy / h5py types
level_grp = ds.file.get(LEVEL_GROUP)
if level_grp is None:
# Create a LEVEL_GROUP
level_grp = ds.file.create_group(LEVEL_GROUP)
ds_name = ds.name.split("/")[-1]
dt = h5py.special_dtype(vlen=str)
level_grp.create_dataset(ds_name,
shape = [len(levels)],
maxshape = (None,),
dtype = dt,
data = levels,
compression = COMPRESSION,
chunks = (CHUNK_SIZE,))
def text_to_h5py_dataset(text_path, dst_path):
# The simplest is to load everything to memory first.
# If memory becomes an issue, this code can be optimized.
words = []
with open(text_path, 'r') as src:
for line in src:
words.extend(line.strip().split())
with h5py.File(dst_path, 'w') as dst:
dtype = h5py.special_dtype(vlen=bytes)
table = dst.create_dataset('words', (len(words),), dtype=dtype)
table[:] = words
dst.attrs['split'] = H5PYDataset.create_split_array({
'train' : {
'words' : (0, len(words))
}
})
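# A hedged read-back sketch (not part of the original snippets): it decodes the
# variable-length byte strings written by text_to_h5py_dataset above. The file
# path 'words.h5' is hypothetical.
import h5py

def read_words_h5(path='words.h5'):
    with h5py.File(path, 'r') as src:
        # entries stored with vlen=bytes come back as Python bytes objects
        return [w.decode('utf-8') for w in src['words'][:]]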
def add_words_ids_to_squad(h5_file, vocab):
"""Digitizes test with a vocabulary.
Also saves the vocabulary into the hdf5 file.
"""
with h5py.File(h5_file, 'a') as dst:
unicode_dtype = h5py.special_dtype(vlen=unicode)
dst.create_dataset('text_ids', (dst['text'].shape[0],), 'int64')
dst.create_dataset('vocab_words', (vocab.size(),), unicode_dtype)
dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
dst['text_ids'][:] = map(vocab.word_to_id, dst['text'][:])
dst['vocab_words'][:] = vocab.words
dst['vocab_freqs'][:] = vocab.frequencies
def add_text_h5(fid, path, data):
"""Add text data (UTF-8) to the given path in the HDF5 file
with handle fid.
Arguments:
- fid is the file handle to the HDF5 file
- path is the base path inside the HDF5 file
- data is the text data as a string
"""
dset = fid.create_dataset(name=path,
shape=(1,),
dtype=h5py.special_dtype(vlen=str),
data=data,
compression="gzip")
def load_pretrained():
#glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
glove_vec = ["glove_wiki_300"]
#glove_vec = ["glove_wiki_50"]
filename = 'glove_pretrained.h5'
#import tensorflow as tf
#sess = tf.InteractiveSession()
features, words = load_h5py('glove_wiki_300',filename=root + glove_vec_fold + filename)
filename = 'glove.h5'
features = normalize(np.array(features), axis=1, norm='l2')
with h5py.File(root + glove_vec_fold + filename, "w") as hf:
hf.create_dataset(glove_vec[0], data=features)
string_dt = h5py.special_dtype(vlen=str)
hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)
for vec in glove_vec:
data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
print(data.shape, words.shape)
time.sleep(5)
def create_partition_function(self, f_w2v, f_h5):
print("Building the partition function")
# Load the model from disk
M = load_w2vec()
words = M.wv.index2word
ZT = []
INPUT_ITR = tqdm.tqdm(words)
# Compute the partition function for each word
for w in INPUT_ITR:
UE = self.energy(M.wv.syn0, M[w])
z = compute_partition_stats(UE)
ZT.append(z)
# Save the partition function to disk
# (special care needed for h5py unicode strings)
dt = h5py.special_dtype(vlen=unicode)
with h5py.File(f_h5, 'w') as h5:
h5.create_dataset("words", (len(words),),
dtype=dt,
data=[w.encode('utf8') for w in words])
h5.attrs['vocab_N'] = len(words)
h5['Z'] = ZT
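# A minimal read-back sketch (an assumption, not shown in the original project):
# it reopens the file written above, decoding the UTF-8 encoded vocabulary and
# loading the per-word partition values Z. The path 'partition.h5' is hypothetical.
import h5py

def read_partition_function(path='partition.h5'):
    with h5py.File(path, 'r') as h5:
        words = [w.decode('utf8') if isinstance(w, bytes) else w
                 for w in h5['words'][:]]
        Z = h5['Z'][:]
        assert len(words) == h5.attrs['vocab_N']
    return words, Z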
def _precompute(self, Xy_generator, cache):
with h5py.File(cache, mode='w') as fp:
# initialize with a fixed number of sequences
n_sequences = 1000
y = fp.create_dataset(
'y', shape=(n_sequences, ),
dtype=h5py.special_dtype(vlen=bytes),
maxshape=(None, ))
for i, (X_, y_) in enumerate(Xy_generator):
if i == 0:
_, n_samples, n_features = X_.shape
X = fp.create_dataset(
'X', dtype=X_.dtype, compression='gzip',
shape=(n_sequences, n_samples, n_features),
chunks=(1, n_samples, n_features),
maxshape=(None, n_samples, n_features))
# increase number of sequences on demand
if i == n_sequences:
n_sequences = int(n_sequences * 1.1)
y.resize(n_sequences, axis=0)
X.resize(n_sequences, axis=0)
# store current X, y in file
y[i] = y_
X[i] = X_
# resize datasets to exactly match the number of sequences seen (i is the last index)
y.resize(i + 1, axis=0)
X.resize(i + 1, axis=0)
def save_h5py(arrays, string_arrs, names, filename="glove.h5"):
with h5py.File(filename, "w") as hf:
for i in range(len(arrays)):
hf.create_dataset(names[i], data=arrays[i])
string_dt = h5py.special_dtype(vlen=str)
hf.create_dataset(names[i] + "_words", data=string_arrs[i], dtype=string_dt)
return True
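# Hedged counterpart to save_h5py (the load_h5py used in load_pretrained above is
# not shown in these snippets, so this is an assumed reader, not the project's own):
# it returns one numeric dataset together with its companion '<name>_words' strings.
import h5py
import numpy as np

def load_h5py_pair(name, filename="glove.h5"):
    with h5py.File(filename, "r") as hf:
        features = np.array(hf[name])
        words = np.array([w.decode('utf-8') if isinstance(w, bytes) else w
                          for w in hf[name + "_words"][:]])
    return features, words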
def to_hdf5(self, hf, df, **kwargs):
"""
Add datasets to a group for an HDF5 file handler
"""
if self.dielectronic:
grp_name = '/'.join([self.element, self.ion_name, 'dielectronic', self.filetype])
else:
grp_name = '/'.join([self.element, self.ion_name, self.filetype])
if grp_name not in hf:
grp = hf.create_group(grp_name)
grp.attrs['chianti_version'] = df.meta['chianti_version']
grp.attrs['footer'] = df.meta['footer']
else:
grp = hf[grp_name]
hf['/'.join([self.element, self.ion_name])].attrs['element'] = self.element
hf['/'.join([self.element, self.ion_name])].attrs['ion'] = self.ion_name
for name in df.colnames:
col = df[name]
if type(col) == u.Quantity:
data = col.value
else:
data = col.data
if '<U' in data.dtype.str:
numchar = data.dtype.str[2:]
data = data.astype('|S{}'.format(numchar))
if name in grp:
ds = grp[name]
else:
if data.dtype == np.dtype('O'):
ragged_dtype = h5py.special_dtype(vlen=np.dtype('float64'))
ds = grp.create_dataset(name, data=data, dtype=ragged_dtype)
else:
ds = grp.create_dataset(name, data=data, dtype=data.dtype)
if col.unit is None:
ds.attrs['unit'] = 'SKIP'
else:
ds.attrs['unit'] = col.unit.to_string()
ds.attrs['description'] = df.meta['descriptions'][name]
def export_data_h5(vocabulary, embedding_matrix, output='embedding.h5'):
f = h5py.File(output, "w")
compress_option = dict(compression="gzip", compression_opts=9, shuffle=True)
words_flatten = '\n'.join(vocabulary)
f.attrs['vocab_len'] = len(vocabulary)
print(len(vocabulary))
dt = h5py.special_dtype(vlen=str)
_dset_vocab = f.create_dataset('words_flatten', (1, ), dtype=dt, **compress_option)
_dset_vocab[...] = [words_flatten]
_dset = f.create_dataset('embedding', embedding_matrix.shape, dtype=embedding_matrix.dtype, **compress_option)
_dset[...] = embedding_matrix
f.flush()
f.close()
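# A hedged read-back sketch for export_data_h5 (not part of the original source):
# the vocabulary was stored as a single newline-joined variable-length string, so
# it is split on '\n' after decoding. The default path mirrors the writer above.
import h5py

def load_embedding_h5(path='embedding.h5'):
    with h5py.File(path, 'r') as f:
        flat = f['words_flatten'][0]
        if isinstance(flat, bytes):
            flat = flat.decode('utf-8')
        vocabulary = flat.split('\n')
        assert len(vocabulary) == f.attrs['vocab_len']
        embedding_matrix = f['embedding'][:]
    return vocabulary, embedding_matrix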
def cli_render(input, output, size):
'''Render a JSONlines dataset to numpy arrays, saved in an HDF5 file.
'''
chars = []
images = []
for line in input:
datum = json.loads(line)
chars.append(datum['target'])
images.append(render(
[np.array(s) for s in datum['strokes']],
size))
vocab = list(sorted(set(chars)))
char_to_index = {ch: y for y, ch in enumerate(vocab)}
with h5py.File(output, 'a') as f:
str_dt = h5py.special_dtype(vlen=str)
f.require_dataset(
'vocab', (len(vocab),), dtype=str_dt
)[...] = vocab
f.require_dataset(
'x', shape=(len(images), size, size), dtype=np.float32
)[...] = np.array(images)
f.require_dataset(
'y', shape=(len(chars),), dtype=np.int64  # np.int is removed in newer NumPy; use an explicit integer dtype
)[...] = np.array([char_to_index[ch] for ch in chars])
def _save_hdf5(self, buffer_list):
"""
:param buffer_list:
:return:
"""
file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
output_path = os.path.join(self.data_store_path, file_name)
h5file = h5py.File(output_path, 'w')  # chunking is a per-dataset setting, not a File() option
dt_vlen = h5py.special_dtype(vlen=str)
dt_arr = np.dtype((dt_vlen, (self.sent_max_len,)))
h5raw = h5file.create_dataset('rawdata', (len(buffer_list),), dtype=dt_arr)
for i in range(len(buffer_list)):
h5raw[i] = np.array(buffer_list[i], dtype=object)
h5file.flush()
h5file.close()
def hdf_create(self, output_path, filecnt, channel, image_arr, shape_arr, label_arr, name_arr):
h5file = h5py.File(output_path, mode='w')
dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
hdf_features = h5file.create_dataset('image_features', (filecnt,), dtype=dtype)
hdf_shapes = h5file.create_dataset('image_features_shapes', (filecnt, channel),dtype='int32')
hdf_labels = h5file.create_dataset('targets', (filecnt,), dtype='S240')
hdf_names = h5file.create_dataset('names', (filecnt,), dtype='S240')
# Attach shape annotations and scales
hdf_features.dims.create_scale(hdf_shapes, 'shapes')
hdf_features.dims[0].attach_scale(hdf_shapes)
hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels', (3,), dtype='S7')
hdf_shapes_labels[...] = ['channel'.encode('utf8'),
'height'.encode('utf8'),
'width'.encode('utf8')]
hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
hdf_features.dims[0].attach_scale(hdf_shapes_labels)
# Add axis annotations
hdf_features.dims[0].label = 'batch'
for i in range(len(image_arr)):
hdf_features[i] = image_arr[i]
hdf_shapes[i] = shape_arr[i]
hdf_labels[i] = label_arr[i]
hdf_names[i] = name_arr[i]
h5file.flush()
h5file.close()
def write_psites(tpsites,psites_number,filename):
with h5py.File(filename,"w") as fout:
ds = h5py.special_dtype(vlen=str)
dt = h5py.special_dtype(vlen=np.dtype("int32"))
fout.create_dataset("transcript_ids",data=tpsites.keys(),dtype=ds)
fout.create_dataset("p_sites",data=tpsites.values(),dtype=dt, compression="gzip")
fout.create_dataset("psites_number",data=psites_number,dtype="int32")
return None
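# Hedged read-back sketch for write_psites (not in the original source): it
# rebuilds the transcript_id -> p_sites mapping from the ragged int32 dataset.
# The file name 'psites.h5' is hypothetical.
import h5py

def read_psites(filename='psites.h5'):
    with h5py.File(filename, 'r') as fin:
        ids = [t.decode('utf-8') if isinstance(t, bytes) else t
               for t in fin['transcript_ids'][:]]
        tpsites = dict(zip(ids, fin['p_sites'][:]))
        psites_number = fin['psites_number'][:]
    return tpsites, psites_number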
def dump_h5_var(filename, prefix, prefix_shape, data):
''' Dumps variable-length data to a new dataset or
appends to an existing dataset.
'''
h5f = h5py.File(filename, 'a')
ds = h5f.get(prefix)
ds_shp = h5f.get(prefix_shape)
if not ds:
var_dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
ds = h5f.create_dataset(prefix, shape=(len(data),), maxshape=(None,), dtype=var_dt)
dim = len(data[0].shape)
ds_shp = h5f.create_dataset(prefix_shape, shape=(len(data),dim), maxshape=(None,dim), dtype=np.int64)
offset = 0
offset_shp = 0
else:
offset = len(ds)
offset_shp = len(ds_shp)
ds.resize(len(ds) + len(data), axis=0)
ds_shp.resize(len(ds_shp) + len(data), axis=0)
for i in range(len(data)):
ds[offset+i] = data[i].flatten()
ds_shp[offset_shp+i] = data[i].shape
h5f.close()
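# Hedged read-back sketch for dump_h5_var (not from the original source): each
# entry was flattened before storage, so it is reshaped with the companion shape
# dataset. Argument names mirror the writer above.
import h5py
import numpy as np

def load_h5_var(filename, prefix, prefix_shape):
    with h5py.File(filename, 'r') as h5f:
        flat = h5f[prefix][:]
        shapes = h5f[prefix_shape][:]
    return [np.asarray(row).reshape(shape) for row, shape in zip(flat, shapes)]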
def add_word_ids_to_snli(h5_file, vocab):
with h5py.File(h5_file, 'a') as dst:
N = len(dst['sentence1'])
assert len(dst['sentence2']) == N
dst.create_dataset('vocab_words', (vocab.size(),), h5py.special_dtype(vlen=unicode))
dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
dst['vocab_words'][:] = vocab.words
dst['vocab_freqs'][:] = vocab.frequencies
dtype = h5py.special_dtype(vlen=np.dtype('int32'))
sentence1_ds = dst.create_dataset('sentence1_ids', (N, ), dtype=dtype)
sentence2_ds = dst.create_dataset('sentence2_ids', (N, ), dtype=dtype)
### h5py nonsense ###
sentence1_ds_shapes = dst.create_dataset('sentence1_ids_shapes', (N, 1), dtype=("int"))
sentence2_ds_shapes = dst.create_dataset('sentence2_ids_shapes', (N, 1), dtype=("int"))
ds_shape_labels = dst.create_dataset('ds_ids_shape_labels', (1, ), dtype=("S20"))
### h5py nonsense ###
sentence1_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence1'][:]])
sentence2_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence2'][:]])
### h5py nonsense ###
sentence1_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence1'][:]])
sentence2_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence2'][:]])
ds_shape_labels[:] = np.array(['sentence_len'])
sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes')
sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes)
sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
sentence1_ds.dims[0].attach_scale(ds_shape_labels)
sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes')
sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes)
sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
sentence2_ds.dims[0].attach_scale(ds_shape_labels)
### h5py nonsense ###
dst.attrs['split'] = H5PYDataset.create_split_array({
'all': {
'sentence1': (0, N),
'sentence2': (0, N),
'sentence1_ids': (0, N),
'sentence2_ids': (0, N),
'label': (0, N),
'text': (0, len(dst['text']))
}
})
def processNMostCommon(N=3, wavdirpath=PATH_TRAIN_IN_16KWAVS, xmlpicklepath=PATH_TRAIN_OUT_XMLPICKLEFILE, todirrootpath=PATH_TRAIN_OUT_HDF5):
global spectrogramWindowLength
if not os.path.exists(todirrootpath):
os.makedirs(todirrootpath)
spectrogramHeight = 200
f = h5py.File(os.path.join(todirrootpath,"data_top{}_nozero.hdf5".format(N)), "w")
dsetX = f.create_dataset('X', (0,1,spectrogramHeight,spectrogramWindowLength), maxshape=(None, 1,spectrogramHeight,spectrogramWindowLength))
dsety = f.create_dataset('y', (0,N), maxshape=(None,N))
dsetMediaId = f.create_dataset('MediaId', (0,1), maxshape=(None,1))
dsetClassId = f.create_dataset('ClassId', (0,1), maxshape=(None,1), dtype=h5py.special_dtype(vlen=unicode))
import pickle
df = pd.read_pickle(xmlpicklepath) # read the metadata
# to keep only recordings with a given quality, we can filter here by uncommenting the next line
#df = filterByQuality(df, 0, 3)
df["OFGS"] = df.apply(mergeOFGS, axis=1) # merge Order, Family, Genus, Species
df_mc = getMostCommon(df, N) # get N most common classes from the dataset
df = None # let GC free up some memory
print("Metadata loaded")
# Shuffle rows
df_mc = df_mc.iloc[np.random.permutation(len(df_mc))]
df_mc.reset_index(drop=True, inplace=True)
(lb,binaryLabels) = getOneHotClassId(df_mc) # generate one-hot labels
pickle.dump(lb, open(os.path.join(todirrootpath,"labelBinarizer_top{}.pickle".format(N)), 'wb'))
# process the selected files of top N classes and save the data into HDF5
fileRanges = np.hstack((np.arange(0, len(df_mc), 30), len(df_mc)))
for i in range(len(fileRanges)-1):
tempSG = wavsToSpectrogramByList(wavdirpath, df_mc.FileName[fileRanges[i]: fileRanges[i+1]], dontFilter=False)
X, y, fn, cIds = spectrogramListToT4(tempSG, \
binaryLabels[fileRanges[i]: fileRanges[i+1]], \
filenames = df_mc.MediaId[fileRanges[i]: fileRanges[i+1]].values, N=spectrogramWindowLength, \
classIds = df_mc.ClassId[fileRanges[i]: fileRanges[i+1]].values) #convert to t4
pre_len = dsetX.shape[0]
add_len = X.shape[0]
dsetX.resize(pre_len+add_len, axis=0)
dsety.resize(pre_len+add_len, axis=0)
dsetMediaId.resize(pre_len + add_len, axis=0)
dsetClassId.resize(pre_len + add_len, axis=0)
dsetX[pre_len:pre_len+add_len,:,:,:] = X
dsety[pre_len:pre_len+add_len,:] = y
dsetMediaId[pre_len:pre_len+add_len,:] = np.transpose([[int(i) for i in fn]])
dsetClassId[pre_len:pre_len+add_len,:] = np.transpose([[s.encode('utf8') for s in cIds]])
f.flush()
f.close()
return (X,y,fn) # return last batch for debug purposes
def get_mat_test_metadata():
test_f = h5py.File(test_mat_metadata_file, 'w')
f = h5py.File(train_mat_metadata_file, 'r')
refs, ds = f['#refs#'], f['digitStruct']
t_ds = test_f.create_group('digitStruct')
ref_dtype = h5py.special_dtype(ref=h5py.Reference)
t_refs = test_f.create_group('#refs#')
data_idx = 0
def create_t_real_data(ref):
nonlocal data_idx
real = refs[ref]
if isinstance(real, h5py.Group):
created_group = t_refs.create_group('data_%s' % data_idx)
data_idx += 1
attrs = 'label top left width height'.split()
for attr in attrs:
reshaped = real[attr].value.reshape(-1)
data_count = reshaped.shape[0]
if isinstance(reshaped[0], h5py.Reference):
t_real_attr = created_group.create_dataset(attr, shape=(data_count, 1), dtype=ref_dtype)
for i in range(data_count):
t_real_attr[i, 0] = create_t_real_data(reshaped[i])
else:
created_group.create_dataset(attr, data=real[attr].value)
data_idx += 1
return created_group.ref
else:
t_real = t_refs.create_dataset('data_%s' % data_idx, data=real.value)
data_idx += 1
return t_real.ref
def create_t_element(t_group, name, ref_group, data_count):
reshaped = ref_group[name].value.reshape(-1)
data_count = reshaped.shape[0] if data_count is None else data_count
created_dataset = t_group.create_dataset(name, (data_count, 1), dtype=ref_dtype)
for i in range(data_count):
created_dataset[i, 0] = create_t_real_data(reshaped[i])
create_t_element(t_ds, 'name', ds, test_data_count)
create_t_element(t_ds, 'bbox', ds, test_data_count)
test_f.close()
return test_mat_metadata_file
def _main(args):
voc_path = os.path.expanduser(args.path_to_voc)
train_ids = get_ids(voc_path, train_set)
val_ids = get_ids(voc_path, val_set)
test_ids = get_ids(voc_path, test_set)
train_ids_2007 = get_ids(voc_path, sets_from_2007)
total_train_ids = len(train_ids) + len(train_ids_2007)
# Create HDF5 dataset structure
print('Creating HDF5 dataset structure.')
fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5')
voc_h5file = h5py.File(fname, 'w')
uint8_dt = h5py.special_dtype(
vlen=np.dtype('uint8')) # variable length uint8
vlen_int_dt = h5py.special_dtype(
vlen=np.dtype(int)) # variable length default int
train_group = voc_h5file.create_group('train')
val_group = voc_h5file.create_group('val')
test_group = voc_h5file.create_group('test')
# store class list for reference class ids as csv fixed-length numpy string
voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))
# store images as variable length uint8 arrays
train_images = train_group.create_dataset(
'images', shape=(total_train_ids, ), dtype=uint8_dt)
val_images = val_group.create_dataset(
'images', shape=(len(val_ids), ), dtype=uint8_dt)
test_images = test_group.create_dataset(
'images', shape=(len(test_ids), ), dtype=uint8_dt)
# store boxes as class_id, xmin, ymin, xmax, ymax
train_boxes = train_group.create_dataset(
'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt)
val_boxes = val_group.create_dataset(
'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt)
test_boxes = test_group.create_dataset(
'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt)
# process all ids and add to datasets
print('Processing Pascal VOC 2007 datasets for training set.')
last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images,
train_boxes)
print('Processing Pascal VOC 2012 training set.')
add_to_dataset(
voc_path,
'2012',
train_ids,
train_images,
train_boxes,
start=last_2007 + 1)
print('Processing Pascal VOC 2012 val set.')
add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes)
print('Processing Pascal VOC 2007 test set.')
add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
print('Closing HDF5 file.')
voc_h5file.close()
print('Done.')
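# A minimal read-back sketch (an assumption, not part of the original script):
# boxes were flattened as (class_id, xmin, ymin, xmax, ymax) per object, so each
# record is reshaped to (-1, 5); images come back as variable-length uint8 arrays.
import h5py
import numpy as np

def read_voc_example(h5_path, split='train', index=0):
    with h5py.File(h5_path, 'r') as voc:
        image_bytes = np.asarray(voc[split]['images'][index], dtype=np.uint8)
        boxes = np.asarray(voc[split]['boxes'][index]).reshape(-1, 5)
        classes_csv = voc.attrs['classes']  # comma-separated class names
    return image_bytes, boxes, classes_csv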
def convert_dtype(srcdt, ctx):
""" Return a dtype based on input dtype, converting any Reference types from
h5py style to h5pyd and vice-versa.
"""
msg = "convert dtype: {}, type: {}, len: {}".format(srcdt, type(srcdt), len(srcdt))
logging.info(msg)
if ctx["verbose"]:
print(msg)
if len(srcdt) > 0:
fields = []
for name in srcdt.fields:
item = srcdt.fields[name]
# item is a tuple of dtype and integer offset
field_dt = convert_dtype(item[0], ctx)
fields.append((name, field_dt))
tgt_dt = np.dtype(fields)
else:
# check if this a "special dtype"
if srcdt.metadata and 'ref' in srcdt.metadata:
ref = srcdt.metadata['ref']
if is_reference(ref):
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(ref=h5py.Reference)
else:
tgt_dt = h5pyd.special_dtype(ref=h5pyd.Reference)
elif is_regionreference(ref):
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(ref=h5py.RegionReference)
else:
tgt_dt = h5pyd.special_dtype(ref=h5pyd.RegionReference)
else:
msg = "Unexpected ref type: {}".format(srcdt)
logging.error(msg)
raise TypeError(msg)
elif srcdt.metadata and 'vlen' in srcdt.metadata:
src_vlen = srcdt.metadata['vlen']
tgt_base = convert_dtype(src_vlen, ctx)
if is_h5py(ctx['fout']):
tgt_dt = h5py.special_dtype(vlen=tgt_base)
else:
tgt_dt = h5pyd.special_dtype(vlen=tgt_base)
else:
tgt_dt = srcdt
return tgt_dt
#----------------------------------------------------------------------------------