import h5py
import numpy as np
from fuel.datasets.hdf5 import H5PYDataset


def add_word_ids_to_snli(h5_file, vocab):
    with h5py.File(h5_file, 'a') as dst:
        N = len(dst['sentence1'])
        assert len(dst['sentence2']) == N

        # Store the vocabulary (words and their frequencies) alongside the
        # data. `vlen=unicode` is the Python 2 spelling; use `vlen=str`
        # under Python 3.
        dst.create_dataset('vocab_words', (vocab.size(),),
                           h5py.special_dtype(vlen=unicode))
        dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
        dst['vocab_words'][:] = vocab.words
        dst['vocab_freqs'][:] = vocab.frequencies

        # Each row is a variable-length sequence of int32 word ids.
        dtype = h5py.special_dtype(vlen=np.dtype('int32'))
        sentence1_ds = dst.create_dataset('sentence1_ids', (N,), dtype=dtype)
        sentence2_ds = dst.create_dataset('sentence2_ids', (N,), dtype=dtype)

        ### h5py nonsense ###
        # Fuel wants explicit `shapes` and `shape_labels` dimension scales
        # for variable-length sources.
        sentence1_ds_shapes = dst.create_dataset(
            'sentence1_ids_shapes', (N, 1), dtype='int')
        sentence2_ds_shapes = dst.create_dataset(
            'sentence2_ids_shapes', (N, 1), dtype='int')
        ds_shape_labels = dst.create_dataset(
            'ds_ids_shape_labels', (1,), dtype='S20')
        ### h5py nonsense ###

        # Map every token to its vocabulary id. The ragged rows become an
        # object array (newer numpy requires an explicit dtype=object).
        sentence1_ds[:] = np.array(
            [[vocab.word_to_id(w) for w in s] for s in dst['sentence1'][:]])
        sentence2_ds[:] = np.array(
            [[vocab.word_to_id(w) for w in s] for s in dst['sentence2'][:]])

        ### h5py nonsense ###
        # Record each sentence's length as its shape.
        sentence1_ds_shapes[:] = np.array(
            [np.array(x).shape for x in dst['sentence1'][:]])
        sentence2_ds_shapes[:] = np.array(
            [np.array(x).shape for x in dst['sentence2'][:]])
        ds_shape_labels[:] = np.array(['sentence_len'])

        # `dims.create_scale` is the pre-2.10 h5py API; newer h5py
        # deprecates it in favour of `Dataset.make_scale`.
        sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes')
        sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes)
        sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence1_ds.dims[0].attach_scale(ds_shape_labels)
        sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes')
        sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes)
        sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence2_ds.dims[0].attach_scale(ds_shape_labels)
        ### h5py nonsense ###

        # Mark every row as belonging to a single 'all' split.
        dst.attrs['split'] = H5PYDataset.create_split_array({
            'all': {
                'sentence1': (0, N),
                'sentence2': (0, N),
                'sentence1_ids': (0, N),
                'sentence2_ids': (0, N),
                'label': (0, N),
                'text': (0, len(dst['text'])),
            }
        })
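The function assumes a `vocab` object exposing `size()`, `words`, `frequencies`, and `word_to_id()`. A minimal stand-in satisfying that interface might look like the sketch below; the `Vocab` class and the `snli.h5` path are illustrative, not part of the original code.

```python
# Hypothetical stand-in for the vocab interface add_word_ids_to_snli assumes.
class Vocab(object):
    def __init__(self, words, frequencies):
        self.words = words              # list of unicode tokens
        self.frequencies = frequencies  # parallel list of corpus counts
        self._ids = {w: i for i, w in enumerate(words)}

    def size(self):
        return len(self.words)

    def word_to_id(self, word):
        # id 0 is assumed here to double as the unknown-word id
        return self._ids.get(word, 0)


vocab = Vocab([u'<unk>', u'a', u'man'], [0, 120, 37])
add_word_ids_to_snli('snli.h5', vocab)
```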
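The dimension-scale bookkeeping exists so that Fuel's `H5PYDataset` can serve the ragged `sentence1_ids`/`sentence2_ids` sources. A sketch of reading the file back, assuming it was written by the function above:

```python
from fuel.datasets.hdf5 import H5PYDataset

dataset = H5PYDataset('snli.h5', which_sets=('all',),
                      sources=('sentence1_ids', 'sentence2_ids', 'label'))
handle = dataset.open()
# Returns a tuple in the order of `sources`; ids come back as
# variable-length int32 arrays, one per sentence.
sentence1_ids, sentence2_ids, label = dataset.get_data(handle, slice(0, 2))
print(sentence1_ids[0])
dataset.close(handle)
```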