def combine_data_frame_files(output_filename, input_filenames):
    """Concatenate a set of HDF5 "data frame" files into one output file.

    Every input file must carry the same set of column names in its
    file-level ``column_names`` attribute; each column is written to the
    output by appending the per-file datasets in ``input_filenames`` order.
    Columns where a significant fraction of inputs use level (categorical)
    encoding are delegated to ``combine_level_column``; all other columns
    are expanded to their raw values and stored under a promoted common dtype.

    Args:
        output_filename: Path of the HDF5 file to create (overwritten if it
            already exists).
        input_filenames: Iterable of paths to the HDF5 files to combine.

    Raises:
        Exception: If the inputs have mismatched column sets, or if no
            input files were supplied.
    """
    in_files = [h5py.File(f, 'r') for f in input_filenames]
    try:
        # NOTE(review): assumes every file has a "column_names" attr;
        # attrs.get() returns None for a missing attr and sorted(None)
        # would raise TypeError — confirm inputs are always well-formed.
        column_names = [tuple(sorted(f.attrs.get("column_names"))) for f in in_files]
        uniq = set(column_names)
        if len(uniq) > 1:
            raise Exception("you're attempting to combine incompatible data frames")
        if len(uniq) == 0:
            r = "No input files? output: %s, inputs: %s" % (output_filename, str(input_filenames))
            raise Exception(r)
        column_names = uniq.pop()

        if os.path.exists(output_filename):
            os.remove(output_filename)
        # Explicit 'w' mode: h5py >= 3 requires a mode, and since any
        # pre-existing file was just removed this matches the old implicit
        # append-mode behavior exactly.
        out = h5py.File(output_filename, 'w')
        try:
            out.attrs.create("column_names", column_names)
            # Write successive columns.
            for c in column_names:
                # Only non-empty per-file datasets contribute rows.
                datasets = [f[c] for f in in_files if len(f[c]) > 0]
                num_w_levels = np.sum([has_levels(ds) for ds in datasets])
                # +1 in the denominator keeps this finite when no file has
                # rows for this column (then the fraction is 0, not NaN).
                fract_w_levels = float(num_w_levels) / (len(datasets) + 1)
                if fract_w_levels > 0.25:
                    combine_level_column(out, datasets, c)
                    continue

                # Determine the promoted output dtype from the non-empty
                # datasets; fall back to the empty ones if every input is
                # empty, so we still get a valid dtype for the column.
                types = [get_col_type(ds) for ds in datasets]
                if len(types) == 0:
                    types = [get_col_type(f[c]) for f in in_files]
                common_type = reduce(np.promote_types, types)
                # numpy can't promote vlen strings -- if any input uses
                # them, the whole column must be vlen string.
                if vlen_string in types:
                    common_type = vlen_string

                out_ds = out.create_dataset(
                    c,
                    shape=(0,),
                    maxshape=(None,),
                    dtype=common_type,
                    compression=COMPRESSION,
                    shuffle=True,
                    chunks=(CHUNK_SIZE,),
                )

                # Append each input's rows, growing the resizable dataset.
                item_count = 0
                for ds in datasets:
                    new_items = ds.shape[0]
                    out_ds.resize((item_count + new_items,))
                    data = ds[:]
                    if has_levels(ds):
                        # Expand level-coded values to their raw form so the
                        # combined column is uniformly plain-valued.
                        levels = get_levels(ds)
                        data = levels[data]
                    out_ds[item_count:(item_count + new_items)] = data
                    item_count += new_items
        finally:
            out.close()
    finally:
        # Always release input handles, even when validation or writing fails.
        for in_f in in_files:
            in_f.close()
# 评论列表 ("comment list") — page-scrape residue, not part of the code
# 文章目录 ("article table of contents") — page-scrape residue, not part of the code