def combine_data_frame_files(output_filename, input_filenames):
    """Concatenate a set of HDF5 "data frame" files into one output file.

    Every input file must carry the same set of column names in its
    file-level ``column_names`` attribute; each column is written to the
    output by appending the per-file datasets in ``input_filenames`` order.
    Columns where a significant fraction of inputs use level (categorical)
    encoding are delegated to ``combine_level_column``; all other columns
    are expanded to their raw values and stored under a promoted common dtype.

    Args:
        output_filename: Path of the HDF5 file to create (overwritten if it
            already exists).
        input_filenames: Iterable of paths to the HDF5 files to combine.

    Raises:
        Exception: If the inputs have mismatched column sets, or if no
            input files were supplied.
    """
    in_files = [h5py.File(f, 'r') for f in input_filenames]
    try:
        # NOTE(review): assumes every file has a "column_names" attr;
        # attrs.get() returns None for a missing attr and sorted(None)
        # would raise TypeError — confirm inputs are always well-formed.
        column_names = [tuple(sorted(f.attrs.get("column_names"))) for f in in_files]
        uniq = set(column_names)
        if len(uniq) > 1:
            raise Exception("you're attempting to combine incompatible data frames")
        if len(uniq) == 0:
            r = "No input files? output: %s, inputs: %s" % (output_filename, str(input_filenames))
            raise Exception(r)
        column_names = uniq.pop()

        if os.path.exists(output_filename):
            os.remove(output_filename)
        # Explicit 'w' mode: h5py >= 3 requires a mode, and since any
        # pre-existing file was just removed this matches the old implicit
        # append-mode behavior exactly.
        out = h5py.File(output_filename, 'w')
        try:
            out.attrs.create("column_names", column_names)
            # Write successive columns.
            for c in column_names:
                # Only non-empty per-file datasets contribute rows.
                datasets = [f[c] for f in in_files if len(f[c]) > 0]
                num_w_levels = np.sum([has_levels(ds) for ds in datasets])
                # +1 in the denominator keeps this finite when no file has
                # rows for this column (then the fraction is 0, not NaN).
                fract_w_levels = float(num_w_levels) / (len(datasets) + 1)
                if fract_w_levels > 0.25:
                    combine_level_column(out, datasets, c)
                    continue

                # Determine the promoted output dtype from the non-empty
                # datasets; fall back to the empty ones if every input is
                # empty, so we still get a valid dtype for the column.
                types = [get_col_type(ds) for ds in datasets]
                if len(types) == 0:
                    types = [get_col_type(f[c]) for f in in_files]
                common_type = reduce(np.promote_types, types)
                # numpy can't promote vlen strings -- if any input uses
                # them, the whole column must be vlen string.
                if vlen_string in types:
                    common_type = vlen_string

                out_ds = out.create_dataset(
                    c,
                    shape=(0,),
                    maxshape=(None,),
                    dtype=common_type,
                    compression=COMPRESSION,
                    shuffle=True,
                    chunks=(CHUNK_SIZE,),
                )

                # Append each input's rows, growing the resizable dataset.
                item_count = 0
                for ds in datasets:
                    new_items = ds.shape[0]
                    out_ds.resize((item_count + new_items,))
                    data = ds[:]
                    if has_levels(ds):
                        # Expand level-coded values to their raw form so the
                        # combined column is uniformly plain-valued.
                        levels = get_levels(ds)
                        data = levels[data]
                    out_ds[item_count:(item_count + new_items)] = data
                    item_count += new_items
        finally:
            out.close()
    finally:
        # Always release input handles, even when validation or writing fails.
        for in_f in in_files:
            in_f.close()
# 评论列表 ("comment list") — page-scrape residue, not part of the code
# 文章目录 ("article table of contents") — page-scrape residue, not part of the code