def read_data_frame(fn, query_cols=None):
    """Load a pandas DataFrame from an HDF5 file.

    The list of stored columns is read from the file's ``column_names``
    attribute and narrowed with ``get_column_intersection``; every
    remaining name is then read as a dataset of the same name.

    Args:
        fn: File name (anything ``h5py.File`` accepts); opened read-only.
        query_cols: Optional list of column names to restrict the load to.
            ``None`` (the default) is passed on as an empty list.
            NOTE(review): an empty list presumably means "all columns" --
            confirm against ``get_column_intersection``.

    Returns:
        A DataFrame with one column per selected dataset. Datasets for
        which ``has_levels`` is true become Categorical columns built from
        the stored integer codes and the levels returned by ``get_levels``;
        all other datasets are loaded as plain Series.
    """
    # Use None as the default rather than a shared mutable list.
    if query_cols is None:
        query_cols = []

    with h5py.File(fn, 'r') as f:
        column_names = f.attrs.get("column_names")
        column_names = get_column_intersection(column_names, query_cols)
        df = p.DataFrame()
        # Add the columns progressively to save memory
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[:]
                uniques = get_levels(ds)
                # The dataset holds integer codes, not values, so build the
                # Categorical from codes. This replaces the deprecated
                # `fastpath=True` constructor flag; from_codes additionally
                # validates that every code is within range of `uniques`.
                df[name] = p.Categorical.from_codes(
                    indices, categories=uniques, ordered=False
                )
            else:
                df[name] = p.Series(ds[:])
    return df
评论列表
文章目录