def metadata_file(path, root, tables=None):
    '''
    Returns the metadata for a file or directory as a ``Meta`` tree.

    The format is detected by ``guess_format(path)`` and handled as follows:

    0. Directory ('dir'). Walk it and process every file as a sub-dataset,
       keyed by its path relative to ``path``
    1. Archives (7z, zip, rar, tar) / compressed (xz, bzip2, gzip).
       Decompress via ``unzip_files`` and process each extracted file
    2. Database (sqlite3, hdf5, xls, xlsx). Process each table/sheet as a
       sub-dataset
    3. Data (csv, json). Process directly

    Any other (or undetected) format yields a tree with no datasets/command.

    :arg path: file or directory to describe
    :arg root: passed through to ``unzip_files`` and to recursive calls
    :arg tables: optional container of table / sheet names. When given, only
        HDF5 keys and Excel sheets found in it are listed. It is also passed
        to ``metadata_sql`` for SQLite files.
    :return: a ``Meta`` object. Failures on individual sub-files are logged
        and recorded under that sub-dataset's ``'error'`` key, not raised.
    '''
    def _describe(submeta, source):
        # Best-effort: one unreadable file must not abort the whole scan,
        # so record the error on the sub-dataset and carry on.
        try:
            submeta.update(metadata_file(source, root, tables))
        except Exception as e:
            submeta['error'] = str(e)
            logging.exception('Unable to get metadata for %s', source)

    tree = Meta()
    # Named ``fmt`` so the builtin ``format`` is not shadowed
    fmt = guess_format(path)
    if fmt is not None:
        tree.format = fmt

    if fmt == 'dir':
        tree.datasets = Datasets()
        for base, _dirs, files in os.walk(path):
            for filename in files:
                source = os.path.join(base, filename)
                name = os.path.relpath(source, path)
                tree.datasets[name] = submeta = Meta(name=name, source=source)
                _describe(submeta, source)
    elif fmt in {'7z', 'zip', 'rar', 'tar', 'xz', 'gz', 'bz2'}:
        tree.datasets = Datasets()
        for name, source in unzip_files(path, root, fmt):
            tree.datasets[name] = submeta = Meta(name=name)
            _describe(submeta, source)
    elif fmt == 'sqlite3':
        tree.update(metadata_sql('sqlite:///' + path, tables))
    elif fmt in {'hdf5', 'xls', 'xlsx'}:
        # Open the container only long enough to list its tables. The context
        # managers guarantee the file handle is released even if listing fails.
        if fmt == 'hdf5':
            with pd.HDFStore(path) as store:
                table_list = store.keys()
            reader = 'hdf5'
        else:
            with pd.ExcelFile(path) as xls:
                table_list = xls.sheet_names
            # Both xls and xlsx sheets are emitted with an 'xlsx' command
            reader = 'xlsx'
        tree.datasets = Datasets()
        for table in table_list:
            if tables is None or table in tables:
                tree.datasets[table] = Meta([
                    ('name', table),
                    ('format', 'table'),
                    ('command', [reader, path, table])
                ])
    elif fmt in {'csv', 'json'}:
        # The command tag is the format name itself
        tree.command = [fmt, path]
    return tree
评论列表
文章目录