meta.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:autolysis 作者: gramener 项目源码 文件源码
def metadata_file(path, root, tables=None):
    '''
    Returns the metadata for a file or directory as a Meta tree.

    The format is detected via guess_format(path) and handled by type:

    1. Directory (dir). Walk it and process every file as a sub-dataset
    2. Archives (7z, zip, rar, tar) / compressed (xz, bzip2, gzip). Decompress and process
    3. Database (sqlite3, hdf5, xls, xlsx). Process each table/sheet as a sub-dataset
    4. Data (csv, json). Process directly

    Parameters:

    - path: location of the file or directory to inspect.
    - root: forwarded unchanged to unzip_files() and to recursive calls
      (presumably the extraction directory -- confirm against unzip_files).
    - tables: optional collection of table names. When given, only HDF5 keys /
      Excel sheets contained in it are listed. It is also forwarded to
      metadata_sql() and to recursive calls.

    Returns a Meta. ``format`` is set when the format is recognised. Container
    formats get a ``datasets`` mapping of name -> Meta; csv and json get a
    ``command`` list. Any other format returns the tree with nothing else set.

    A failure while processing a file inside a directory or archive does not
    abort the scan: the message is stored under that dataset's 'error' key and
    the traceback is logged.
    '''
    tree = Meta()
    # Named ``fmt`` rather than ``format`` to avoid shadowing the builtin.
    fmt = guess_format(path)
    if fmt is not None:
        tree.format = fmt

    if fmt == 'dir':
        tree.datasets = Datasets()
        for base, dirs, files in os.walk(path):
            for filename in files:
                source = os.path.join(base, filename)
                # Datasets are keyed by their path relative to the directory scanned
                name = os.path.relpath(source, path)
                tree.datasets[name] = submeta = Meta(name=name, source=source)
                try:
                    submeta.update(metadata_file(source, root, tables))
                except Exception as e:
                    # Best effort: record the failure and move on to the next file
                    submeta['error'] = str(e)
                    logging.exception('Unable to get metadata for %s', source)
    elif fmt in {'7z', 'zip', 'rar', 'tar', 'xz', 'gz', 'bz2'}:
        tree.datasets = Datasets()
        for name, source in unzip_files(path, root, fmt):
            tree.datasets[name] = submeta = Meta(name=name)
            try:
                submeta.update(metadata_file(source, root, tables))
            except Exception as e:
                # Best effort: record the failure and move on to the next member
                submeta['error'] = str(e)
                logging.exception('Unable to get metadata for %s', source)
    elif fmt == 'sqlite3':
        tree.update(metadata_sql('sqlite:///' + path, tables))
    elif fmt in {'hdf5', 'xls', 'xlsx'}:
        if fmt == 'hdf5':
            store = pd.HDFStore(path)
            try:
                table_list = store.keys()
            finally:
                # Release the handle even if listing the keys fails
                store.close()
        else:
            xls = pd.ExcelFile(path)
            try:
                table_list = xls.sheet_names
            finally:
                # The workbook is only needed for its sheet names; do not leak it
                xls.close()
            # Both xls and xlsx sheets are loaded through the 'xlsx' command
            fmt = 'xlsx'
        tree.datasets = Datasets()
        for table in table_list:
            if tables is None or table in tables:
                tree.datasets[table] = Meta([
                    ('name', table),
                    ('format', 'table'),
                    ('command', [fmt, path, table])
                ])
    elif fmt == 'csv':
        tree.command = ['csv', path]
    elif fmt == 'json':
        tree.command = ['json', path]
    return tree
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号