# awk.py source file - Python code snippet

def awk_filter_map(data_desc, filter_strs, map_strs):
    """
    Build an awk command that filters rows and computes output fields.

    Each string in ``map_strs`` is parsed into statements which are
    registered, in order, as variables of an expression context created
    from ``data_desc``. Two bare names are treated specially:

    * ``__all__``  -- register every field of ``data_desc`` as a
      pass-through assignment (field copied under its own name).
    * ``__rest__`` -- same, but only for fields the context does not
      already have a variable for.

    Each string in ``filter_strs`` is parsed into statements as well; a
    single statement is used as the filter as is, several statements are
    combined with ``&&``, and no statements means no filter (``None``).

    Returns a ``(awk_cmd, desc)`` pair as produced by
    ``awk_filter_map_from_context``. When the produced description is
    truthy it receives ``data_desc.meta``; otherwise ``data_desc`` itself
    is returned in its place.

    >>> from tabkit.header import parse_header
    >>> awk, desc = awk_filter_map(
    ...     parse_header('# d p e s c m'),
    ...     ['e==157 and (s>100 or s in [15,30,45])'],
    ...     ['ctr=c/s', 'cpm=ctr*m']
    ... )
    >>> print desc
    DataDesc([DataField('ctr', 'any'), DataField('cpm', 'any')])
    >>> print awk.cmd_line()
    LC_ALL=C awk  -F $'\\t' 'BEGIN{OFS="\\t";}{if((($3 == 157) && (($4 > 100) || (($4 == 15) || ($4 == 30) || ($4 == 45))))){ctr = ($5 / $4);print(ctr,(ctr * $6));}}'
    >>> awk, desc = awk_filter_map(parse_header('# a b'), [], ['__all__'])
    >>> print desc
    DataDesc([DataField('a', 'any'), DataField('b', 'any')])
    """
    ctx = ExprContext(data_desc)

    def _pass_through(field_name):
        # Register an input field as an output variable of the same name.
        ctx.set_var(
            field_name,
            RowExprAssign(field_name, RowExprField(ctx, field_name)),
        )

    # --- map expressions -------------------------------------------------
    for map_src in map_strs:
        for stmt in parse(map_src).body:
            # A statement consisting of a single bare name may be one of
            # the special markers; anything else is a regular row expression.
            bare_name = None
            if isinstance(stmt, _ast.Expr) and isinstance(stmt.value, _ast.Name):
                bare_name = stmt.value.id

            if bare_name == '__all__':
                for fld in data_desc.fields:
                    _pass_through(fld.name)
            elif bare_name == '__rest__':
                for fld in data_desc.fields:
                    if ctx.has_var(fld.name):
                        continue
                    _pass_through(fld.name)
            else:
                row_expr = parse_rowexpr(ctx, stmt)
                ctx.set_var(row_expr.target, row_expr)

    # --- filter expressions ----------------------------------------------
    filter_nodes = []
    for filter_src in filter_strs:
        filter_nodes.extend(parse(filter_src).body)

    conditions = [parse_expr(ctx, stmt) for stmt in filter_nodes]
    if not conditions:
        filter_expr = None
    elif len(conditions) == 1:
        filter_expr = conditions[0]
    else:
        # Several filter statements must all hold.
        filter_expr = RowExprOp('&&', conditions)

    awk_cmd, output_desc = awk_filter_map_from_context(
        ctx, filter_expr, data_desc.order
    )
    if output_desc:
        # Carry the input's metadata over to the new description.
        output_desc.meta = data_desc.meta
    result_desc = output_desc or data_desc
    return awk_cmd, result_desc