def awk_filter_map(data_desc, filter_strs, map_strs):
    """
    Build an awk command that filters and maps rows described by data_desc.

    Each string in map_strs is parsed and its statements are registered in
    an ExprContext created from data_desc.  Two bare names get special
    treatment: ``__all__`` registers a pass-through assignment for every
    input field, and ``__rest__`` does the same only for the input fields
    the context does not hold a variable for yet.  Any other statement is
    handed to parse_rowexpr and stored under its target name.

    The statements of all filter_strs are then parsed against the same
    context; when there is more than one, they are joined with ``&&``.

    Returns a pair ``(awk_cmd, desc)`` as produced by
    awk_filter_map_from_context.  When the produced description is truthy
    it receives ``data_desc.meta``; otherwise ``data_desc`` itself is
    returned in its place.

    >>> from tabkit.header import parse_header
    >>> awk, desc = awk_filter_map(
    ...     parse_header('# d p e s c m'),
    ...     ['e==157 and (s>100 or s in [15,30,45])'],
    ...     ['ctr=c/s', 'cpm=ctr*m']
    ... )
    >>> print desc
    DataDesc([DataField('ctr', 'any'), DataField('cpm', 'any')])
    >>> print awk.cmd_line()
    LC_ALL=C awk -F $'\\t' 'BEGIN{OFS="\\t";}{if((($3 == 157) && (($4 > 100) || (($4 == 15) || ($4 == 30) || ($4 == 45))))){ctr = ($5 / $4);print(ctr,(ctr * $6));}}'
    >>> awk, desc = awk_filter_map(parse_header('# a b'), [], ['__all__'])
    >>> print desc
    DataDesc([DataField('a', 'any'), DataField('b', 'any')])
    """
    context = ExprContext(data_desc)

    def register_passthrough(field_name):
        # Expose an input column unchanged under its own name.
        source = RowExprField(context, field_name)
        context.set_var(field_name, RowExprAssign(field_name, source))

    # Map expressions go first: the filter below is parsed against the
    # same context, after these variables have been registered.
    for map_src in map_strs:
        for stmt in parse(map_src).body:
            # A statement consisting of a single bare name may be one of
            # the special markers.
            marker = None
            if isinstance(stmt, _ast.Expr) and isinstance(stmt.value, _ast.Name):
                marker = stmt.value.id

            if marker == '__all__':
                for field in data_desc.fields:
                    register_passthrough(field.name)
            elif marker == '__rest__':
                for field in data_desc.fields:
                    if not context.has_var(field.name):
                        register_passthrough(field.name)
            else:
                assignment = parse_rowexpr(context, stmt)
                context.set_var(assignment.target, assignment)

    # Collect the statements of every filter string before parsing any of
    # them into row expressions.
    filter_nodes = []
    for filter_src in filter_strs:
        filter_nodes.extend(parse(filter_src).body)

    if not filter_nodes:
        filter_expr = None
    elif len(filter_nodes) == 1:
        filter_expr = parse_expr(context, filter_nodes[0])
    else:
        conditions = [parse_expr(context, cond) for cond in filter_nodes]
        filter_expr = RowExprOp('&&', conditions)

    awk_cmd, output_desc = awk_filter_map_from_context(
        context, filter_expr, data_desc.order
    )
    if not output_desc:
        # No usable output description was produced: hand back the input one.
        return awk_cmd, data_desc
    output_desc.meta = data_desc.meta
    return awk_cmd, output_desc
评论列表
文章目录