def masking_gen(attribute_name, details):
    """
    Build a function that masks one attribute in a group of rows.

    Rows are expected to be grouped by key beforehand so that rows sharing
    the same value for the column are processed together: within a group,
    equal original values always receive the same fake value.

    :param attribute_name: name of the attribute (column) to be masked.
    :param details: dict with the masking options read by this function:
        - ``lang``: locale passed to ``Factory.create`` (default ``en_GB``);
        - ``label_type``: name of the faker method used to generate values
          (default ``name``);
        - ``label_args``: keyword arguments forwarded to that method
          (default: none).
    :return: a function ``masking(group)`` where ``group[1]`` is an iterable
        of rows exposing ``asDict()`` (presumably pyspark ``Row`` objects
        coming from a ``groupBy`` - TODO confirm against the caller). It
        returns a list of dicts, one per row, with the attribute replaced.
    :raises ValueError: (from ``masking``) if ``label_type`` does not name a
        callable attribute of the faker generator.

    NOTE: ``random`` and ``_`` are taken from the enclosing module scope.

    @FIXME: Define a good size for partitions / groups (for instance use part
    of string or range of numbers, but it depends on the data type).
    """
    def masking(group):
        # Imported here so the dependency is resolved where the function
        # actually runs.
        from faker import Factory

        # Options are read once per group instead of once per row.
        label_type = details.get('label_type', 'name')
        # `or {}` also covers an explicit ``label_args: None``.
        label_args = details.get('label_args') or {}

        faker_obj = Factory.create(details.get('lang', 'en_GB'))
        faker_obj.seed(random.randint(0, 100000))

        # Reject unknown names and attributes that exist but cannot be
        # called, so a bad configuration fails with a clear message.
        action = getattr(faker_obj, label_type, None)
        if not callable(action):
            raise ValueError(_('Invalid masking type: {}').format(
                label_type))

        # Maps each original value to the fake value generated for it.
        faker_ctx = {}
        result = []
        for row in group[1]:
            as_dict = row.asDict()
            value = as_dict.get(attribute_name)
            if value not in faker_ctx:
                faker_ctx[value] = action(**label_args)
            as_dict[attribute_name] = faker_ctx[value]
            result.append(as_dict)
        return result

    return masking
# Page-navigation residue, not code: "评论列表" (comment list),
# "文章目录" (article table of contents).