def reservoir_weighted(it, k, weights):
    """Draw a weighted random sample of up to ``k`` items from a stream.

    Each item is given the random score ``u ** (1 / w)``, where ``u`` is drawn
    with ``np.random.uniform(0.0, 1.0)`` and ``w`` is the weight of the item's
    label. The ``k`` items with the largest scores are kept in a min-heap, so
    the stream is read once and at most ``k`` items are held at a time.

    Args:
        it (iterable): Items to sample from. Each item must be indexable with
            its label at index 1, e.g. ``(job_posting, label)``.
        k (int): Sample size. A value of zero or less yields an empty sample.
        weights (dict): Maps every label that occurs in ``it`` to its sampling
            weight, e.g. ``weights = {'11': 2, '13': 1}``, where the key is
            the occupation major group and the value is the weight to sample
            with. Weights are expected to be positive numbers.

    Yields:
        The sampled items, in ascending order of their random score. Fewer
        than ``k`` items are produced when the stream is shorter than ``k``.

    Raises:
        KeyError: If an item's label is not a key of ``weights`` (raised
            while the generator is being consumed).
    """
    # Without this guard the loop below would index into an empty heap.
    if k <= 0:
        return

    heap = []
    for index, datum in enumerate(it):
        weight = weights[datum[1]]
        score = np.power(np.random.uniform(0.0, 1.0), 1.0 / weight)
        # The stream index breaks score ties so that heapq never has to
        # compare the data themselves, which may not be orderable (dicts).
        entry = (score, index, datum)
        if len(heap) < k:
            hq.heappush(heap, entry)
        elif score > heap[0][0]:
            # heap[0] holds the smallest retained score; evict it.
            hq.heapreplace(heap, entry)

    while heap:
        yield hq.heappop(heap)[2]
# Comment list
# Table of contents