# methods.py source file - Python code snippet

def reservoir_weighted(it, k, weights):
    """Weighted reservoir sampling from a job posting iterator.

    Randomly chooses a sample of at most k items from a streaming iterator,
    favouring items whose label carries a larger weight. Every item gets the
    random key ``u ** (1 / w)``, where ``u`` is drawn uniformly from [0, 1) and
    ``w`` is the weight of the item's label; the k items with the largest keys
    are kept in a min-heap, so the stream is read once and only k items are
    held in memory.

    Args:
        it (iterator): Job posting iterator to sample from. Each item should
            have the form (job_posting, label).
        k (int): Sample size. A non-positive k produces an empty sample
            without reading from the iterator.
        weights (dict): Maps each label to its sampling weight. Every label
            in the iterator is expected to be present as a key, and weights
            are expected to be positive. For example,
            weights = {'11': 2, '13': 1}. In this case, the label/key is the
            occupation major group and the value is the weight you want to
            sample with.

    Yields:
        The sampled items, in ascending order of their random key. Fewer than
        k items are produced when the iterator holds fewer than k items.

    Raises:
        KeyError: If an item's label is missing from ``weights``.
    """
    if k <= 0:
        # Nothing to sample; also avoids peeking at an empty heap below.
        return

    heap = []
    for order, datum in enumerate(it):
        weight = weights[datum[1]]
        score = np.power(np.random.uniform(0.0, 1.0), 1.0 / weight)
        # The arrival index breaks ties between equal scores so the heap never
        # falls back to comparing the data itself (which may not be orderable,
        # e.g. dict postings). Ties happen when tiny weights underflow to 0.0.
        entry = (score, order, datum)
        if len(heap) < k:
            hq.heappush(heap, entry)
        elif score > heap[0][0]:
            # The new item beats the weakest kept item: swap it in.
            hq.heapreplace(heap, entry)

    while heap:
        yield hq.heappop(heap)[2]