# utils.py source file - Python code snippet

def random_balanced_partitions(data, first_size, labels, random=np.random):
    """Split data into a class-balanced random partition and the rest.

    Partition `data` into two parts, using the `labels` sequence (of equal
    length) to guide the choice of elements of the first part: every
    distinct label is represented in it exactly
    ``first_size // number_of_classes`` times. Elements of the first part
    are grouped by class (in sorted label order); the second part holds
    all remaining elements in their original order.

    Example:
        random_balanced_partitions(['a', 'b', 'c'], 2, [3, 5, 5])
        # Both labels 3 and 5 need to be presented once, so
        # the result can be either (['a', 'b'], ['c']) or
        # (['a', 'c'], ['b']) but not (['b', 'c'], ['a']).

    Args:
        data (ndarray, list or tuple): data to be split; lists and tuples
            are converted to an ndarray first
        first_size (int): size of the first partition; must be a multiple
            of the number of distinct labels
        labels (array-like): per-element class labels according to which
            balancing is done
        random (RandomState): source of randomness; anything providing a
            ``choice(a, size, replace=False)`` method

    Return:
        tuple of two ndarrays

    Raises:
        AssertionError: if `data` and `labels` differ in length, if
            `first_size` is not divisible by the number of classes, or
            if some class has too few examples. These checks are plain
            ``assert`` statements and are skipped under ``python -O``.
    """
    # Plain Python sequences cannot be indexed with an index array, so
    # convert them; other array types are left untouched.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    labels = np.asarray(labels)
    assert len(data) == len(labels)

    classes, class_counts = np.unique(labels, return_counts=True)
    num_classes = len(classes)
    assert num_classes <= 10000, "surprisingly many classes: {}".format(num_classes)

    if num_classes == 0:
        # Empty input: there is nothing to balance, and the divisibility
        # check below would divide by zero.
        assert first_size == 0, "cannot draw {} elements from empty data".format(first_size)
        no_idxs = np.arange(0)
        return data[no_idxs], data[no_idxs]

    assert first_size % num_classes == 0, "not divisible: {}/{}".format(first_size, num_classes)
    per_class = first_size // num_classes
    assert np.all(class_counts >= per_class), "not enough examples of some class"

    # Draw `per_class` distinct positions out of each class independently.
    chosen_idxs_per_class = [
        random.choice(np.nonzero(labels == klass)[0], per_class, replace=False)
        for klass in classes
    ]
    first_idxs = np.concatenate(chosen_idxs_per_class)
    # Everything not chosen, in ascending index order.
    second_idxs = np.setdiff1d(np.arange(len(labels)), first_idxs)

    assert first_idxs.shape == (first_size,)
    assert second_idxs.shape == (len(data) - first_size,)
    return data[first_idxs], data[second_idxs]