import numpy as np
from scipy.linalg import toeplitz
from sklearn.utils import check_random_state


def generate_data(n_samples, n_features, size_groups, rho=0.5,
                  random_state=24):
    """Data generation process with Toeplitz-like correlated features:
    this corresponds to the synthetic dataset used in our paper
    "GAP Safe Screening Rules for Sparse-Group Lasso".
    """
    rng = check_random_state(random_state)
    n_groups = len(size_groups)
    # g_start[i] is the index of the first feature of group i
    # (cumulative sum shifted back by one group; also valid for unequal group sizes)
    size_groups = np.asarray(size_groups, dtype=np.intc)
    g_start = np.cumsum(size_groups) - size_groups
    # 10% of the groups are active
    gamma1 = int(np.ceil(n_groups * 0.1))
    # randint samples (with replacement) from [0, n_groups);
    # it replaces the deprecated random_integers
    selected_groups = rng.randint(0, n_groups, gamma1)
    true_beta = np.zeros(n_features)
    for i in selected_groups:
        begin = g_start[i]
        end = g_start[i] + size_groups[i]
        # 10% of the features within an active group are active
        gamma2 = int(np.ceil(size_groups[i] * 0.1))
        selected_features = rng.randint(begin, end, gamma2)
        ns = len(selected_features)
        # random signs and magnitudes in [0.5, 10] for the active coefficients
        s = 2 * rng.rand(ns) - 1
        u = rng.rand(ns)
        true_beta[selected_features] = np.sign(s) * (10 * u + (1 - u) * 0.5)
    # Toeplitz covariance with correlation rho ** |i - j| between features i and j
    vect = rho ** np.arange(n_features)
    covar = toeplitz(vect, vect)
    X = rng.multivariate_normal(np.zeros(n_features), covar, n_samples)
    y = np.dot(X, true_beta) + 0.01 * rng.normal(0, 1, n_samples)
    return X, y
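
A minimal usage sketch; the dimensions, group sizes, and rho below are illustrative assumptions, not values taken from the paper:

# Example (assumed settings): 100 samples, 10 groups of 5 features each.
if __name__ == "__main__":
    size_groups = [5] * 10
    X, y = generate_data(n_samples=100, n_features=sum(size_groups),
                         size_groups=size_groups, rho=0.5, random_state=24)
    print(X.shape, y.shape)  # (100, 50) (100,)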