import numpy as np
from scipy.linalg import toeplitz
from sklearn.utils import check_random_state


def generate_data(n_samples, n_features, size_groups, rho=0.5,
                  random_state=24):
    """Data generation process with Toeplitz-like correlated features:
    this corresponds to the synthetic dataset used in our paper
    "GAP Safe Screening Rules for Sparse-Group Lasso".
    """
    rng = check_random_state(random_state)
    n_groups = len(size_groups)
    # g_start[i] is the index of the first feature of group i
    # (cumulative sum shifted back by one group; also valid for unequal group sizes)
    size_groups = np.asarray(size_groups, dtype=np.intc)
    g_start = np.cumsum(size_groups) - size_groups
    # 10% of the groups are active
    gamma1 = int(np.ceil(n_groups * 0.1))
    # randint samples (with replacement) from [0, n_groups);
    # it replaces the deprecated random_integers
    selected_groups = rng.randint(0, n_groups, gamma1)
    true_beta = np.zeros(n_features)
    for i in selected_groups:
        begin = g_start[i]
        end = g_start[i] + size_groups[i]
        # 10% of the features within an active group are active
        gamma2 = int(np.ceil(size_groups[i] * 0.1))
        selected_features = rng.randint(begin, end, gamma2)
        ns = len(selected_features)
        # random signs and magnitudes in [0.5, 10] for the active coefficients
        s = 2 * rng.rand(ns) - 1
        u = rng.rand(ns)
        true_beta[selected_features] = np.sign(s) * (10 * u + (1 - u) * 0.5)
    # Toeplitz covariance with correlation rho ** |i - j| between features i and j
    vect = rho ** np.arange(n_features)
    covar = toeplitz(vect, vect)
    X = rng.multivariate_normal(np.zeros(n_features), covar, n_samples)
    y = np.dot(X, true_beta) + 0.01 * rng.normal(0, 1, n_samples)
    return X, y
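
A minimal usage sketch; the dimensions, group sizes, and rho below are illustrative assumptions, not values taken from the paper:

# Example (assumed settings): 100 samples, 10 groups of 5 features each.
if __name__ == "__main__":
    size_groups = [5] * 10
    X, y = generate_data(n_samples=100, n_features=sum(size_groups),
                         size_groups=size_groups, rho=0.5, random_state=24)
    print(X.shape, y.shape)  # (100, 50) (100,)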