import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.utils import shuffle


def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        # load_boston() was removed in scikit-learn 1.2; this snippet targets
        # the older API it was written against.
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    # Hold out the last 20% of the shuffled samples as the test split.
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    return data
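
A minimal usage sketch for the function above (an illustration, not part of the original file); it downloads the 20 newsgroups data on first call:

data = generate_data('classification', sparse=True)
print(data['X_train'].shape, data['X_test'].shape)   # CSR matrices
print(data['y_train'].shape, data['y_test'].shape)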
Source file: plot_model_complexity_influence.py — project: Parallel-SGD, author: angadgill
import numpy as np
import scipy.sparse as sp
from sklearn import datasets
# Imports as in the older scikit-learn test suite this snippet comes from;
# sklearn.utils.testing was removed in later releases.
from sklearn.utils.testing import SkipTest, assert_true, assert_equal


def test_20news_vectorized():
    # This test is slow, so it is skipped unconditionally; the assertions
    # below document the expected shapes but never run.
    raise SkipTest("Test too slow.")

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
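
To check the same invariants interactively, a quick sketch (the "all" subset concatenates the train and test splits; exact shapes vary across scikit-learn versions):

import scipy.sparse as sp
from sklearn.datasets import fetch_20newsgroups_vectorized

bunch = fetch_20newsgroups_vectorized(subset="all")
print(sp.isspmatrix_csr(bunch.data))        # True: features are a sparse CSR matrix
print(bunch.data.shape, bunch.target.shape)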
# Dataset, Datasets, redivide_data, to_one_hot_enc and SCIKIT_LEARN_DATA are
# helpers defined elsewhere in the surrounding project.
import sklearn.datasets as sk_dt


def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True, partitions_proportions=None,
                                shuffle=False, binary_problem=False, as_tensor=True, minus_value=-1.):
    data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='train')
    data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='test')
    X_train = data_train.data
    X_test = data_test.data
    y_train = data_train.target
    y_test = data_test.target
    if binary_problem:
        # Collapse the 20 classes into two: labels 0-9 vs. labels 10-19.
        # Compute the masks before mutating, since y_train/y_test alias
        # data_train.target/data_test.target.
        low_train = data_train.target < 10
        low_test = data_test.target < 10
        y_train[low_train] = minus_value
        y_train[~low_train] = 1.
        y_test[low_test] = minus_value
        y_test[~low_test] = 1.
    if one_hot:
        y_train = to_one_hot_enc(y_train)
        y_test = to_one_hot_enc(y_test)
    # The `shuffle` flag is accepted but currently unused (shuffling of the
    # sparse matrices was left unimplemented in the original source).
    d_train = Dataset(data=X_train,
                      target=y_train, info={'target names': data_train.target_names})
    d_test = Dataset(data=X_test,
                     target=y_test, info={'target names': data_train.target_names})
    res = [d_train, d_test]
    if partitions_proportions:
        res = redivide_data([d_train, d_test], partition_proportions=partitions_proportions, shuffle=False)
    if as_tensor:
        for dat in res:
            dat.convert_to_tensor()
    return Datasets.from_list(res)
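
A hedged usage sketch; the Dataset/Datasets container interface (e.g. a `train` attribute) is assumed from context and may differ in the actual project:

# Load a binary -1/+1 variant without converting to tensors (hypothetical call).
dsets = load_20newsgroup_vectorized(one_hot=False, binary_problem=True, as_tensor=False)
d_train = dsets.train                      # assumed attribute name on Datasets
print(d_train.data.shape, d_train.target.shape)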
import numpy as np
from sklearn import linear_model, cross_validation  # pre-0.18 API; use model_selection on newer versions
from sklearn.datasets import fetch_20newsgroups_vectorized
# LogisticRegressionCV here is the project's own estimator, whose fit() takes a
# held-out set and exposes alpha_ -- not sklearn's class of the same name.


def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target
    # Binarize the labels into -1/+1; compute the threshold once so the second
    # assignment is not affected by the first.
    y_mean = y.mean()
    y[y < y_mean] = -1
    y[y >= y_mean] = 1
    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # Compute held-out losses on a grid of log-regularization values,
    # with C = exp(-alpha).
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False, tol=1e-6,
            max_iter=100)
        lr.fit(Xt, yt)
        # _logistic_loss is a private helper of older scikit-learn versions.
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)
    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(verbose=True)
    clf.fit(Xt, yt, Xh, yh)
    assert np.abs(clf.alpha_ - best_alpha) < 0.5
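
For reference, the same hand-rolled grid search using only the modern public API (a sketch, assuming scikit-learn >= 0.18; metrics.log_loss on held-out probabilities stands in for the private _logistic_loss and is not numerically identical):

import numpy as np
from sklearn import linear_model, metrics
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import train_test_split

bunch = fetch_20newsgroups_vectorized(subset="train")
X, y = bunch.data, (bunch.target >= 10).astype(int)  # binarize the 20 labels
Xt, Xh, yt, yh = train_test_split(X, y, test_size=.5, random_state=0)

scores = []
alphas = np.linspace(-12, 0, 5)
for a in alphas:
    lr = linear_model.LogisticRegression(C=np.exp(-a), solver='lbfgs',
                                         fit_intercept=False, max_iter=100)
    lr.fit(Xt, yt)
    scores.append(metrics.log_loss(yh, lr.predict_proba(Xh)))
print("best alpha:", alphas[int(np.argmin(scores))])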