def load_newsgroups(max_features=2000):
    """Load the 20 Newsgroups corpus as a dense TF-IDF feature matrix.

    The full corpus (``subset='all'``) is fetched with headers, footers and
    quoted replies removed, then vectorized with a sublinear-TF
    ``TfidfVectorizer`` limited to ``max_features`` terms.

    Args:
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``. Defaults to 2000, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the dense ``float64`` TF-IDF array
        (one row per document) and ``y`` is ``newsgroups.target``, the
        per-document newsgroup label array.

    Note:
        ``np`` is expected to be imported at module level (``import numpy as
        np``); that import is not visible in this part of the file.
        ``fetch_20newsgroups`` may download the dataset on first use.
    """
    # Imported lazily so scikit-learn is only required when this loader runs.
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer

    newsgroups = fetch_20newsgroups(
        subset='all', remove=('headers', 'footers', 'quotes')
    )
    vectorizer = TfidfVectorizer(
        max_features=max_features, dtype=np.float64, sublinear_tf=True
    )
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    # toarray() yields a plain ndarray directly, avoiding the legacy
    # np.matrix intermediate that todense() produces.
    x = x_sparse.toarray()
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
# Web-page residue from the original article: "Comment list" / "Table of contents".