def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str
    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)
    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)
    assert type(transformed_series) == XDataFrame
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset as dense tf-idf features.
    :return: train data, train labels, test data, test labels
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    # Convert data to tf-idf
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)
    train_data = train_data.todense()
    test_data = test_data.todense()
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target
    return train_data, train_labels, test_data, test_labels
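A minimal usage sketch of the loader above (assuming it is importable); each row of the dense matrices is one document, each column one tf-idf term:

    train_data, train_labels, test_data, test_labels = load_20ng_dataset_bow()
    print(train_data.shape, test_data.shape)    # (n_train_docs, n_terms), (n_test_docs, n_terms)
    print(train_labels.shape, test_labels.shape)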
def demo_command(args):
    def create_data_file(partition, filename, samples):
        data = pandas.DataFrame(
            {TEXT_NAME: partition.data,
             LABEL_NAME: [partition.target_names[target] for target in partition.target]}).dropna()[:samples]
        data.to_csv(filename, index=False)
        return filename

    os.makedirs(args.directory, exist_ok=True)
    print("Download a portion of the 20 Newsgroups data and create train.csv and test.csv.")
    newsgroups_train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    newsgroups_test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))
    train_filename = create_data_file(newsgroups_train, os.path.join(args.directory, "train.csv"), 1000)
    test_filename = create_data_file(newsgroups_test, os.path.join(args.directory, "test.csv"), 100)
    model_directory = os.path.join(args.directory, "model")
    print("Train a model.\n")
    cmd = "train bow %s --save-model %s --epochs 5 --logging progress\n" % (
        train_filename, model_directory)
    print("mycroft " + cmd)
    default_main(cmd.split())
    print("\nEvaluate it on the test data.\n")
    cmd = "evaluate %s %s\n" % (model_directory, test_filename)
    print("mycroft " + cmd)
    default_main(cmd.split())
    print("\n(Note that there is not enough training data here to generate accurate predictions.)")
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    vectorizer = CountVectorizer()
    categories = ['alt.atheism', 'talk.religion.misc',
                  'comp.graphics', 'sci.space']
    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target
    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target
    return X_train, y_train, X_test, y_test
def test_build(self):
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def newsgroups_class_distrib():
    from sklearn.datasets import fetch_20newsgroups
    ngroup_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=None)
    ngroup_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=None)
    test_data = ngroup_test.data
    train_data = ngroup_train.data
    test_groups = ngroup_test.target
    train_groups = ngroup_train.target
    n = 2000
    train_groups = train_groups[:n]
    test_groups = test_groups[:n]
    plt.figure()
    # `normed=True` was removed in matplotlib 3.x; `density=True` is the equivalent.
    plt.hist(train_groups, 20, density=True, range=(0, 19))
    plt.title("train groups")
    plt.figure()
    plt.hist(test_groups, 20, density=True, range=(0, 19))
    plt.title("test groups")
    plt.show()
def newsgroups(*, path=None, key=None, limit=None):
    """
    Return a list of newsgroup messages from the 20 newsgroups dataset.

    Arguments:
    - path(str): Unused in this case. The dataset is managed by sklearn.
    - key(str): Unused.
    - limit(int): If given, return at most this many messages.
    """
    # This is going to download the dataset the first time we
    # run this function. Ideally we can populate these datasets
    # ahead of time.
    from sklearn.datasets import fetch_20newsgroups
    if limit:
        return fetch_20newsgroups(subset='train').data[:limit]
    return fetch_20newsgroups(subset='train').data
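A quick usage sketch of the loader above (assuming it is importable); `limit` keeps only the first few messages:

    messages = newsgroups(limit=5)
    print(len(messages))         # 5
    print(messages[0][:200])     # first 200 characters of the first message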
def test_bag_of_words_for_series_pipeline():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str
    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    # series = tokenizer_transformer.transform(series)
    Y = np.random.binomial(1, 0.5, n)
    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(
            transform_function=lambda text: text.lower().translate(translator).strip().split()
        )),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
        # ('svc', LinearSVC())
    ])
    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)
    # print(transformed_series)
def setUp(self):
    """Load the test data (20 Newsgroups corpus)."""
    newsdata = fetch_20newsgroups(data_home="./data/")
    self.ids = [str(i) for i in range(len(newsdata.target))]
    self.texts = newsdata.data
    self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
    self.tc = TextClassifier(self.texts, self.ids)
def case1():
    # Python 2 / old scikit-learn style (print statements, sklearn.cross_validation).
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10, :10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10, :10].toarray()
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)
    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)  # was `mmb.fit(...)`, a NameError in the original
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'], categories=cfg['categories'],
                                          shuffle=cfg['shuffle'], random_state=cfg['random_state'])
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from the 20 newsgroups corpus.
    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to keep (None for all)
    :param shuffle: whether to shuffle the documents
    :param random_state: integer seed used when shuffling
    :return: the data and labels of the newsgroups
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets
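A short usage sketch (assuming the helper above is importable); it fetches two categories and inspects the returned bunch:

    datasets = get_datasets_20newsgroup(subset='train',
                                        categories=['sci.space', 'rec.autos'])
    print(len(datasets.data), len(datasets.target))
    print(datasets.target_names)   # the selected category names, sorted alphabetically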
def main():
    # parameters
    num_features = 400  # vocabulary size
    # load data
    print "loading 20 newsgroups dataset..."
    categories = ['rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space']
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, categories=categories, remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # a list of the training documents in the selected categories
    train_labels = dataset.target
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)
    # tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features,
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()
    # append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])
    # distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)
    # distributed logistic regression
    print "training logistic regression..."
    model = LogisticRegressionWithLBFGS.train(train_RDD, regParam=1, regType='l1', numClasses=len(categories))
    # evaluate the model on test data
    labels_and_preds = test_RDD.map(lambda p: (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda (v, p): v != p).count() / float(test_RDD.count())
    print "log-reg test error: ", test_err
    # model.save(sc, './log_reg_lbfgs_model')
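`parse_doc` is defined elsewhere in the original script. A plausible minimal version, stated as an assumption rather than the project's actual code: the first column of each row is the label and the remaining columns are the tf-idf features, wrapped in an MLlib LabeledPoint.

    from pyspark.mllib.regression import LabeledPoint

    def parse_doc(row):
        # row[0] holds the class label, row[1:] the tf-idf feature vector
        return LabeledPoint(row[0], row[1:])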
def fetch_data(self, subset='train', categories=None):
    """Fetch a 20 Newsgroups subset and cache it on this object.

    Arguments:
        subset -> string -- which subset to load: train / test / all
    """
    rand = np.random.mtrand.RandomState(8675309)
    data = fetch_20newsgroups(subset=subset,
                              categories=categories,
                              shuffle=True,
                              random_state=rand)
    self.data[subset] = data
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
def download_articles(name, categories, subset):
    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line, 'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}
    print(len(data))
    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json'))
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])
    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))
    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch.

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def test_train_model():
    data = fetch_20newsgroups(
        random_state=42,
        categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])
    limit = 200
    if limit is not None:
        data['target'] = data['target'][:limit]
        data['data'] = data['data'][:limit]
    n_domains = int(len(data['target']) / 5)
    docs = [
        {
            'html': '\n'.join('<p>{}</p>'.format(t) for t in text.split('\n')),
            'url': 'http://example-{}.com/{}'.format(n % n_domains, n),
            'relevant': {'sci.space': True, 'sci.med': None}.get(
                data['target_names'][target], False),
        }
        for n, (text, target) in enumerate(zip(data['data'], data['target']))]
    result = train_model(docs)
    pprint(attr.asdict(result.meta))
    assert lst_as_dict(result.meta.advice) == [
        {'kind': 'Notice',
         'text': "The quality of the classifier is very good, ROC AUC is 0.96. "
                 "You can label more pages if you want to improve quality, "
                 "but it's better to start crawling "
                 "and check the quality of crawled pages.",
         },
    ]
    assert lst_as_dict(result.meta.description) == [
        {'heading': 'Dataset',
         'text': '200 documents, 159 labeled across 40 domains.'},
        {'heading': 'Class balance',
         'text': '33% relevant, 67% not relevant.'},
        {'heading': 'Metrics', 'text': ''},
        {'heading': 'Accuracy', 'text': '0.881 ± 0.122'},
        {'heading': 'ROC AUC', 'text': '0.964 ± 0.081'}]
    assert len(result.meta.weights['pos']) > 0
    assert len(result.meta.weights['neg']) > 0
    assert isinstance(result.model, BaseModel)
    assert hasattr(result.model, 'predict_proba')
def main():
    """
    Cluster the newsgroups dataset and measure against labels.

    In this script, we're doing a grid search over various
    TFIDF representations of the newsgroups dataset, looking
    for one that clusters well without supervision.

    We measure the quality of that unsupervised representation
    by how well its clusters match up with the actual
    supervised labels of the newsgroups dataset.
    """
    newsgroups = fetch_20newsgroups(
        subset='train',
        categories=CATEGORIES,
        shuffle=True
    )
    print("Loaded data")
    gridsearch = GridSearchCV(
        Pipeline([
            ('vec', TfidfVectorizer()),
            ('cluster', ClusteringWithSupervision(
                cluster_instance=MiniBatchKMeans()))
        ]),
        {
            'vec__stop_words': (None, 'english')
        }
    )
    print("Defined pipeline. Beginning fit.")
    gridsearch.fit(newsgroups.data, newsgroups.target)
    print_best_worst(gridsearch.cv_results_)
    best_estimator = gridsearch.best_estimator_
    predicted = best_estimator.predict(newsgroups.data)
    print(
        classification_report(
            newsgroups.target,
            predicted,
            target_names=newsgroups.target_names))
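`ClusteringWithSupervision` is not shown above. A minimal sketch of what such a wrapper might look like, stated as an assumption rather than the project's actual class: fit the clusterer, label each cluster with the majority true class, and predict through that cluster-to-label map so the clustering can be scored like a classifier.

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin, clone

    class ClusteringWithSupervisionSketch(BaseEstimator, ClassifierMixin):
        def __init__(self, cluster_instance=None):
            self.cluster_instance = cluster_instance

        def fit(self, X, y):
            self.cluster_instance_ = clone(self.cluster_instance)
            clusters = self.cluster_instance_.fit_predict(X)
            y = np.asarray(y)
            # Map each cluster id to the most common true label inside it.
            self.cluster_to_label_ = {
                c: np.bincount(y[clusters == c]).argmax()
                for c in np.unique(clusters)
            }
            return self

        def predict(self, X):
            clusters = self.cluster_instance_.predict(X)
            return np.array([self.cluster_to_label_[c] for c in clusters])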
def load_20news(setName):
    newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers'))  # , 'quotes'
    totalLineNum = 0
    readDocNum = 0
    print "Loading 20 newsgroup %s data..." % setName
    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [[] for i in xrange(catNum)]
    cats_docNames = [[] for i in xrange(catNum)]
    emptyFileNum = 0
    for d, text in enumerate(newsgroups_subset.data):
        if d % 50 == 49 or d == setDocNum - 1:
            print "\r%d %d\r" % (d + 1, totalLineNum),
        text = text.encode("utf-8")
        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue
        readDocNum += 1
        totalLineNum += len(lines)
        catID = newsgroups_subset.target[d]
        category = newsgroups_subset.target_names[catID]
        text = " ".join(lines)
        wordsInSentences, wc = extractSentenceWords(text)
        filename = newsgroups_subset.filenames[d]
        filename = os.path.basename(filename)
        orig_docs_words.append(wordsInSentences)
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)
    print "Done. %d docs read, %d empty docs skipped. Totally %d lines" % (readDocNum, emptyFileNum, totalLineNum)
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, newsgroups_subset.target_names
def initializeData(data):
    # graphics_train = fetch_20newsgroups(subset=dataSet,
    #     categories=categories, shuffle=True, random_state=42)
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS
    data = data
    # List of dicts; each element maps a term to its count in one document
    termDictList = []
    # Dictionary mapping each term to the number of documents that contain it
    termDocCountDict = {}
    # Set of retained terms
    termSet = set()
    # List of ints; each element is the total number of retained terms in one tokenized document
    termCountList = []
    # get the document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1
    # build termDictList and termSet, keeping terms whose document frequency is in [110, 11000]
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
    return (termDictList, termCountList, termDocCountDict, termSet)
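A hypothetical usage sketch of initializeData, assuming NLTK's word_tokenize and WordNetLemmatizer and sklearn.feature_extraction.text are imported as the snippet expects:

    docs = fetch_20newsgroups(subset='train').data
    termDictList, termCountList, termDocCountDict, termSet = initializeData(docs)
    print(len(termDictList), len(termSet))   # one dict per document, plus the retained vocabulary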
def main():
    """
    Train a classifier on the 20 newsgroups dataset.

    The purpose of this is mostly trying to figure out how
    to turn text into really good vector representations
    for classification... which are also hopefully good
    vector representations for unsupervised learning too.
    """
    # We don't really use our interfaces for iterating over datasets...
    # but maybe we will in the future.
    train = fetch_20newsgroups(
        subset='train',
        # categories=CATEGORIES,
        shuffle=True
    )
    test = fetch_20newsgroups(
        subset='test',
        # categories=CATEGORIES,
        shuffle=True
    )
    print("Loaded data.", len(set(train.target)), "classes.")
    glove_vectors = glove_simple()
    print("Loaded word vectors")
    pipeline = Pipeline([
        # ('vec', TfidfVectorizer()),
        ('vec', WordVectorSum(vector_dict=glove_vectors)),
        # ('svd', TruncatedSVD()),
        ('fit', SGDClassifier())
    ])
    print("Defined pipeline. Beginning fit.")
    gridsearch = GridSearchCV(
        pipeline,
        {
            # 'vec__stop_words': ('english',),
            # 'svd__n_components': (2, 100, 500, 1000),
            # 'vec__min_df': (1, 0.01, 0.1, 0.4),
            # 'vec__max_df': (0.5, 0.75, 0.9, 1.0),
            # 'vec__max_features': (100, 1000, 10000)
        }
    )
    gridsearch.fit(train.data, train.target)
    print("Completed fit. Beginning prediction")
    predicted = gridsearch.predict(test.data)
    print("Completed prediction.")
    accuracy = np.mean(predicted == test.target)
    print("Accuracy was", accuracy)
    print("Best params", gridsearch.best_params_)
    print_best_worst(gridsearch.cv_results_)
    print(
        classification_report(
            test.target,
            predicted,
            target_names=test.target_names))
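`WordVectorSum` and `glove_simple` are not defined in this snippet. A minimal sketch of the kind of transformer `WordVectorSum` appears to be, stated as an assumption rather than the project's actual implementation: represent each document as the sum of the GloVe vectors of its tokens, ignoring out-of-vocabulary words.

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class WordVectorSumSketch(BaseEstimator, TransformerMixin):
        def __init__(self, vector_dict=None):
            self.vector_dict = vector_dict

        def fit(self, X, y=None):
            # Infer the vector dimensionality from any entry in the dictionary.
            self.dim_ = len(next(iter(self.vector_dict.values())))
            return self

        def transform(self, X):
            out = np.zeros((len(X), self.dim_))
            for i, doc in enumerate(X):
                for token in doc.lower().split():
                    vec = self.vector_dict.get(token)
                    if vec is not None:
                        out[i] += vec
            return out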