def case1():
    """Fit Multinomial Naive Bayes on the 20 Newsgroups corpus, twice.

    One classifier is trained on raw term counts and a second one on
    TF-IDF weighted counts, so the two feature representations can be
    compared on the same train/test partition. The function prints the
    top-left 10x10 corner of each feature matrix and returns nothing.

    scikit-learn is imported inside the function, so it is only needed
    when this case is actually run.
    """
    from sklearn import datasets
    from sklearn.feature_extraction.text import (
        CountVectorizer,
        TfidfTransformer,
    )
    from sklearn.naive_bayes import MultinomialNB

    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # scikit-learn < 0.18 only ships the legacy module; newer releases
        # (>= 0.20) have removed it, hence model_selection is tried first.
        from sklearn.cross_validation import train_test_split

    news = datasets.fetch_20newsgroups(subset='all')

    # Raw term counts for every document.
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(x[:10, :10].toarray())

    # Re-weight the same counts with TF-IDF.
    tfidf = TfidfTransformer()
    x_tfidf = tfidf.fit_transform(x)
    print(x_tfidf[:10, :10].toarray())

    # Identical test_size and random_state on both calls, so the count and
    # TF-IDF matrices are partitioned the same way. The held-out parts are
    # produced here but not used further inside this function.
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(
        x_tfidf, news.target, test_size=0.3, random_state=233)

    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    # The original called `mmb.fit(...)`, an undefined name (NameError).
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
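# Comment list        (navigation label left over from the source web page; not code)
# Table of contents   (navigation label left over from the source web page; not code)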