def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])
    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
def predict_job(job_list):
    """Assign a classification to each job title in job_list"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]  # flatten nested tuples/lists
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    # X and y are module-level training data: raw documents and their labels
    x_series = pd.Series(X)
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred
# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
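# A minimal sketch of the module-level setup predict_job() assumes: X holds
# raw job-title strings, y their labels, and `regex` is a project helper
# exposing tokenize_and_stem(). The placeholder values below are illustrative
# assumptions, not the project's actual training data.
X = ['senior software engineer', 'founder and ceo', 'marketing manager']
y = ['engineering', 'executive', 'marketing']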
def train_classifier(self, trainvectors, labels, no_label_encoding=False, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        # an empty string triggers a grid search over smoothing values in (0, 2]
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = float(alpha)
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    if no_label_encoding:
        self.model.fit(trainvectors, labels)
    else:
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
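# Hedged usage sketch: this method belongs to a wrapper class (not shown here)
# that carries a fitted label_encoder, so the driver below is indicative only.
# vectors = CountVectorizer().fit_transform(docs)        # bag-of-words features
# clf.label_encoder = LabelEncoder().fit(labels)
# clf.train_classifier(vectors, labels, alpha='0.5')     # explicit smoothing value
# clf.train_classifier(vectors, labels, alpha='')        # grid-search alpha in (0, 2]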
def __init__(self, df, weight=True, min_ct=0, total_iter=5):
    self.logger = logging.getLogger(__name__)
    super(MultinomialNaiveBayes, self).__init__(total_iterations=total_iter)  # call base constructor
    # self.set_min_count(min_ct)
    self.is_weighted_sample = weight

    # process data
    # df = self._filter_rows(df)  # filter out low count rows
    # row_sums = df.sum(axis=1).astype(float)
    # df = df.div(row_sums, axis=0)  # normalize each row
    # df = df.mul(100)
    # df.to_csv('tmp.nbclf.txt', sep='\t')
    df = df.fillna(df.mean())
    total = df['total']
    df = df[['recurrent missense', 'recurrent indel', 'frame shift',
             'nonsense', 'missense', 'synonymous', 'inframe indel', 'no protein',
             'lost stop', 'splicing mutation']]
    df = df.mul(total, axis=0).astype(int)  # get back counts instead of pct

    # setup classifier
    self.x, self.y = features.randomize(df)
    self.clf = MultinomialNB(alpha=1,  # laplacian smooth, i.e. pseudocounts
                             fit_prior=True)  # use data for prior class probs
# Source: 04_sent.py, project Building-Machine-Learning-Systems-With-Python-Second-Edition by PacktPublishing
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
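# Hedged usage sketch: emo_repl_order, emo_repl, re_repl, and
# LinguisticVectorizer come from the book's accompanying utils and are not
# defined in this snippet, so the driver below is indicative only. Nested
# pipeline parameters are addressed step-by-step, e.g. all__tfidf__ngram_range.
# tweets = ["I love this :)", "this is awful :(", "best day ever"]
# sentiments = [1, 0, 1]
# model = create_union_model({'all__tfidf__ngram_range': (1, 2)})
# model.fit(tweets, sentiments)
# print(model.predict(["love love love"]))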
# Source: 03_clean.py, project Building-Machine-Learning-Systems-With-Python-Second-Edition by PacktPublishing
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
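# Hedged sketch of tuning this pipeline, in the spirit of the book's tuning
# step; the grid below is an illustrative assumption, not the book's exact
# settings, and the fit call assumes a labeled tweet corpus is in scope.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': [0.01, 0.1, 1.0],
}
search = GridSearchCV(create_ngram_model(), param_grid, scoring='f1', cv=3)
# search.fit(tweets, sentiments)
# print(search.best_params_, search.best_score_)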
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score for
    articles.
    """
    models = [(DecisionTreeClassifier, {}),
              (RandomForestClassifier, {}),
              (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
              (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
              (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]})]
    trained_models = []
    for classifier, param_grid in models:
        res = train_model(articles, classifier, param_grid, probabilities=True)
        trained_models.append((str(res), res))
    ensemble_learner = VotingClassifier(estimators=trained_models[:4],
                                        voting='soft')
    train_model(articles, ensemble_learner, {})
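# Note: soft voting averages predict_proba() outputs, which LinearSVC does not
# provide; slicing trained_models[:4] keeps only the four probability-capable
# models. A minimal self-contained sketch of the same soft-voting idea (the
# toy data and estimator names below are illustrative assumptions):
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

X_toy = [[1, 0, 2], [0, 3, 0], [2, 1, 0], [0, 2, 3]]
y_toy = [0, 1, 0, 1]
ensemble = VotingClassifier(
    estimators=[('lr', LogisticRegression()), ('mnb', MultinomialNB())],
    voting='soft')
ensemble.fit(X_toy, y_toy)
print(ensemble.predict_proba([[1, 1, 1]]))  # averaged class probabilities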
def test_integrated_plot_numpy_named_arrays(self):
    model = naive_bayes.MultinomialNB()

    X = np.array([
        (1.1, 9.52, 1.23, 0.86, 7.89, 0.13),
        (3.4, 2.84, 8.65, 0.45, 7.43, 0.16),
        (1.2, 3.22, 6.56, 0.24, 3.45, 0.17),
        (3.8, 6.18, 2.45, 0.28, 2.53, 0.13),
        (5.1, 9.12, 1.06, 0.19, 1.43, 0.13),
        (4.4, 8.84, 4.97, 0.98, 1.35, 0.13),
        (3.2, 3.22, 5.03, 0.68, 3.53, 0.32),
        (7.8, 2.18, 6.87, 0.35, 3.25, 0.38),
    ], dtype=[('a', '<f8'), ('b', '<f8'),
              ('c', '<f8'), ('d', '<f8'),
              ('e', '<f8'), ('f', '<f8')]
    )
    y = np.array([1, 1, 0, 1, 0, 0, 1, 0])

    visualizer = DecisionBoundariesVisualizer(model, features=['a', 'f'])
    visualizer.fit_draw_poof(X, y=y)
    self.assertEquals(visualizer.features_, ['a', 'f'])
    self.assert_images_similar(visualizer)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
        # ('vect', CountVectorizer(ngram_range=(1, 3), binary=False, max_features=None)),
        # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        # ('clf', RandomForestClassifier(class_weight={False: 1, True: 2}, n_jobs=-1))
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def __init__(self, fit_scaler=None, transform_scaler='bin'):
    self.fit_scaler = fit_scaler
    self.transform_scaler = transform_scaler
    if fit_scaler in MNBScaler.fit_scalers:
        self.fit_scaler_ = None if fit_scaler is None else MNBScaler.fit_scalers[fit_scaler]()
    else:
        raise ValueError("fit_scaler should be one of %r but %s specified" %
                         (MNBScaler.fit_scalers.keys(), fit_scaler))
    if transform_scaler in MNBScaler.transform_scalers:
        self.transform_scaler_ = None if transform_scaler is None else \
            self.fit_scaler_ if transform_scaler == 'auto' else \
            MNBScaler.transform_scalers[transform_scaler]()
    else:
        raise ValueError("transform_scaler should be one of %r but %s specified" %
                         (MNBScaler.transform_scalers.keys(), transform_scaler))
    self.mnb_ = MultinomialNB()
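# Hedged usage sketch: MNBScaler's fit/transform methods and its class-level
# fit_scalers / transform_scalers registries live elsewhere in the project,
# so the call pattern below is an assumption for illustration. The default
# 'bin' transform presumably binarizes counts into presence/absence features
# before they reach the internal MultinomialNB.
# scaler = MNBScaler(fit_scaler=None, transform_scaler='bin')
# X_train_scaled = scaler.fit_transform(X_train, y_train)
# X_test_scaled = scaler.transform(X_test)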
def MultinomialNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header=0, sep=",")
    test = pd.read_csv('../feature/testQlist.csv', header=0, sep=",")
    print("Train tf-idf vector Model...")
    encode = TfidfVectorizer(decode_error='ignore', norm="l2", binary=False, sublinear_tf=True, min_df=50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)
    print(localTrainFeature.shape, localTestFeature.shape)
    print('train...')
    model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    model.fit(X=localTrainFeature, y=localTrainLabel)
    print('predict...')
    if not config['prob']:
        return model.predict(localTestFeature), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature), test['uid'].values
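# Hedged usage sketch: the labels and config come from the surrounding
# training script, which is not part of this snippet, so the paths and column
# name below are illustrative assumptions.
# labels = pd.read_csv('../feature/trainLabel.csv')['label'].values
# preds, uids = MultinomialNBPredictModel(labels, config={'prob': False})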
# -- xgboost local cross validation model frame
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert_true(clf.multilabel_)
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2)
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # Decision-function-only estimator. Fails in current implementation.
    decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert_raises(AttributeError, decision_only.predict_proba, X_test)

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([l.argmax() for l in Y_proba])
    assert_false((pred - Y_pred).any())
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
def test_input_check_partial_fit():
    for cls in [BernoulliNB, MultinomialNB]:
        # check shape consistency
        assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
                      classes=np.unique(y2))

        # classes is required for the first call to partial_fit
        assert_raises(ValueError, cls().partial_fit, X2, y2)

        # check consistency of consecutive classes values
        clf = cls()
        clf.partial_fit(X2, y2, classes=np.unique(y2))
        assert_raises(ValueError, clf.partial_fit, X2, y2,
                      classes=np.arange(42))

        # check consistency of input shape for partial_fit
        assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)

        # check consistency of input shape for predict
        assert_raises(ValueError, clf.predict, X2[:, :-1])
def getClassifier(self, **kwargs):
    """
    Trains a classifier on the data under `path` and returns the fitted
    classifier together with its vectorizer for predicting queries.
    """
    self.path = kwargs.get('path', 'trainer')
    self.df = self.trainWith(self.path)
    self.vectorizer = CountVectorizer()
    counts = self.vectorizer.fit_transform(self.df['message'].values)
    self.classifier = MultinomialNB()
    targets = self.df['class'].values
    self.classifier.fit(counts, targets)
    os.chdir(self.old_loc)
    return self.classifier, self.vectorizer
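# Hedged usage sketch: the enclosing class and its trainWith() data loader are
# not shown in this snippet, so the driver below is an illustrative assumption.
# classifier, vectorizer = trainer.getClassifier(path='trainer')
# query_counts = vectorizer.transform(['is this spam or ham?'])
# print(classifier.predict(query_counts))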
def classify(n=50):
    # clf = MultinomialNB(fit_prior=False)
    # clf = SVC(gamma=2, C=1, class_weight={0.0: 0.063829777, 1.0: 1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight={0.0: 0.022, 1.0: 1.0})
    clf.fit(mat[:n], rel[:n])
    return clf
def train_test():
    """Measure accuracy on a held-out test split"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # learn the vocabulary and build the document-term matrix
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
def nb_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(metrics.accuracy_score(y_test, y_pred))
def __create_ngram_model(self, lang):
    if lang == 'en':
        tfidf_ngrams = EnglishTfidfVectorizer(decode_error='ignore')
    elif lang == 'ja':
        tfidf_ngrams = JapaneseTfidfVectorizer(decode_error='ignore')
    else:
        raise ValueError("unsupported language: %s" % lang)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
def get_naive_bayes(is_multiclass=True):
    return MultinomialNB()

def _estimator(self):
    return MultinomialNB()
def train_classifier(self, trainvectors, labels, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = float(alpha)
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def test_basic(self, single_chunk_count_classification):
    X, y = single_chunk_count_classification
    a = nb.PartialMultinomialNB(classes=[0, 1])
    b = nb_.MultinomialNB()

    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])
    assert_eq(a.coef_, b.coef_)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print(len(news.data))
    # print(len(news.target))
    # print('*' * 10)
    # print(news.data[0])
    # print('*' * 10)
    # print(news.target[0])
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print(x.shape)
    # print(x[:2])
    print(x[:10, :10].toarray())
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print(x_tfidf[:10, :10].toarray())

    from sklearn.model_selection import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)

    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
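    # Follow-on evaluation (an illustrative addition, not in the original
    # snippet): score both fitted models on the held-out split to compare
    # raw counts against tf-idf features.
    from sklearn.metrics import accuracy_score
    print('counts :', accuracy_score(ytest, mnb.predict(Xtest)))
    print('tf-idf :', accuracy_score(tf_ytest, tf_mnb.predict(tf_Xtest)))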
def test_init_no_file():
    mm = mnb_modelmanager.MNBModelManager()
    assert isinstance(mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], MultinomialNB)
def test_init_mnb():
    ct = classifytext.ClassifyText(type=classifytext.MNB)
    assert isinstance(ct.mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(ct.mm.clf, Pipeline)
    # assert isinstance(ct.mm.clf.named_steps['clf'], MultinomialNB)
def __init__(self, filename=None):
    super().__init__(filename)
    if not filename:
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'))),
            ('anova', SelectPercentile(f_classif)),
            ('clf', MultinomialNB())
        ])
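# Hedged usage sketch: the enclosing class presumably loads a persisted model
# when filename is given and builds the Dutch-stop-word pipeline above
# otherwise; the class name and training call below are illustrative
# assumptions, not the project's API.
# model = DutchTextClassifier()  # hypothetical enclosing class
# model.clf.fit(dutch_docs, labels)
# print(model.clf.predict(['erg goed']))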