def build_models_NLP(train_pos_vec, train_neg_vec):
"""
Returns a BernoulliNB and a LogisticRegression model, each fit to the training data.
"""
Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
# Use sklearn's BernoulliNB and LogisticRegression functions to fit two models to the training data.
# For BernoulliNB, use alpha=1.0 and binarize=None
# For LogisticRegression, pass no parameters
train_vec = []
train_vec.extend(train_pos_vec)
train_vec.extend(train_neg_vec)
nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
nb_model.fit(train_vec, Y)
lr_model = LogisticRegression()
lr_model.fit(train_vec, Y)
return nb_model, lr_model
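A quick, illustrative way to exercise the function above. The tiny binary vectors are hypothetical stand-ins for the project's real document feature vectors; none of the names below come from the original code.
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

# Hypothetical toy feature vectors for two "pos" and two "neg" documents.
train_pos_vec = [[1, 0, 1], [1, 1, 0]]
train_neg_vec = [[0, 0, 1], [0, 1, 1]]

nb_model, lr_model = build_models_NLP(train_pos_vec, train_neg_vec)
print(nb_model.predict([[1, 0, 0]]))   # likely ['pos'] on this toy data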
Example source code for the Python class BernoulliNB()
Source file: sentiment.py
Project: Twitter-and-IMDB-Sentimental-Analytics
Author: abhinandanramesh
def get_classifier(self):
algo=self.algo
if algo=="GBT":
return GradientBoostingClassifier()
elif algo=="RF":
return RandomForestClassifier()
elif algo=="ADB":
return AdaBoostClassifier()
elif algo =="DT":
return DecisionTreeClassifier()
elif algo=="NB":
return BernoulliNB()
elif algo=="SGD":
return SGDClassifier()
elif algo=="SVC":
return LinearSVC()
elif algo=="MLPC":
return MLPClassifier(activation='logistic', batch_size='auto',
early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
learning_rate_init=0.1, max_iter=5000, random_state=1,
solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
return 0
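A minimal usage sketch for the method above. The owning class is not shown in this snippet, so the small holder class and names below are purely illustrative: any object exposing an `algo` attribute can map a short code such as "NB" to an unfitted scikit-learn estimator (only the branch actually taken needs its import).
from sklearn.naive_bayes import BernoulliNB

class _AlgoHolder:                      # hypothetical stand-in for the real owning class
    algo = "NB"
    get_classifier = get_classifier     # reuse the function defined above as a method

clf = _AlgoHolder().get_classifier()
print(type(clf).__name__)               # -> BernoulliNB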
def Fit(self, bags, bagData):
self.Bayes, self.GBayes = [], []
for i in xrange(10):
bnb = BernoulliNB()
gnb = GaussianNB()
x, y, xg = [], [], []
for j in xrange(10):
if i != j:
for vv in xrange(len(bagData[j][0])):
x.append(self.Convert(bagData[j][0][vv]))
xg.append(self.ConvertGauss(bagData[j][0][vv]))
y.extend(bagData[j][1])
bnb.fit(x, y)
gnb.fit(xg, y)
self.Bayes.append(bnb)
self.GBayes.append(gnb)
def test_discretenb_pickle():
# Test picklability of discrete naive Bayes classifiers
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
clf = cls().fit(X2, y2)
y_pred = clf.predict(X2)
store = BytesIO()
pickle.dump(clf, store)
clf = pickle.load(BytesIO(store.getvalue()))
assert_array_equal(y_pred, clf.predict(X2))
if cls is not GaussianNB:
# TODO re-enable me when partial_fit is implemented for GaussianNB
# Test pickling of estimator trained with partial_fit
clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
clf2.partial_fit(X2[3:], y2[3:])
store = BytesIO()
pickle.dump(clf2, store)
clf2 = pickle.load(BytesIO(store.getvalue()))
assert_array_equal(y_pred, clf2.predict(X2))
def test_input_check_partial_fit():
for cls in [BernoulliNB, MultinomialNB]:
# check shape consistency
assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
classes=np.unique(y2))
# classes is required for first call to partial fit
assert_raises(ValueError, cls().partial_fit, X2, y2)
# check consistency of consecutive classes values
clf = cls()
clf.partial_fit(X2, y2, classes=np.unique(y2))
assert_raises(ValueError, clf.partial_fit, X2, y2,
classes=np.arange(42))
# check consistency of input shape for partial_fit
assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)
# check consistency of input shape for predict
assert_raises(ValueError, clf.predict, X2[:, :-1])
def test_discretenb_provide_prior_with_partial_fit():
# Test whether discrete NB classes use provided prior
# when using partial_fit
iris = load_iris()
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
iris.data, iris.target, test_size=0.4, random_state=415)
for cls in [BernoulliNB, MultinomialNB]:
for prior in [None, [0.3, 0.3, 0.4]]:
clf_full = cls(class_prior=prior)
clf_full.fit(iris.data, iris.target)
clf_partial = cls(class_prior=prior)
clf_partial.partial_fit(iris_data1, iris_target1,
classes=[0, 1, 2])
clf_partial.partial_fit(iris_data2, iris_target2)
assert_array_almost_equal(clf_full.class_log_prior_,
clf_partial.class_log_prior_)
def test_feature_log_prob_bnb():
# Test for issue #4268.
# Tests that the feature log prob value computed by BernoulliNB when
# alpha=1.0 is equal to the expression given in Manning, Raghavan,
# and Schuetze's "Introduction to Information Retrieval" book:
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
Y = np.array([0, 0, 1, 2, 2])
# Fit Bernoulli NB w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Manually form the (log) numerator and denominator that
# constitute P(feature presence | class)
num = np.log(clf.feature_count_ + 1.0)
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
# Check manual estimate matches
assert_array_equal(clf.feature_log_prob_, (num - denom))
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
if info['is_sparse']==True:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
self.predict_method = self.model.predict # Always predict probabilities
else:
if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
def test_basic(self, single_chunk_binary_classification):
X, y = single_chunk_binary_classification
a = nb.PartialBernoulliNB(classes=[0, 1])
b = nb_.BernoulliNB()
a.fit(X, y)
b.partial_fit(X, y, classes=[0, 1])
assert_eq(a.coef_, b.coef_)
def generate_base_classification():
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
models = [
#(LinearSVC, params('C', 'loss')),
# (NuSVC, params('nu', 'kernel', 'degree')),
#(SVC, params('C', 'kernel')),
#(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
(DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
(RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
#(GaussianProcessClassifier, None),
(LogisticRegression, params('C', 'penalty')),
#(PassiveAggressiveClassifier, params('C', 'loss')),
#(RidgeClassifier, params('alpha')),
# we do in-place modification of what the method params return in order to add
# more loss functions that weren't defined in the method
#(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
# dict.update() returns None, so build the merged parameter dict explicitly instead
(KNeighborsClassifier, dict(params('n_neighbors', 'leaf_size', 'p'),
algorithm=['auto', 'brute', 'kd_tree', 'ball_tree'])),
(MultinomialNB, params('alpha')),
#(GaussianNB, None),
#(BernoulliNB, params('alpha'))
]
return models
def train_model(data, target):
"""
Splits the data into a training set and a test set, instantiates a Bernoulli
Naive Bayes classifier, trains it on the training set, and then evaluates the
model on the test set.
"""
# Using cross-validation
# TO TRY: stratification for dividing preclassified tweets into homogeneous subgroups before
# sampling in order to improve the representativeness of the sampling
train_tweets, validation_tweets, train_sentiment, validation_sentiment = cross_validation.train_test_split(data,
target,
test_size=0.4)
# Fit the Naive Bayes classifier with the training tweets and their corresponding sentiment
classifier = BernoulliNB().fit(train_tweets, train_sentiment)
predicted = classifier.predict(validation_tweets)
# Using the cross-validation split, evaluate the accuracy of the predicted tweets
evaluate_model(validation_sentiment, predicted)
# Pickling the classifier
pickle_file = open('nb_classifier.pickle', 'wb')
pickle.dump(classifier, pickle_file)
pickle_file.close()
return classifier
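The snippet above calls an evaluate_model helper that is not included in this excerpt. A minimal sketch of what such a helper might look like, assuming it simply reports standard classification metrics (the real project code may differ):
from sklearn import metrics

def evaluate_model(true_sentiment, predicted_sentiment):
    # Hypothetical stand-in for the project's evaluate_model helper.
    print("Accuracy:", metrics.accuracy_score(true_sentiment, predicted_sentiment))
    print(metrics.classification_report(true_sentiment, predicted_sentiment))
    print(metrics.confusion_matrix(true_sentiment, predicted_sentiment))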
################################################################################
def train_BNB(X, y):
bnb = BernoulliNB()
bnb.fit(X, y)  # fit on the function arguments (the original referenced undefined X_train/y_train)
return bnb
def test_BernoulliNB(*data):
'''
test BernoulliNB
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
cls=naive_bayes.BernoulliNB()
cls.fit(X_train,y_train)
print('Training Score: {0}'.format(cls.score(X_train,y_train)))
print('Testing Score: {0}'.format(cls.score(X_test, y_test)))
def test_BernoulliNB_alpha(*data):
'''
test the performance with different alpha
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
alphas=np.logspace(-2,5,num=200)
train_scores=[]
test_scores=[]
for alpha in alphas:
cls=naive_bayes.BernoulliNB(alpha=alpha)
cls.fit(X_train,y_train)
train_scores.append(cls.score(X_train,y_train))
test_scores.append(cls.score(X_test, y_test))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(alphas,train_scores,label="Training Score")
ax.plot(alphas,test_scores,label="Testing Score")
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("score")
ax.set_ylim(0,1.0)
ax.set_title("BernoulliNB")
ax.set_xscale("log")
ax.legend(loc="best")
plt.show()
def test_BernoulliNB_binarize(*data):
'''
test the performance with different binarize
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1
max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1
binarizes=np.linspace(min_x,max_x,endpoint=True,num=100)
train_scores=[]
test_scores=[]
for binarize in binarizes:
cls=naive_bayes.BernoulliNB(binarize=binarize)
cls.fit(X_train,y_train)
train_scores.append(cls.score(X_train,y_train))
test_scores.append(cls.score(X_test, y_test))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(binarizes,train_scores,label="Training Score")
ax.plot(binarizes,test_scores,label="Testing Score")
ax.set_xlabel("binarize")
ax.set_ylabel("score")
ax.set_ylim(0,1.0)
ax.set_xlim(min_x-1,max_x+1)
ax.set_title("BernoulliNB")
ax.legend(loc="best")
plt.show()
def sk_bernoulli_demo():
x = np.random.randint(2, size=(6, 100))
y = np.array([1, 2, 3, 4, 4, 5])
clf = BernoulliNB()
clf.fit(x, y)
print(clf.predict(x[2:3]))  # predict expects a 2-D input, so slice rather than index
def test_discrete_prior():
# Test whether class priors are properly set.
for cls in [BernoulliNB, MultinomialNB]:
clf = cls().fit(X2, y2)
assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
clf.class_log_prior_, 8)
def test_discretenb_partial_fit():
for cls in [MultinomialNB, BernoulliNB]:
yield check_partial_fit, cls
def test_input_check_fit():
# Test input checks for the fit method
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
# check shape consistency for number of samples at fit time
assert_raises(ValueError, cls().fit, X2, y2[:-1])
# check shape consistency for number of input features at predict time
clf = cls().fit(X2, y2)
assert_raises(ValueError, clf.predict, X2[:, :-1])
def test_discretenb_uniform_prior():
# Test whether discrete NB classes fit a uniform prior
# when fit_prior=False and class_prior=None
for cls in [BernoulliNB, MultinomialNB]:
clf = cls()
clf.set_params(fit_prior=False)
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_equal(prior, np.array([.5, .5]))
def test_discretenb_provide_prior():
# Test whether discrete NB classes use provided prior
for cls in [BernoulliNB, MultinomialNB]:
clf = cls(class_prior=[0.5, 0.5])
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_equal(prior, np.array([.5, .5]))
# Inconsistent number of classes with prior
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2])
assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1],
classes=[0, 1, 1])
def test_sample_weight_multiclass():
for cls in [BernoulliNB, MultinomialNB]:
# check shape consistency for number of samples at fit time
yield check_sample_weight_multiclass, cls
def test_coef_intercept_shape():
# coef_ and intercept_ should have shapes as in other linear models.
# Non-regression test for issue #2127.
X = [[1, 0, 0], [1, 1, 1]]
y = [1, 2] # binary classification
for clf in [MultinomialNB(), BernoulliNB()]:
clf.fit(X, y)
assert_equal(clf.coef_.shape, (1, 3))
assert_equal(clf.intercept_.shape, (1,))
def create_new_user(sess_id):
nb_clf = BernoulliNB()  # despite the original "gauss_clf" name, this is a Bernoulli naive Bayes model
user = User(session_id=sess_id)
db.session.add(user)
user_id = User.query.filter_by(session_id=sess_id).all()[0].id
classifier = Classifiers(user_id=user_id, pickled_classifier=nb_clf)
db.session.add(classifier)
db.session.commit()
return nb_clf
def get_pipeline_builder():
pipe_builder = PipelineBuilder()
# Feature Extraction
params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)
params = {}
pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)
params = {}
pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)
# Dimension Reduction
params = {}
pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)
params = {}
pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)
# Normalization
params = {}
pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)
params = {}
pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)
# Classification Models
params = {}
pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)
params = {}
pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)
params = {}
pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)
params = {}
pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)
return pipe_builder
def train(self):
self.pos = open("data/positive.txt", "r").read()
self.neg = open("data/negative.txt", "r").read()
self.words = []
self.doc = []
for p in self.pos.split('\n'):
self.doc.append((p, "pos"))
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in ["J"]:
self.words.append(w[0].lower())
for p in self.neg.split('\n'):
self.doc.append((p, "neg"))
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in ["J"]:
self.words.append(w[0].lower())
pickle.dump(self.doc, open("pickle/doc.pickle", "wb"))
self.words = nltk.FreqDist(self.words)
self.wordFeat = [word for (word, count) in self.words.most_common(5000)]
pickle.dump(self.wordFeat, open("pickle/wordFeat.pickle", "wb"))
self.featSet = [(trainClassifier().featureFind(rev, self.wordFeat), category) for (rev, category) in self.doc]
random.shuffle(self.featSet)
self.testSet = self.featSet[10000:]
self.trainSet = self.featSet[:10000]
pickle.dump(self.featSet, open("pickle/featSet.pickle", "wb"))
ONB = nltk.NaiveBayesClassifier.train(self.trainSet)
print("Original Naive Bayes Algo accuracy:", round((nltk.classify.accuracy(ONB, self.testSet)) * 100, 2), "%")
pickle.dump(ONB, open("pickle/ONB.pickle", "wb"))
MNB = SklearnClassifier(MultinomialNB())
MNB.train(self.trainSet)
print("MultinomialNB accuracy:", round((nltk.classify.accuracy(MNB, self.testSet)) * 100, 2), "%")
pickle.dump(MNB, open("pickle/MNB.pickle", "wb"))
BNB = SklearnClassifier(BernoulliNB())
BNB.train(self.trainSet)
print("BernoulliNB accuracy percent:", round((nltk.classify.accuracy(BNB, self.testSet)) * 100, 2), "%")
pickle.dump(BNB, open("pickle/BNB.pickle", "wb"))
LR = SklearnClassifier(LogisticRegression())
LR.train(self.trainSet)
print("LogisticRegression accuracy:", round((nltk.classify.accuracy(LR, self.testSet)) * 100, 2), "%")
pickle.dump(LR, open("pickle/LR.pickle", "wb"))
LSVC = SklearnClassifier(LinearSVC())
LSVC.train(self.trainSet)
print("LinearSVC accuracy:", round((nltk.classify.accuracy(LSVC, self.testSet)) * 100, 2), "%")
pickle.dump(LSVC, open("pickle/LSVC.pickle", "wb"))
SGDC = SklearnClassifier(SGDClassifier())
SGDC.train(self.trainSet)
print("SGDClassifier accuracy:", round(nltk.classify.accuracy(SGDC, self.testSet) * 100, 2), "%")
pickle.dump(SGDC, open("pickle/SGDC.pickle", "wb"))
def test_bnb():
# Tests that BernoulliNB when alpha=1.0 gives the same values as
# those given for the toy example in Manning, Raghavan, and
# Schuetze's "Introduction to Information Retrieval" book:
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
# Training data points are:
# Chinese Beijing Chinese (class: China)
# Chinese Chinese Shanghai (class: China)
# Chinese Macao (class: China)
# Tokyo Japan Chinese (class: Japan)
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
X = np.array([[1, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0],
[0, 1, 0, 1, 0, 0],
[0, 1, 1, 0, 0, 1]])
# Classes are China (0), Japan (1)
Y = np.array([0, 0, 0, 1])
# Fit BernoulliNB w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Check the class prior is correct
class_prior = np.array([0.75, 0.25])
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)
# Check the feature probabilities are correct
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
[1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
# Testing data point is:
# Chinese Chinese Chinese Tokyo Japan
X_test = np.array([[0, 1, 1, 0, 0, 1]])
# Check the predictive probabilities are correct
unnorm_predict_proba = np.array([[0.005183999999999999,
0.02194787379972565]])
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
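The unnormalized numbers above follow directly from the Bernoulli model: each class prior multiplied, feature by feature, by the feature probability when the feature is present and by its complement when it is absent. A short sketch, reusing the same tables as the test, that reproduces them:
import numpy as np

class_prior = np.array([0.75, 0.25])
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
                         [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
x_test = np.array([0, 1, 1, 0, 0, 1])   # Chinese Chinese Chinese Tokyo Japan

# Per-class likelihood: product of p_i where x_i == 1 and (1 - p_i) where x_i == 0.
likelihood = np.where(x_test == 1, feature_prob, 1.0 - feature_prob).prod(axis=1)
unnorm = class_prior * likelihood
print(unnorm)                  # ~[0.005184, 0.02194787]
print(unnorm / unnorm.sum())   # matches clf.predict_proba(X_test)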