def build_models_NLP(train_pos_vec, train_neg_vec):
"""
Returns a BernoulliNB and a LogisticRegression model, each fit to the training data.
"""
Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
# Use sklearn's BernoulliNB and LogisticRegression functions to fit two models to the training data.
# For BernoulliNB, use alpha=1.0 and binarize=None
# For LogisticRegression, pass no parameters
train_vec = []
train_vec.extend(train_pos_vec)
train_vec.extend(train_neg_vec)
nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
nb_model.fit(train_vec, Y)
lr_model = LogisticRegression()
lr_model.fit(train_vec, Y)
return nb_model, lr_model
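A quick, illustrative way to exercise the function above. The tiny binary vectors are hypothetical stand-ins for the project's real document feature vectors; none of the names below come from the original code.
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

# Hypothetical toy feature vectors for two "pos" and two "neg" documents.
train_pos_vec = [[1, 0, 1], [1, 1, 0]]
train_neg_vec = [[0, 0, 1], [0, 1, 1]]

nb_model, lr_model = build_models_NLP(train_pos_vec, train_neg_vec)
print(nb_model.predict([[1, 0, 0]]))   # likely ['pos'] on this toy data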
Example source code for the Python class BernoulliNB()
Source file: sentiment.py
Project: Twitter-and-IMDB-Sentimental-Analytics
Author: abhinandanramesh
def get_classifier(self):
algo=self.algo
if algo=="GBT":
return GradientBoostingClassifier()
elif algo=="RF":
return RandomForestClassifier()
elif algo=="ADB":
return AdaBoostClassifier()
elif algo =="DT":
return DecisionTreeClassifier()
elif algo=="NB":
return BernoulliNB()
elif algo=="SGD":
return SGDClassifier()
elif algo=="SVC":
return LinearSVC()
elif algo=="MLPC":
return MLPClassifier(activation='logistic', batch_size='auto',
early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
learning_rate_init=0.1, max_iter=5000, random_state=1,
solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
return 0
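A minimal usage sketch for the method above. The owning class is not shown in this snippet, so the small holder class and names below are purely illustrative: any object exposing an `algo` attribute can map a short code such as "NB" to an unfitted scikit-learn estimator (only the branch actually taken needs its import).
from sklearn.naive_bayes import BernoulliNB

class _AlgoHolder:                      # hypothetical stand-in for the real owning class
    algo = "NB"
    get_classifier = get_classifier     # reuse the function defined above as a method

clf = _AlgoHolder().get_classifier()
print(type(clf).__name__)               # -> BernoulliNB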
def Fit(self, bags, bagData):
self.Bayes, self.GBayes = [], []
for i in xrange(10):
bnb = BernoulliNB()
gnb = GaussianNB()
x, y, xg = [], [], []
for j in xrange(10):
if i != j:
for vv in xrange(len(bagData[j][0])):
x.append(self.Convert(bagData[j][0][vv]))
xg.append(self.ConvertGauss(bagData[j][0][vv]))
y.extend(bagData[j][1])
bnb.fit(x, y)
gnb.fit(xg, y)
self.Bayes.append(bnb)
self.GBayes.append(gnb)
def test_discretenb_pickle():
# Test picklability of discrete naive Bayes classifiers
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
clf = cls().fit(X2, y2)
y_pred = clf.predict(X2)
store = BytesIO()
pickle.dump(clf, store)
clf = pickle.load(BytesIO(store.getvalue()))
assert_array_equal(y_pred, clf.predict(X2))
if cls is not GaussianNB:
# TODO re-enable me when partial_fit is implemented for GaussianNB
# Test pickling of estimator trained with partial_fit
clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
clf2.partial_fit(X2[3:], y2[3:])
store = BytesIO()
pickle.dump(clf2, store)
clf2 = pickle.load(BytesIO(store.getvalue()))
assert_array_equal(y_pred, clf2.predict(X2))
def test_input_check_partial_fit():
for cls in [BernoulliNB, MultinomialNB]:
# check shape consistency
assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
classes=np.unique(y2))
# classes is required for first call to partial fit
assert_raises(ValueError, cls().partial_fit, X2, y2)
# check consistency of consecutive classes values
clf = cls()
clf.partial_fit(X2, y2, classes=np.unique(y2))
assert_raises(ValueError, clf.partial_fit, X2, y2,
classes=np.arange(42))
# check consistency of input shape for partial_fit
assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)
# check consistency of input shape for predict
assert_raises(ValueError, clf.predict, X2[:, :-1])
def test_discretenb_provide_prior_with_partial_fit():
# Test whether discrete NB classes use provided prior
# when using partial_fit
iris = load_iris()
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
iris.data, iris.target, test_size=0.4, random_state=415)
for cls in [BernoulliNB, MultinomialNB]:
for prior in [None, [0.3, 0.3, 0.4]]:
clf_full = cls(class_prior=prior)
clf_full.fit(iris.data, iris.target)
clf_partial = cls(class_prior=prior)
clf_partial.partial_fit(iris_data1, iris_target1,
classes=[0, 1, 2])
clf_partial.partial_fit(iris_data2, iris_target2)
assert_array_almost_equal(clf_full.class_log_prior_,
clf_partial.class_log_prior_)
def test_feature_log_prob_bnb():
# Test for issue #4268.
# Tests that the feature log prob value computed by BernoulliNB when
# alpha=1.0 is equal to the expression given in Manning, Raghavan,
# and Schuetze's "Introduction to Information Retrieval" book:
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
Y = np.array([0, 0, 1, 2, 2])
# Fit Bernoulli NB w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Manually form the (log) numerator and denominator that
# constitute P(feature presence | class)
num = np.log(clf.feature_count_ + 1.0)
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
# Check manual estimate matches
assert_array_equal(clf.feature_log_prob_, (num - denom))
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
if info['is_sparse']==True:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
self.predict_method = self.model.predict # Always predict probabilities
else:
if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
def test_basic(self, single_chunk_binary_classification):
X, y = single_chunk_binary_classification
a = nb.PartialBernoulliNB(classes=[0, 1])
b = nb_.BernoulliNB()
a.fit(X, y)
b.partial_fit(X, y, classes=[0, 1])
assert_eq(a.coef_, b.coef_)
def generate_base_classification():
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
models = [
#(LinearSVC, params('C', 'loss')),
# (NuSVC, params('nu', 'kernel', 'degree')),
#(SVC, params('C', 'kernel')),
#(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
(DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
(RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
#(GaussianProcessClassifier, None),
(LogisticRegression, params('C', 'penalty')),
#(PassiveAggressiveClassifier, params('C', 'loss')),
#(RidgeClassifier, params('alpha')),
# we do in-place modification of what the method params return in order to add
# more loss functions that weren't defined in the method
#(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
# dict.update() returns None, so build the merged parameter dict explicitly instead
(KNeighborsClassifier, dict(params('n_neighbors', 'leaf_size', 'p'),
algorithm=['auto', 'brute', 'kd_tree', 'ball_tree'])),
(MultinomialNB, params('alpha')),
#(GaussianNB, None),
#(BernoulliNB, params('alpha'))
]
return models
def train_model(data, target):
"""
Splits the data into a training set and a test set, instantiates a Bernoulli
Naive Bayes classifier, trains it on the training set, and then evaluates the
model on the test set.
"""
# Using cross-validation
# TO TRY: stratification for dividing preclassified tweets into homogeneous subgroups before
# sampling in order to improve the representativeness of the sampling
train_tweets, validation_tweets, train_sentiment, validation_sentiment = cross_validation.train_test_split(data,
target,
test_size=0.4)
# Fit the Naive Bayes classifier with the training tweets and their corresponding sentiment
classifier = BernoulliNB().fit(train_tweets, train_sentiment)
predicted = classifier.predict(validation_tweets)
# Using the cross-validation split, evaluate the accuracy of the predicted tweets
evaluate_model(validation_sentiment, predicted)
# Pickling the classifier
pickle_file = open('nb_classifier.pickle', 'wb')
pickle.dump(classifier, pickle_file)
pickle_file.close()
return classifier
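The snippet above calls an evaluate_model helper that is not included in this excerpt. A minimal sketch of what such a helper might look like, assuming it simply reports standard classification metrics (the real project code may differ):
from sklearn import metrics

def evaluate_model(true_sentiment, predicted_sentiment):
    # Hypothetical stand-in for the project's evaluate_model helper.
    print("Accuracy:", metrics.accuracy_score(true_sentiment, predicted_sentiment))
    print(metrics.classification_report(true_sentiment, predicted_sentiment))
    print(metrics.confusion_matrix(true_sentiment, predicted_sentiment))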
################################################################################
def train_BNB(X, y):
bnb = BernoulliNB()
bnb.fit(X, y)  # fit on the function arguments (the original referenced undefined X_train/y_train)
return bnb
def test_BernoulliNB(*data):
'''
test BernoulliNB
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
cls=naive_bayes.BernoulliNB()
cls.fit(X_train,y_train)
print('Training Score: {0}'.format(cls.score(X_train,y_train)))
print('Testing Score: {0}'.format(cls.score(X_test, y_test)))
def test_BernoulliNB_alpha(*data):
'''
test the performance with different alpha
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
alphas=np.logspace(-2,5,num=200)
train_scores=[]
test_scores=[]
for alpha in alphas:
cls=naive_bayes.BernoulliNB(alpha=alpha)
cls.fit(X_train,y_train)
train_scores.append(cls.score(X_train,y_train))
test_scores.append(cls.score(X_test, y_test))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(alphas,train_scores,label="Training Score")
ax.plot(alphas,test_scores,label="Testing Score")
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("score")
ax.set_ylim(0,1.0)
ax.set_title("BernoulliNB")
ax.set_xscale("log")
ax.legend(loc="best")
plt.show()
def test_BernoulliNB_binarize(*data):
'''
test the performance with different binarize
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1
max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1
binarizes=np.linspace(min_x,max_x,endpoint=True,num=100)
train_scores=[]
test_scores=[]
for binarize in binarizes:
cls=naive_bayes.BernoulliNB(binarize=binarize)
cls.fit(X_train,y_train)
train_scores.append(cls.score(X_train,y_train))
test_scores.append(cls.score(X_test, y_test))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(binarizes,train_scores,label="Training Score")
ax.plot(binarizes,test_scores,label="Testing Score")
ax.set_xlabel("binarize")
ax.set_ylabel("score")
ax.set_ylim(0,1.0)
ax.set_xlim(min_x-1,max_x+1)
ax.set_title("BernoulliNB")
ax.legend(loc="best")
plt.show()
def sk_bernoulli_demo():
x = np.random.randint(2, size=(6, 100))
y = np.array([1, 2, 3, 4, 4, 5])
clf = BernoulliNB()
clf.fit(x, y)
print(clf.predict(x[2:3]))  # predict expects a 2-D input, so slice rather than index
def test_discrete_prior():
# Test whether class priors are properly set.
for cls in [BernoulliNB, MultinomialNB]:
clf = cls().fit(X2, y2)
assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
clf.class_log_prior_, 8)
def test_discretenb_partial_fit():
for cls in [MultinomialNB, BernoulliNB]:
yield check_partial_fit, cls
def test_input_check_fit():
# Test input checks for the fit method
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
# check shape consistency for number of samples at fit time
assert_raises(ValueError, cls().fit, X2, y2[:-1])
# check shape consistency for number of input features at predict time
clf = cls().fit(X2, y2)
assert_raises(ValueError, clf.predict, X2[:, :-1])
def test_discretenb_uniform_prior():
# Test whether discrete NB classes fit a uniform prior
# when fit_prior=False and class_prior=None
for cls in [BernoulliNB, MultinomialNB]:
clf = cls()
clf.set_params(fit_prior=False)
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_equal(prior, np.array([.5, .5]))
def test_discretenb_provide_prior():
# Test whether discrete NB classes use provided prior
for cls in [BernoulliNB, MultinomialNB]:
clf = cls(class_prior=[0.5, 0.5])
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_equal(prior, np.array([.5, .5]))
# Inconsistent number of classes with prior
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2])
assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1],
classes=[0, 1, 1])
def test_sample_weight_multiclass():
for cls in [BernoulliNB, MultinomialNB]:
# check shape consistency for number of samples at fit time
yield check_sample_weight_multiclass, cls
def test_coef_intercept_shape():
# coef_ and intercept_ should have shapes as in other linear models.
# Non-regression test for issue #2127.
X = [[1, 0, 0], [1, 1, 1]]
y = [1, 2] # binary classification
for clf in [MultinomialNB(), BernoulliNB()]:
clf.fit(X, y)
assert_equal(clf.coef_.shape, (1, 3))
assert_equal(clf.intercept_.shape, (1,))
def create_new_user(sess_id):
nb_clf = BernoulliNB()  # despite the original "gauss_clf" name, this is a Bernoulli naive Bayes model
user = User(session_id=sess_id)
db.session.add(user)
user_id = User.query.filter_by(session_id=sess_id).all()[0].id
classifier = Classifiers(user_id=user_id, pickled_classifier=nb_clf)
db.session.add(classifier)
db.session.commit()
return nb_clf
def get_pipeline_builder():
pipe_builder = PipelineBuilder()
# Feature Extraction
params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)
params = {}
pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)
params = {}
pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)
# Dimension Reduction
params = {}
pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)
params = {}
pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)
# Normalization
params = {}
pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)
params = {}
pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)
# Classification Models
params = {}
pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)
params = {}
pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)
params = {}
pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)
params = {}
pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)
return pipe_builder
def train(self):
self.pos = open("data/positive.txt", "r").read()
self.neg = open("data/negative.txt", "r").read()
self.words = []
self.doc = []
for p in self.pos.split('\n'):
self.doc.append((p, "pos"))
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in ["J"]:
self.words.append(w[0].lower())
for p in self.neg.split('\n'):
self.doc.append((p, "neg"))
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in ["J"]:
self.words.append(w[0].lower())
pickle.dump(self.doc, open("pickle/doc.pickle", "wb"))
self.words = nltk.FreqDist(self.words)
self.wordFeat = [word for (word, count) in self.words.most_common(5000)]
pickle.dump(self.wordFeat, open("pickle/wordFeat.pickle", "wb"))
self.featSet = [(trainClassifier().featureFind(rev, self.wordFeat), category) for (rev, category) in self.doc]
random.shuffle(self.featSet)
self.testSet = self.featSet[10000:]
self.trainSet = self.featSet[:10000]
pickle.dump(self.featSet, open("pickle/featSet.pickle", "wb"))
ONB = nltk.NaiveBayesClassifier.train(self.trainSet)
print("Original Naive Bayes Algo accuracy:", round((nltk.classify.accuracy(ONB, self.testSet)) * 100, 2), "%")
pickle.dump(ONB, open("pickle/ONB.pickle", "wb"))
MNB = SklearnClassifier(MultinomialNB())
MNB.train(self.trainSet)
print("MultinomialNB accuracy:", round((nltk.classify.accuracy(MNB, self.testSet)) * 100, 2), "%")
pickle.dump(MNB, open("pickle/MNB.pickle", "wb"))
BNB = SklearnClassifier(BernoulliNB())
BNB.train(self.trainSet)
print("BernoulliNB accuracy percent:", round((nltk.classify.accuracy(BNB, self.testSet)) * 100, 2), "%")
pickle.dump(BNB, open("pickle/BNB.pickle", "wb"))
LR = SklearnClassifier(LogisticRegression())
LR.train(self.trainSet)
print("LogisticRegression accuracy:", round((nltk.classify.accuracy(LR, self.testSet)) * 100, 2), "%")
pickle.dump(LR, open("pickle/LR.pickle", "wb"))
LSVC = SklearnClassifier(LinearSVC())
LSVC.train(self.trainSet)
print("LinearSVC accuracy:", round((nltk.classify.accuracy(LSVC, self.testSet)) * 100, 2), "%")
pickle.dump(LSVC, open("pickle/LSVC.pickle", "wb"))
SGDC = SklearnClassifier(SGDClassifier())
SGDC.train(self.trainSet)
print("SGDClassifier accuracy:", round(nltk.classify.accuracy(SGDC, self.testSet) * 100, 2), "%")
pickle.dump(SGDC, open("pickle/SGDC.pickle", "wb"))
def test_bnb():
# Tests that BernoulliNB when alpha=1.0 gives the same values as
# those given for the toy example in Manning, Raghavan, and
# Schuetze's "Introduction to Information Retrieval" book:
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
# Training data points are:
# Chinese Beijing Chinese (class: China)
# Chinese Chinese Shanghai (class: China)
# Chinese Macao (class: China)
# Tokyo Japan Chinese (class: Japan)
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
X = np.array([[1, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0],
[0, 1, 0, 1, 0, 0],
[0, 1, 1, 0, 0, 1]])
# Classes are China (0), Japan (1)
Y = np.array([0, 0, 0, 1])
# Fit BernoulliNB w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Check the class prior is correct
class_prior = np.array([0.75, 0.25])
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)
# Check the feature probabilities are correct
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
[1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
# Testing data point is:
# Chinese Chinese Chinese Tokyo Japan
X_test = np.array([[0, 1, 1, 0, 0, 1]])
# Check the predictive probabilities are correct
unnorm_predict_proba = np.array([[0.005183999999999999,
0.02194787379972565]])
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
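The unnormalized numbers above follow directly from the Bernoulli model: each class prior multiplied, feature by feature, by the feature probability when the feature is present and by its complement when it is absent. A short sketch, reusing the same tables as the test, that reproduces them:
import numpy as np

class_prior = np.array([0.75, 0.25])
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
                         [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
x_test = np.array([0, 1, 1, 0, 0, 1])   # Chinese Chinese Chinese Tokyo Japan

# Per-class likelihood: product of p_i where x_i == 1 and (1 - p_i) where x_i == 0.
likelihood = np.where(x_test == 1, feature_prob, 1.0 - feature_prob).prod(axis=1)
unnorm = class_prior * likelihood
print(unnorm)                  # ~[0.005184, 0.02194787]
print(unnorm / unnorm.sum())   # matches clf.predict_proba(X_test)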