def passive_aggressive_train(self):
    '''Trains passive aggressive classifier
    '''
    self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
    self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
    y_dist = self._clf.decision_function(self._term_doc_matrix._X)
    pos_ecdf = ECDF(y_dist[y_dist >= 0])
    neg_ecdf = ECDF(y_dist[y_dist <= 0])

    def proba_function(distance_from_hyperplane):
        if distance_from_hyperplane > 0:
            return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
        elif distance_from_hyperplane < 0:
            # use the ECDF of the *negative* distances here; pos_ecdf would
            # collapse every negative distance to a probability of 0 and
            # leave neg_ecdf as dead code
            return neg_ecdf(distance_from_hyperplane) / 2.
        return 0.5

    self._proba = proba_function
    return self
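# A minimal standalone sketch (not from the library above; the sample
# distances are made up) of what the ECDF trick in proba_function does:
# positive distances map into [0.5, 1], negative ones into [0, 0.5], and
# the mapping is monotone in the distance from the hyperplane.
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

y_dist = np.array([-2.0, -0.5, -0.1, 0.3, 0.9, 1.7])  # hypothetical distances
pos_ecdf = ECDF(y_dist[y_dist >= 0])
neg_ecdf = ECDF(y_dist[y_dist <= 0])

def to_proba(d):
    if d > 0:
        return pos_ecdf(d) / 2. + 0.5
    elif d < 0:
        return neg_ecdf(d) / 2.
    return 0.5

print([float(to_proba(d)) for d in y_dist])  # increases with d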
# Python PassiveAggressiveClassifier() example source code
def test_main(self):
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    # apply the same tf-idf transform before scoring; the classifier was
    # fit on transformed features
    dec = clf.decision_function(tfidf.transform(X_to_predict))
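# A self-contained sketch of the same pattern on toy data (the texts and
# labels are made up, and max_iter is used in place of the older n_iter
# argument, assuming a recent scikit-learn): fit on l1-normalized tf-idf
# features, then push unseen text through the *same* vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

texts = ['soldiers march in ranks', 'the senate debates the bill',
         'troops march at dawn', 'the bill passes the senate']
labels = ['war', 'politics', 'war', 'politics']

vec = TfidfVectorizer(norm='l1')
X_toy = vec.fit_transform(texts)
clf = PassiveAggressiveClassifier(C=0.5, max_iter=50, random_state=0)
clf.fit(X_toy, labels)
print(clf.predict(vec.transform(['did sometimes march'])))  # likely 'war'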
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)
    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)
    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
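# Why batch and incremental learning can agree here (a sketch using only
# public scikit-learn API; it should print True, though exact equality is
# an assumption about this configuration): with shuffle=False and a single
# epoch, fit() on a slice of the data performs the same updates as one
# partial_fit() pass over it.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import PassiveAggressiveClassifier

Xs, ys = make_classification(n_samples=30, n_features=1, n_informative=1,
                             n_redundant=0, n_clusters_per_class=1,
                             random_state=0)
a = PassiveAggressiveClassifier(max_iter=1, shuffle=False, tol=None,
                                random_state=0).fit(Xs[:20], ys[:20])
b = PassiveAggressiveClassifier(max_iter=1, shuffle=False, tol=None,
                                random_state=0)
b.partial_fit(Xs[:20], ys[:20], classes=np.unique(ys))
print(np.allclose(a.coef_, b.coef_))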
def test_class_weights():
    # Test class weights.
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                   [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]
    clf = PassiveAggressiveClassifier(C=0.1, n_iter=100, class_weight=None,
                                      random_state=100)
    clf.fit(X2, y2)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
    # we give a small weight to class 1
    clf = PassiveAggressiveClassifier(C=0.1, n_iter=100,
                                      class_weight={1: 0.001},
                                      random_state=100)
    clf.fit(X2, y2)
    # now the hyperplane should rotate clockwise and
    # the prediction on this point should shift
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
def test_equal_class_weight():
    X2 = [[1, 0], [1, 0], [0, 1], [0, 1]]
    y2 = [0, 0, 1, 1]
    clf = PassiveAggressiveClassifier(C=0.1, n_iter=1000, class_weight=None)
    clf.fit(X2, y2)
    # Already balanced, so "balanced" weights should have no effect
    clf_balanced = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                               class_weight="balanced")
    clf_balanced.fit(X2, y2)
    clf_weighted = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                               class_weight={0: 0.5, 1: 0.5})
    clf_weighted.fit(X2, y2)
    # should be similar up to some epsilon due to learning rate schedule
    assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
    assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2)
def test_basic(self, single_chunk_classification):
    X, y = single_chunk_classification
    a = lm.PartialPassiveAggressiveClassifier(classes=[0, 1],
                                              random_state=0,
                                              max_iter=100, tol=1e-3)
    b = lm_.PassiveAggressiveClassifier(random_state=0, max_iter=100,
                                        tol=1e-3)
    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])
    assert_estimator_equal(a, b, exclude=['loss_function_'])
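# The same out-of-core idea with plain scikit-learn, as a sketch: stream
# chunks through partial_fit, passing the full class list so the first
# chunk need not contain every label.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import PassiveAggressiveClassifier

X_stream, y_stream = make_classification(n_samples=200, random_state=0)
classes = np.unique(y_stream)
clf = PassiveAggressiveClassifier(random_state=0)
for start in range(0, len(X_stream), 50):  # four chunks of 50 rows each
    clf.partial_fit(X_stream[start:start + 50], y_stream[start:start + 50],
                    classes=classes)
print(clf.score(X_stream, y_stream))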
def run_cat(filename, modelname, fileout, embeddings, new_run=True, run_parse=True,
            model_type='logreg', C=10.0,
            alpha=1.0, cutoff=0.50, n_iter=1):
    # pull relevant data and run parsing and classification
    df = pd.read_csv(filename)
    if len(df.columns) == 2:  # make sure columns have the right names
        df.columns = ['raw', 'amount']
    if new_run:  # initialize the model
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
    else:  # load a saved, pre-trained model
        modelFileLoad = open(modelname, 'rb')
        model = pickle.load(modelFileLoad)
    fileCities = dirs.data_dir + 'cities_by_state.pickle'
    us_cities = pd.read_pickle(fileCities)
    df = cat_df(df, model, us_cities, embeddings, new_run, run_parse, cutoff=cutoff,
                model_type=model_type)
    df.to_csv(fileout, index=False)
    # save the trained model so a later run can resume from it
    modelFileSave = open(modelname, 'wb')
    pickle.dump(model, modelFileSave)
    modelFileSave.close()
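# A minimal sketch of the save/load pattern run_cat relies on (the file
# name is made up): pickle the model after training so a later run with
# new_run=False can reload it and, with warm_start=True, keep training.
import pickle
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier(C=10.0, warm_start=True)
with open('pa_model.pickle', 'wb') as modelFileSave:  # hypothetical path
    pickle.dump(model, modelFileSave)
with open('pa_model.pickle', 'rb') as modelFileLoad:
    model = pickle.load(modelFileLoad)  # resumes from the saved state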
# ------ testing functions
def generate_base_classification():
    from sklearn.svm import LinearSVC, NuSVC, SVC
    from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
    models = [
        #(LinearSVC, params('C', 'loss')),
        # (NuSVC, params('nu', 'kernel', 'degree')),
        #(SVC, params('C', 'kernel')),
        #(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
        #(GaussianProcessClassifier, None),
        (LogisticRegression, params('C', 'penalty')),
        #(PassiveAggressiveClassifier, params('C', 'loss')),
        #(RidgeClassifier, params('alpha')),
        # we'd like to extend what params() returns with loss functions that
        # weren't defined in the method; note that list.extend() and
        # dict.update() return None, so the merge must build a new dict
        # rather than mutate in place inside the tuple
        #(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
        (KNeighborsClassifier, dict(params('n_neighbors', 'leaf_size', 'p'),
                                    algorithm=['auto', 'brute', 'kd_tree', 'ball_tree'])),
        (MultinomialNB, params('alpha')),
        #(GaussianNB, None),
        #(BernoulliNB, params('alpha'))
    ]
    return models
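# One way the (estimator, grid) pairs above are typically consumed (a
# sketch; it assumes params() returns a dict of hyperparameter lists that
# is usable directly as a GridSearchCV param_grid):
from sklearn.model_selection import GridSearchCV

def fit_all(models, X, y):
    fitted = []
    for estimator_cls, grid in models:
        search = GridSearchCV(estimator_cls(), grid or {}, cv=3)
        fitted.append(search.fit(X, y))
    return fitted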
def test_classifier_accuracy():
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            clf = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                              fit_intercept=fit_intercept,
                                              random_state=0)
            clf.fit(data, y)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          random_state=0)
        for t in range(30):
            clf.partial_fit(data, y, classes)
        score = clf.score(data, y)
        assert_greater(score, 0.79)
def test_classifier_refit():
    # Classifier can be retrained on different labels and features.
    clf = PassiveAggressiveClassifier().fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
    clf.fit(X[:, :-1], iris.target_names[y])
    assert_array_equal(clf.classes_, iris.target_names)
def test_classifier_undefined_methods():
    clf = PassiveAggressiveClassifier()
    for meth in ("predict_proba", "predict_log_proba", "transform"):
        assert_raises(AttributeError, lambda x: getattr(clf, x), meth)
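# Because predict_proba is unavailable (see the test above),
# decision_function is the supported way to get graded outputs from this
# model; a sketch on made-up points:
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

X2 = np.array([[0.0, 0.0], [0.5, 0.5], [2.0, 2.0], [3.0, 3.0]])
y2 = np.array([0, 0, 1, 1])
clf = PassiveAggressiveClassifier(random_state=0).fit(X2, y2)
print(clf.decision_function(X2))  # signed distances, not probabilities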
def test_partial_fit_weight_class_balanced():
    # partial_fit with class_weight='balanced' not supported
    clf = PassiveAggressiveClassifier(class_weight="balanced")
    assert_raises(ValueError, clf.partial_fit, X, y, classes=np.unique(y))
def test_wrong_class_weight_label():
    # ValueError due to wrong class_weight label.
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                   [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]
    clf = PassiveAggressiveClassifier(class_weight={0: 0.5})
    assert_raises(ValueError, clf.fit, X2, y2)
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
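# Tying the last two tests together, a sketch: SelectFromModel keeps one
# underlying estimator across repeated fits, and transform() retains the
# columns whose |coef_| clears the threshold (the mean by default here).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import PassiveAggressiveClassifier

X_sel, y_sel = make_classification(n_samples=100, n_features=10,
                                   random_state=0)
sfm = SelectFromModel(PassiveAggressiveClassifier(random_state=0))
sfm.fit(X_sel, y_sel)
print(X_sel.shape, '->', sfm.transform(X_sel).shape)  # fewer columns kept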