def convert(model, feature_names, target):
    """Convert a Nu-Support Vector Classification (NuSVC) model to the protobuf spec.

    Parameters
    ----------
    model: NuSVC
        A trained NuSVC model.
    feature_names: [str], optional (default=None)
        Name of the input columns.
    target: str, optional (default=None)
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')
    _sklearn_util.check_expected_type(model, _NuSVC)
    return _SVC.convert(model, feature_names, target)
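# A minimal usage sketch of the converter above. Assumptions: convert() is the
# sklearn-to-protobuf NuSVC converter shown here (coremltools-style), and the
# iris feature names below are illustrative, not taken from the source.
from sklearn.datasets import load_iris
from sklearn.svm import NuSVC

iris = load_iris()
clf = NuSVC(nu=0.5, kernel='rbf')
clf.fit(iris.data, iris.target)

# Produces a protobuf Model_pb spec for the fitted classifier.
spec = convert(clf, feature_names=['sepal_l', 'sepal_w', 'petal_l', 'petal_w'],
               target='species')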
def __init__(self, path, etype, **kwargs):
    super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
    self.basedir = "models/ensemble/"
    self.goldstd = kwargs.get("goldstd")
    self.data = {}
    self.offsets = []
    self.pipeline = Pipeline(
        [
            #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
            #('clf', SGDClassifier())
            # ('clf', svm.NuSVC(nu=0.01 ))
            ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
            # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
            # ('clf', MultinomialNB())
            # ('clf', GaussianNB())
            #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
            #('clf', svm.SVC(kernel="linear", C=2))
            #('clf', DummyClassifier(strategy="constant", constant=True))
        ])
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                              #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.NuSVC(nu=0.01 ))
                              #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                             ])
def _train(self, X_matrix, y, **kwargs):
    """Train a NuSVC model.

    Parameters:
        X_matrix (numpy.array): - feature matrix used for training
        y (numpy.array): - label vector used for training

    Returns:
        sklearn.model: - the trained sklearn model
    """
    from sklearn.svm import NuSVC
    model = NuSVC(**kwargs)
    model.fit(X_matrix, y)
    return model
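# A minimal sketch of what a call through _train amounts to; the toy data and
# the nu / kernel values below are illustrative assumptions, not from the source.
import numpy as np
from sklearn.svm import NuSVC

X_matrix = np.array([[0.0, 0.1], [0.2, 0.9], [1.0, 1.1], [0.9, 0.8]])
y = np.array([0, 0, 1, 1])

# Equivalent to _train(X_matrix, y, nu=0.5, kernel='rbf') on the enclosing class:
# keyword arguments are forwarded straight into the NuSVC constructor.
model = NuSVC(nu=0.5, kernel='rbf')
model.fit(X_matrix, y)
print(model.predict(X_matrix))  # typically [0 0 1 1] on this separable toy data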
def test_probability():
    # Predict probabilities using SVC
    # This uses cross validation, so we use a slightly bigger testing set.

    for clf in (svm.SVC(probability=True, random_state=0, C=1.0),
                svm.NuSVC(probability=True, random_state=0)):
        clf.fit(iris.data, iris.target)

        prob_predict = clf.predict_proba(iris.data)
        assert_array_almost_equal(
            np.sum(prob_predict, 1), np.ones(iris.data.shape[0]))
        assert_true(np.mean(np.argmax(prob_predict, 1)
                            == clf.predict(iris.data)) > 0.9)

        assert_almost_equal(clf.predict_proba(iris.data),
                            np.exp(clf.predict_log_proba(iris.data)), 8)
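# Why the test above only requires > 0.9 agreement between argmax(predict_proba)
# and predict: with probability=True, scikit-learn fits cross-validated Platt
# scaling on top of the decision values, so the calibrated probabilities can
# occasionally disagree with the raw prediction. A standalone sketch (iris is
# loaded explicitly here, since the test fixture above is not shown):
import numpy as np
from sklearn import datasets, svm

iris = datasets.load_iris()
clf = svm.NuSVC(probability=True, random_state=0).fit(iris.data, iris.target)

proba = clf.predict_proba(iris.data)
print(np.allclose(proba.sum(axis=1), 1.0))                      # rows sum to one
print((proba.argmax(axis=1) == clf.predict(iris.data)).mean())  # close to, but not always, 1.0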
def test_conversion_bad_inputs(self):
    from sklearn.preprocessing import OneHotEncoder

    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = NuSVC()
        spec = scikit_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = scikit_converter.convert(model, 'data', 'out')
def generate_base_classification():
    from sklearn.svm import LinearSVC, NuSVC, SVC
    from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

    models = [
        #(LinearSVC, params('C', 'loss')),
        # (NuSVC, params('nu', 'kernel', 'degree')),
        #(SVC, params('C', 'kernel')),
        #(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
        #(GaussianProcessClassifier, None),
        (LogisticRegression, params('C', 'penalty')),
        #(PassiveAggressiveClassifier, params('C', 'loss')),
        #(RidgeClassifier, params('alpha')),
        # we do in-place modification of what the method params return in order to add
        # more loss functions that weren't defined in the method
        #(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
        # note: dict.update() returns None, so build a merged dict instead of
        # chaining .update() onto the params() call
        (KNeighborsClassifier, dict(params('n_neighbors', 'leaf_size', 'p'),
                                    algorithm=['auto', 'brute', 'kd_tree', 'ball_tree'])),
        (MultinomialNB, params('alpha')),
        #(GaussianNB, None),
        #(BernoulliNB, params('alpha'))
    ]
    return models
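# params() is not defined in this snippet; it is assumed to be a project-local
# helper that maps hyperparameter names to candidate search values. A minimal,
# hypothetical sketch of such a helper (value ranges are illustrative only):
_PARAM_GRID = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'nu': [0.25, 0.5, 0.75],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50, 100, 200],
    'penalty': ['l1', 'l2'],
    'alpha': [1e-4, 1e-2, 1.0],
    'loss': ['hinge', 'squared_hinge'],
    'n_neighbors': [3, 5, 11],
    'leaf_size': [15, 30, 60],
    'p': [1, 2],
}

def params(*names):
    """Return a dict of {hyperparameter name: candidate values} for a search."""
    return {name: list(_PARAM_GRID[name]) for name in names}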
def __init__(self, classifier):
    assert isinstance(classifier, (SVC, LinearSVC, NuSVC)), \
        "Classifier must be a sklearn's SVM classifier (SVC, LinearSVC, NuSVC)."
    self.clf = classifier
    self.model = None
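# A hypothetical usage sketch; the enclosing class is not named in the snippet
# above, so 'SVMWrapper' below is a stand-in that simply restates its __init__.
from sklearn.svm import SVC, LinearSVC, NuSVC

class SVMWrapper(object):
    def __init__(self, classifier):
        assert isinstance(classifier, (SVC, LinearSVC, NuSVC)), \
            "Classifier must be a sklearn's SVM classifier (SVC, LinearSVC, NuSVC)."
        self.clf = classifier
        self.model = None

wrapper = SVMWrapper(NuSVC(nu=0.25))   # accepted
# wrapper = SVMWrapper(object())       # would raise AssertionError
print(type(wrapper.clf).__name__)      # NuSVC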
def test_bad_input():
    # Test that it gives proper exception on deficient input
    # impossible value of C
    assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y)

    # impossible value of nu
    clf = svm.NuSVC(nu=0.0)
    assert_raises(ValueError, clf.fit, X, Y)

    Y2 = Y[:-1]  # wrong dimensions for labels
    assert_raises(ValueError, clf.fit, X, Y2)

    # Test with arrays that are non-contiguous.
    for clf in (svm.SVC(), svm.LinearSVC(random_state=0)):
        Xf = np.asfortranarray(X)
        assert_false(Xf.flags['C_CONTIGUOUS'])
        yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T)
        yf = yf[:, -1]
        assert_false(yf.flags['F_CONTIGUOUS'])
        assert_false(yf.flags['C_CONTIGUOUS'])
        clf.fit(Xf, yf)
        assert_array_equal(clf.predict(T), true_result)

    # error for precomputed kernels
    clf = svm.SVC(kernel='precomputed')
    assert_raises(ValueError, clf.fit, X, Y)

    # sample_weight bad dimensions
    clf = svm.SVC()
    assert_raises(ValueError, clf.fit, X, Y, sample_weight=range(len(X) - 1))

    # predict with sparse input when trained with dense
    clf = svm.SVC().fit(X, Y)
    assert_raises(ValueError, clf.predict, sparse.lil_matrix(X))

    Xt = np.array(X).T
    clf.fit(np.dot(X, Xt), Y)
    assert_raises(ValueError, clf.predict, X)

    clf = svm.SVC()
    clf.fit(X, Y)
    assert_raises(ValueError, clf.predict, Xt)
def test_immutable_coef_property():
    # Check that primal coef modifications are not silently ignored
    svms = [
        svm.SVC(kernel='linear').fit(iris.data, iris.target),
        svm.NuSVC(kernel='linear').fit(iris.data, iris.target),
        svm.SVR(kernel='linear').fit(iris.data, iris.target),
        svm.NuSVR(kernel='linear').fit(iris.data, iris.target),
        svm.OneClassSVM(kernel='linear').fit(iris.data),
    ]
    for clf in svms:
        assert_raises(AttributeError, clf.__setattr__, 'coef_', np.arange(3))
        assert_raises((RuntimeError, ValueError),
                      clf.coef_.__setitem__, (0, 0), 0)
def test_decision_function_shape_two_class():
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for estimator in [svm.SVC, svm.NuSVC]:
            clf = OneVsRestClassifier(estimator(
                decision_function_shape="ovr")).fit(X, y)
            assert_equal(len(clf.predict(X)), len(y))
def test_error():
    # Test that it gives proper exception on deficient input
    # impossible value of C
    assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y)
    # impossible value of nu
    clf = svm.NuSVC(nu=0.0)
    assert_raises(ValueError, clf.fit, X_sp, Y)

    Y2 = Y[:-1]  # wrong dimensions for labels
    assert_raises(ValueError, clf.fit, X_sp, Y2)

    clf = svm.SVC()
    clf.fit(X_sp, Y)
    assert_array_equal(clf.predict(T), true_result)
def _evaluation_test_helper(self, class_labels, use_probability_estimates, allow_slow, allowed_prob_delta=0.00001):
    # Parameters to test
    kernel_parameters = [{}, {'kernel': 'rbf', 'gamma': 1.2},
                         {'kernel': 'linear'},
                         {'kernel': 'poly'}, {'kernel': 'poly', 'degree': 2}, {'kernel': 'poly', 'gamma': 0.75},
                         {'kernel': 'poly', 'degree': 0, 'gamma': 0.9, 'coef0': 2},
                         {'kernel': 'sigmoid'}, {'kernel': 'sigmoid', 'gamma': 1.3}, {'kernel': 'sigmoid', 'coef0': 0.8},
                         {'kernel': 'sigmoid', 'coef0': 0.8, 'gamma': 0.5}
                         ]
    non_kernel_parameters = [{}, {'nu': 0.75}, {'nu': 0.25, 'shrinking': True}, {'shrinking': False}]

    # Generate some random data
    x, y = [], []
    random.seed(42)
    for _ in range(50):
        x.append([random.gauss(200, 30), random.gauss(-100, 22), random.gauss(100, 42)])
        y.append(random.choice(class_labels))
    column_names = ['x1', 'x2', 'x3']
    # make sure first label is seen first, second is seen second, and so on.
    for i, val in enumerate(class_labels):
        y[i] = val
    df = pd.DataFrame(x, columns=column_names)

    # Test
    for param1 in non_kernel_parameters:
        for param2 in kernel_parameters:
            cur_params = param1.copy()
            cur_params.update(param2)
            cur_params['probability'] = use_probability_estimates
            cur_params['max_iter'] = 10  # Don't want test to take too long
            # print("cur_params=" + str(cur_params))

            cur_model = NuSVC(**cur_params)
            cur_model.fit(x, y)

            spec = scikit_converter.convert(cur_model, column_names, 'target')

            if use_probability_estimates:
                probability_lists = cur_model.predict_proba(x)
                df['classProbability'] = [dict(zip(cur_model.classes_, cur_vals)) for cur_vals in probability_lists]
                metrics = evaluate_classifier_with_probabilities(spec, df, probabilities='classProbability')
                self.assertEquals(metrics['num_key_mismatch'], 0)
                self.assertLess(metrics['max_probability_error'], allowed_prob_delta)
            else:
                df['prediction'] = cur_model.predict(x)
                metrics = evaluate_classifier(spec, df, verbose=False)
                self.assertEquals(metrics['num_errors'], 0)

            if not allow_slow:
                break

        if not allow_slow:
            break
def trainModel(xtrain, xtest, ytrain, ytest):
    classifiers = [
        # KNeighborsClassifier(3),
        # SVC(kernel="linear", probability=True),
        # NuSVC(probability=True),
        # DecisionTreeClassifier(),
        RandomForestClassifier(),
        # AdaBoostClassifier(),
        # GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200,
        #                            subsample=1.0, criterion='friedman_mse', min_samples_split=2,
        #                            min_samples_leaf=1, min_weight_fraction_leaf=0.,
        #                            max_depth=5),
        # GradientBoostingClassifier(),
        # GaussianNB(),
        # LinearDiscriminantAnalysis(),
        # QuadraticDiscriminantAnalysis()
    ]
    log_cols = ["Classifier", "Accuracy", "Log Loss"]
    log = pd.DataFrame(columns=log_cols)
    for clf in classifiers:
        clf.fit(xtrain, ytrain)
        name = clf.__class__.__name__
        print("=" * 30)
        print(name)
        print('****Results****')
        train_predictions = clf.predict(xtest)
        # acc = accuracy_score(ytest, train_predictions)
        # print("Accuracy: {:.4%}".format(acc))
        train_prob_predictions = clf.predict_proba(xtest)
        ll = log_loss(ytest, train_prob_predictions)
        print("Log Loss: {}".format(ll))
        # printResult(ytest, train_predictions)
        # result.printMultiResult(ytest, train_predictions)
        save_path = "doc/result.txt"
        desc = "sentiment by tfidf "
        result_str = result.printMultiResult(ytest, train_predictions)
        result.saveResult(save_path, desc, result_str)
        #
        # log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
        # log = log.append(log_entry)
    print("=" * 30)