def __init__(self, filename, target_map, classifier='svm'):
    self.seed_ = 0
    self.filename_ = filename
    self.target_map_ = target_map
    # list() is required on Python 3, where dict.keys() returns a view
    self.target_ids_ = np.unique(list(target_map.keys())).astype(np.int32)
    self.epoch_no_ = 0
    self.st_time_ = time.time()

    # Set up the classifier and its hyper-parameter grid
    print('-------------------------------')
    print('====> Building Classifier, setting class weights')
    if classifier == 'svm':
        self.clf_hyparams_ = {'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'class_weight': ['balanced']}
        self.clf_base_ = LinearSVC(random_state=self.seed_)
    elif classifier == 'sgd':
        # 'auto' was renamed to 'balanced' in scikit-learn 0.17
        self.clf_hyparams_ = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0], 'class_weight': ['balanced']}  # 'loss':['hinge'],
        # note: newer scikit-learn renames n_iter -> max_iter and loss='log' -> 'log_loss'
        self.clf_ = SGDClassifier(loss='log', penalty='l2', shuffle=False, random_state=self.seed_,
                                  warm_start=True, n_jobs=-1, n_iter=1, verbose=4)
    else:
        raise Exception('Unknown classifier type %s. Choose from [sgd, svm, gradient-boosting, extra-trees]'
                        % classifier)
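The 'svm' grid above is presumably consumed by a grid search at training time; a self-contained sketch of that step on toy data (grid values copied from above, the toy data stands in for the real features):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'class_weight': ['balanced']}
gs = GridSearchCV(LinearSVC(random_state=0), grid, cv=3, n_jobs=-1)
gs.fit(X, y)
print(gs.best_params_)    # e.g. {'C': 0.1, 'class_weight': 'balanced'}
clf = gs.best_estimator_  # refit on the full training data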
Python LinearSVC() usage examples (source code)
def __init__(self, cls, dim, feature_scale=1.0,
             C=0.001, B=10.0, pos_weight=2.0):
    self.pos = np.zeros((0, dim), dtype=np.float32)
    self.neg = np.zeros((0, dim), dtype=np.float32)
    self.B = B
    self.C = C
    self.cls = cls
    self.pos_weight = pos_weight
    self.dim = dim
    self.feature_scale = feature_scale
    # note: the positive-class weight is hard-coded here rather than taken
    # from pos_weight; loss='hinge' is the modern spelling of the legacy 'l1'
    self.svm = svm.LinearSVC(C=C, class_weight={1: 2, -1: 1},
                             intercept_scaling=B, verbose=1,
                             penalty='l2', loss='hinge',
                             random_state=cfg.RNG_SEED, dual=True)
    self.pos_cur = 0
    self.num_neg_added = 0
    self.retrain_limit = 2000
    self.evict_thresh = -1.1
    self.loss_history = []
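Outside the detection pipeline, the same LinearSVC configuration can be exercised on toy data; a minimal sketch (labels mapped to {-1, +1} to match the class_weight keys above):

from sklearn import svm
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
y = 2 * y - 1  # map {0, 1} -> {-1, +1}
clf = svm.LinearSVC(C=0.001, class_weight={1: 2, -1: 1},
                    intercept_scaling=10.0, penalty='l2', loss='hinge',
                    random_state=0, dual=True)
clf.fit(X, y)
scores = clf.decision_function(X)  # the kind of score compared against evict_thresh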
From figure.classification.vs.regression.py — project: microbiome-summer-school-2017, author: aldro61
def make_classification_example(axis, random_state):
    X, y = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=2.7, random_state=random_state)
    axis.scatter(X[y == 0, 0], X[y == 0, 1], color="red", s=10, label="Disease")
    axis.scatter(X[y == 1, 0], X[y == 1, 1], color="blue", s=10, label="Healthy")
    clf = LinearSVC().fit(X, y)
    # get the separating hyperplane
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-5, 7)
    yy = a * xx - (clf.intercept_[0]) / w[1]
    # plot the separating line ('k-' already draws it in black)
    axis.plot(xx, yy, 'k-', label="Model")
    # the original referenced a global `ax1`; use the `axis` argument instead
    axis.tick_params(labelbottom='off', labelleft='off')
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Gene 2")
    axis.legend()
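A minimal driver for the plotting helper above (figure size and seed are arbitrary choices):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5, 4))
make_classification_example(ax, random_state=42)
plt.show()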
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
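define_model resolves a name to a template estimator and then overrides its defaults with set_params; a stand-alone sketch of that same pattern (toy data, illustrative parameter values):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
clf = RandomForestClassifier(n_estimators=50, n_jobs=7)  # template defaults
clf.set_params(n_estimators=100, max_depth=8)            # caller-supplied overrides
clf.fit(X, y)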
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)
    # sklearn.cross_validation was removed in 0.20; use model_selection instead
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                                                                        test_size=0.25)
    # clf = neighbors.KNeighborsClassifier()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    return confidence
# examples of running:
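The comment above is truncated in the source; a hypothetical invocation would look like this (the ticker symbol is a placeholder, not from the original):

confidence = do_ml('AAPL')  # hypothetical ticker; returns the test-set accuracy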
def convert(model, feature_names, target):
    """Convert a LinearSVC model to the protobuf spec.

    Parameters
    ----------
    model: LinearSVC
        A trained LinearSVC model.
    feature_names: [str]
        Names of the input columns.
    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')
    _sklearn_util.check_expected_type(model, _LinearSVC)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'coef_'))
    return _MLModel(_logistic_regression._convert(model, feature_names, target))
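A hedged end-to-end use of this converter, run in the converter module's context (feature and target names are arbitrary choices for this sketch):

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_features=3, n_informative=3,
                           n_redundant=0, random_state=0)
model = LinearSVC().fit(X, y)
spec = convert(model, ['x1', 'x2', 'x3'], 'target')  # names are arbitrary here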
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Initialize classifier.

    Args:
        a_clf (classifier or None): classifier to use, or None for the default
        a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS)
    self._gs = a_grid_search
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def __init__(self, isTrain, isOutlierRemoval=0):
    """
    The linear models ``LinearSVC()`` and ``SVC(kernel='linear')`` yield slightly
    different decision boundaries. This can be a consequence of the following
    differences:
    - ``LinearSVC`` minimizes the squared hinge loss while ``SVC`` minimizes the
      regular hinge loss.
    - ``LinearSVC`` uses the One-vs-All (also known as One-vs-Rest) multiclass
      reduction while ``SVC`` uses the One-vs-One multiclass reduction.
    :return:
    """
    super(ClassificationSVM, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()
    self.clf = svm.SVC()  # default SVM classifier
    C = 1.0  # SVM regularization parameter
    self.svc = svm.SVC(kernel='linear', C=C, max_iter=100000)
    self.rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
    self.poly_svc = svm.SVC(kernel='poly', coef0=1, degree=3, C=C)
    self.lin_svc = svm.LinearSVC(C=C)
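The docstring's two differences (squared hinge vs. hinge loss, one-vs-rest vs. one-vs-one) are visible directly on toy data; a minimal sketch:

from sklearn import svm
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150, centers=3, random_state=0)
lin = svm.LinearSVC(C=1.0).fit(X, y)             # squared hinge, one-vs-rest
ker = svm.SVC(kernel='linear', C=1.0).fit(X, y)  # hinge, one-vs-one
print(lin.coef_.shape)      # (3, 2): one hyperplane per class (OvR)
print(len(ker.dual_coef_))  # 2 = n_classes - 1 rows: the OvO structure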
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
                             learning_rate_init=0.1, max_iter=5000, random_state=1,
                             solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
                             warm_start=False)
    return 0  # fallback for an unrecognized algorithm name
def test_classes__property():
    # Test that classes_ property matches best_estimator_.classes_
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    Cs = [.1, 1, 10]
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {'C': Cs})
    grid_search.fit(X, y)
    assert_array_equal(grid_search.best_estimator_.classes_,
                       grid_search.classes_)

    # Test that regressors do not have a classes_ attribute
    grid_search = dcv.GridSearchCV(Ridge(), {'alpha': [1.0, 2.0]})
    grid_search.fit(X, y)
    assert not hasattr(grid_search, 'classes_')

    # Test that the grid searcher has no classes_ attribute before it's fit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {'C': Cs})
    assert not hasattr(grid_search, 'classes_')

    # Test that the grid searcher has no classes_ attribute without a refit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0),
                                   {'C': Cs}, refit=False)
    grid_search.fit(X, y)
    assert not hasattr(grid_search, 'classes_')
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert C == C2
def train_pumil_clf(bags, pidx, uidx, w, NL, learning_phase=False):
    # top-{NL} reliable negative bags
    relnidx = reliable_negative_bag_idx(bags, uidx, w, NL)
    Bn = [bags[j] for j in relnidx]
    # estimated p(X|Y=-1) via weighted kernel density estimation (WKDE)
    Dn = weighted_kde(Bn, w[relnidx])
    # form the Positive Margin Pool (PMP)
    pmp_x, pmp_y, pmp_conf = form_pmp(bags, w, pidx, relnidx, Dn)
    # train an SVM on confidence-weighted PMP instances
    pmp_weighted_x = np.multiply(pmp_x.T, pmp_conf).T
    clf = svm.LinearSVC(loss='hinge')
    clf.fit(pmp_weighted_x, pmp_y)
    clf_ = pumil_clf_wrapper(lambda x: float(clf.decision_function(x)), Dn, learning_phase)
    if learning_phase:
        return clf_, relnidx
    else:
        return clf_
def _conversion_and_evaluation_helper_for_linear_svc(self, class_labels):
    ARGS = [{},
            {'C': .75, 'loss': 'hinge'},
            {'penalty': 'l1', 'dual': False},
            {'tol': 0.001, 'fit_intercept': False},
            {'intercept_scaling': 1.5}]

    x, y = GlmCassifierTest._generate_random_data(class_labels)
    column_names = ['x1', 'x2']
    df = pd.DataFrame(x, columns=column_names)

    for cur_args in ARGS:
        print(class_labels, cur_args)
        cur_model = LinearSVC(**cur_args)
        cur_model.fit(x, y)

        spec = convert(cur_model, input_features=column_names,
                       output_feature_names='target')

        df['prediction'] = cur_model.predict(x)
        cur_eval_metics = evaluate_classifier(spec, df, verbose=False)
        self.assertEquals(cur_eval_metics['num_errors'], 0)
def svm_experiment(scope_name, X, y):
    for lp in lp_cand:
        results = []
        for r in range(50):
            # open pickles in binary mode (required on Python 3)
            with open('data/local/split/' + scope_name + '/lb' + str(lp).zfill(3) + '_' + str(r).zfill(3) + '_train', 'rb') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' + str(lp).zfill(3) + '_' + str(r).zfill(3) + '_test', 'rb') as f:
                testLabel = pk.load(f)

            XTrain = X[trainLabel.keys()]
            XTest = X[testLabel.keys()]
            yTrain = y[trainLabel.keys()]
            yTest = y[testLabel.keys()]

            # train
            clf = LinearSVC(C=0.01)
            clf.fit(XTrain, yTrain)

            # test
            pred = clf.predict(XTest)
            results.append(sum(pred == yTest) / float(yTest.shape[0]))
    return np.mean(results)
def fit(self, x, y):
    # Convert non-binary features to binary
    bin_x = tfidf_to_counts(x)

    # Calculate the log-count ratio
    X_pos = bin_x[np.where(y == 1)]
    X_neg = bin_x[np.where(y == 0)]
    self.r = log_count_ratio(X_pos, X_neg)
    X = np.multiply(self.r, bin_x)

    # Train a linear SVM on the NB-scaled features (no interpolation yet)
    svm = LinearSVC(C=self.C)
    svm.fit(X, y)
    self.coef_ = svm.coef_
    self.int_coef_ = interpolate(self.coef_, self.beta)
    self.bias = svm.intercept_

# Scores the interpolated model (the method this comment introduces is truncated in this excerpt)
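The helpers tfidf_to_counts, log_count_ratio, and interpolate are defined elsewhere in the project; the sketches below are assumptions consistent with the NBSVM formulation (Wang & Manning, 2012), not the project's real code:

import numpy as np

def tfidf_to_counts(x):
    # Binarize: any nonzero tf-idf weight counts as a feature occurrence.
    return (x > 0).astype(np.float64)

def log_count_ratio(X_pos, X_neg, alpha=1.0):
    # r = log((p / ||p||_1) / (q / ||q||_1)) with add-alpha smoothing.
    p = alpha + X_pos.sum(axis=0)
    q = alpha + X_neg.sum(axis=0)
    return np.log((p / p.sum()) / (q / q.sum()))

def interpolate(coef, beta):
    # NBSVM interpolation: w' = (1 - beta) * mean(|w|) + beta * w.
    return (1 - beta) * np.abs(coef).mean() + beta * coef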
def init_model():
    # question-trunk features
    f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)

    # Word2Vec features
    f_word2vec = Question2VecVectorizer(tokenizer=tokenize)

    # combined features (400 dimensions)
    union_features = FeatureUnion([
        ('f_trunk_lsa', Pipeline([
            ('trunk', f_trunk),
            # dimensionality reduction: latent semantic analysis (LSA)
            ('lsa', TruncatedSVD(n_components=200, n_iter=10))
        ])),
        ('f_word2vec', f_word2vec),
    ])

    model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
    return model
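A hedged driver for the pipeline factory above (the question/label variables are placeholders for the project's training corpus):

model = init_model()
model.fit(train_questions, train_labels)  # placeholders for the real corpus
pred = model.predict(test_questions)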
def grid_retrain_in_f(self, n_dim=500):
    # RBFSampler's first positional argument is gamma, so the target
    # dimension must be passed explicitly as n_components
    rbf_map = RBFSampler(n_components=n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_
    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
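With the grid search commented out, gamma stays at RBFSampler's default of 1.0; a stand-alone sketch of the same kernel-approximation pipeline with an explicit gamma (the values here are illustrative, not tuned):

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
approx = Pipeline([("mapper", RBFSampler(gamma=0.1, n_components=500, random_state=1)),
                   ("svm", LinearSVC(C=1.0))])
approx.fit(X, y)
print(approx.score(X, y))  # training accuracy of the approximated RBF-SVM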