import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm


def get_model_class(method):
    """
    Returns the class associated with a method string.

    :param method: A string describing the method to use.
    :return: A class corresponding to the method.
    """
    if method == 'logistic':
        return sklearn.linear_model.LogisticRegression
    elif method == 'svm':
        return sklearn.svm.SVC
    elif method == 'mirowski-svm':
        return sklearn.svm.SVC
    elif method == 'sgd':
        return sklearn.linear_model.SGDClassifier
    elif method == 'random-forest':
        return sklearn.ensemble.RandomForestClassifier
    elif method == 'nearest-centroid':
        return sklearn.neighbors.NearestCentroid
    elif method == 'knn':
        return sklearn.neighbors.KNeighborsClassifier
    elif method == 'bagging':
        return sklearn.ensemble.BaggingClassifier
    else:
        raise NotImplementedError("Method {} is not supported".format(method))
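A quick usage sketch (the toy data is hypothetical; the method strings are exactly those handled above):

import numpy as np

X = np.random.randn(50, 4)
y = np.random.randint(0, 2, size=50)

ModelClass = get_model_class('random-forest')   # look up the estimator class
clf = ModelClass().fit(X, y)                    # instantiate with defaults and train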
import argparse
import logging

import joblib
import pandas as pd
import sklearn.svm
from sklearn.linear_model import LogisticRegression

# Project-local pieces assumed importable from the surrounding repo:
# baskets.time_me (timing helpers, e.g. from baskets.time_me import time_me),
# vectorize_fold and METAVECTORS_PICKLEPATH.


def main():
    baskets.time_me.set_default_mode('print')
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument('-f', '--train-fold', default='train')
    parser.add_argument('--validation-fold', help='Fold for validation (default: None)')
    parser.add_argument('--no-metafeats', action='store_true')
    parser.add_argument('--svm', action='store_true')
    args = parser.parse_args()
    with time_me("Loaded metavectors"):
        meta_df = pd.read_pickle(METAVECTORS_PICKLEPATH)
    with time_me("Made training vectors"):
        X, y = vectorize_fold(args.train_fold, args.tags, meta_df,
                              use_metafeats=not args.no_metafeats)
    if args.svm:
        # SVC is slow here: the sklearn docs note that it is hard to scale to
        # datasets with more than ~20k examples, hence LinearSVC below.
        # model = sklearn.svm.SVC(verbose=True, probability=True, C=1.0)
        model = sklearn.svm.LinearSVC(penalty='l2', loss='hinge', C=.001, verbose=1)
    else:
        # TODO: tune C
        model = LogisticRegression(verbose=1)
    with time_me('Trained model', mode='print'):
        model.fit(X, y)
    model_fname = 'model.pkl'
    joblib.dump(model, model_fname)
    # TODO: report accuracy on the validation fold
    return model
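The dumped model can be restored later with joblib; a minimal sketch (X_valid is a hypothetical feature matrix with the same layout as the training vectors):

import joblib

model = joblib.load('model.pkl')            # restore the fitted estimator
scores = model.decision_function(X_valid)   # margin scores for new examples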
from sklearn.svm import SVC


def train_svm(X, y, k):
    if k == 'linear':
        svm = SVC(C=1.0, kernel='linear')
    elif k == 'rbf':
        svm = SVC(C=1.0, kernel='rbf')
    else:
        raise ValueError("Unsupported kernel: {}".format(k))
    svm.fit(X, y)
    return svm
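A quick usage sketch on synthetic data (illustrative only):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = (X[:, 0] > 0).astype(int)

clf = train_svm(X, y, 'rbf')   # returns a fitted sklearn.svm.SVC
print(clf.score(X, y))         # accuracy on the training data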
from copy import deepcopy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# Methods of a classifier wrapper class; decompose and platt_scale are
# project-local helpers assumed in scope.


def __init__(self, n_features=100, transform=True, classifier='lsvm', kernel='rbf', n_neighbors=5):
    self.n_features = n_features
    self.transform = transform
    self.clf_type = classifier
    if classifier == 'lsvm':
        self.clf = LinearSVC()
    elif classifier == 'svm':
        self.clf = SVC(kernel=kernel, probability=True)
    elif classifier == 'knn':
        self.clf = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute', metric='cosine')

def predict_proba(self, x, y, normalize=False, flip=False):
    X = deepcopy(x)
    if self.transform:
        # decompose is a project-local dimensionality-reduction helper
        X = decompose(X, self.n_features, normalize, flip)
    if self.clf_type == 'svm':
        return self.clf.predict_proba(X)
    elif self.clf_type in ['lsvm', 'nbsvm']:
        # LinearSVC has no predict_proba; platt_scale fits a sigmoid on the
        # decision scores to produce calibrated probabilities
        return platt_scale(X, y, self.clf)
    elif self.clf_type == 'knn':
        return self.clf.predict_proba(X)
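platt_scale itself is not shown in this snippet; a minimal sketch of the same idea using sklearn's built-in sigmoid calibration (the helper's exact signature above is an assumption):

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC


def platt_scale_sketch(X, y, base_clf=None):
    # Hypothetical stand-in: wrap a margin classifier in a sigmoid calibrator
    # so it exposes predict_proba, which LinearSVC lacks.
    if base_clf is None:
        base_clf = LinearSVC()
    clf = CalibratedClassifierCV(base_clf, method='sigmoid', cv=3)
    clf.fit(X, y)
    return clf.predict_proba(X)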
import os
import sys
from subprocess import PIPE, Popen

# basedir, relations and get_svm_train_line are project-local names assumed
# in scope; basedir is expected to point at the ddi_models/ directory.


def trainSVMTK(docs, pairs, dditype, model="svm_tk_classifier.model", excludesentences=[]):
    if os.path.isfile("ddi_models/" + model):
        os.remove("ddi_models/" + model)
    if os.path.isfile("ddi_models/" + model + ".txt"):
        os.remove("ddi_models/" + model + ".txt")
    # docs = use_external_data(docs, excludesentences, dditype)
    with open("ddi_models/" + model + ".txt", 'w') as train:
        for p in pairs:
            if dditype != "all" and pairs[p][relations.PAIR_DDI] and pairs[p][relations.PAIR_TYPE] != dditype:
                continue
            sid = relations.getSentenceID(p)
            if sid not in excludesentences:
                tree = pairs[p][relations.PAIR_DEP_TREE][:]
                line = get_svm_train_line(tree, pairs[p], sid,
                                          docs[sid][relations.SENTENCE_PAIRS][p])
                # negate the label for non-DDI pairs and for pairs of another type
                if not pairs[p][relations.PAIR_DDI]:
                    line = '-' + line
                elif pairs[p][relations.PAIR_TYPE] != dditype and dditype != "all":
                    line = '-' + line
                train.write(line)
    # train with the SVM-light-TK tree-kernel learner
    svmlightcall = Popen(["./svm-light-TK-1.2/svm-light-TK-1.2.1/svm_learn", "-t", "5",
                          "-L", "0.4", "-T", "2", "-S", "2", "-g", "10",
                          "-D", "0", "-C", "T", basedir + model + ".txt", basedir + model],
                         stdout=PIPE, stderr=PIPE)
    res = svmlightcall.communicate()
    if not os.path.isfile("ddi_models/" + model):
        print("failed training model " + basedir + model)
        print(res)
        sys.exit()
import os

# trainerFactory is a project-local factory assumed in scope.


def train(method='svm', savePath=None, choice='basic'):
    if not savePath:
        savePath = os.path.abspath('./support/clf/' + method + '.pk')
    else:
        savePath = os.path.abspath(savePath)
    trainer = trainerFactory(method, choice, savePath)
    if not trainer:
        print('No such method to train for now')
        return
    trainer.train()
    trainer.save()
    print('Training complete')
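Called with defaults this persists a pickled classifier under ./support/clf/; the set of supported method strings depends on trainerFactory, so the second call below is hypothetical:

train()                        # train the default 'svm' method, save to ./support/clf/svm.pk
train('knn', './my_knn.pk')    # hypothetical: another method string and an explicit path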
# logger and MAX_CV_FOLDS are module-level names assumed in scope.
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig, **Any) -> None
    """Train the intent classifier on a data set.

    The number of training threads is taken from config["num_threads"]."""
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    import numpy as np

    labels = [e.get("intent") for e in training_data.intent_examples]
    if len(set(labels)) < 2:
        logger.warning("Can not train an intent classifier. Need at least 2 different classes. "
                       "Skipping training of intent classifier.")
    else:
        y = self.transform_labels_str2num(labels)
        X = np.stack([example.get("text_features") for example in training_data.intent_examples])
        sklearn_config = config.get("intent_classifier_sklearn")
        C = sklearn_config.get("C", [1, 2, 5, 10, 20, 100])
        kernel = sklearn_config.get("kernel", "linear")
        # cast to str because sklearn expects str, not an instance of basestring
        tuned_parameters = [{"C": C, "kernel": [str(kernel)]}]
        # aim for roughly 5 examples per fold, but never fewer than 2 folds
        cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5))
        self.clf = GridSearchCV(SVC(C=1, probability=True, class_weight='balanced'),
                                param_grid=tuned_parameters, n_jobs=config["num_threads"],
                                cv=cv_splits, scoring='f1_weighted', verbose=1)
        self.clf.fit(X, y)
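To make the fold arithmetic concrete, a hypothetical worked example (MAX_CV_FOLDS is assumed to be 5 for illustration):

import numpy as np

MAX_CV_FOLDS = 5                           # assumed value for illustration

y = np.array([0] * 23 + [1] * 7)           # rarest class has 7 examples
cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5))
print(cv_splits)                           # 7 // 5 = 1, clamped up to 2 folds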
import logging
import os
import sys
from subprocess import PIPE, Popen

# basedir, temp_dir, relations and get_svm_train_line are project-local
# names assumed in scope.


def testSVMTK(sentence, pairs, pairs_list, model="svm_tk_classifier.model", tag=""):
    if os.path.isfile(basedir + tag + "svm_test_data.txt"):
        os.remove(basedir + tag + "svm_test_data.txt")
    if os.path.isfile(basedir + tag + "svm_test_output.txt"):
        os.remove(basedir + tag + "svm_test_output.txt")
    total = 0
    with open(temp_dir + tag + "svm_test_data.txt", 'w') as test:
        for pid in pairs:
            sid = pairs[pid].sid
            tree = sentence.parsetree
            line = get_svm_train_line(tree, pairs[pid], sid)
            # write every example with a negative label; only the score matters here
            line = '-' + line
            test.write(line)
            total += 1
    svmtklightargs = ["./bin/svm-light-TK-1.2/svm-light-TK-1.2.1/svm_classify",
                      temp_dir + tag + "svm_test_data.txt", basedir + model,
                      temp_dir + tag + "svm_test_output.txt"]
    svmlightcall = Popen(svmtklightargs, stdout=PIPE, stderr=PIPE)
    res = svmlightcall.communicate()
    if not os.path.isfile(temp_dir + tag + "svm_test_output.txt"):
        print("something went wrong with SVM-light-TK")
        print(res)
        sys.exit()
    with open(temp_dir + tag + "svm_test_output.txt", 'r') as out:
        lines = out.readlines()
    if len(lines) != len(pairs_list):
        print("check " + tag + "svm_test_output.txt! something is wrong")
        print(res)
        sys.exit()
    for p, pid in enumerate(pairs):
        score = float(lines[p])
        if score < 0:
            pairs[pid].recognized_by[relations.SST_PRED] = -1
        else:
            pairs[pid].recognized_by[relations.SST_PRED] = 1
        logging.info("{} - {} SST: {}".format(pairs[pid].entities[0],
                                              pairs[pid].entities[1], score))
    return pairs
import optparse
import pickle

import numpy as np
import scipy.io as sio
from sklearn.cross_validation import KFold  # legacy pre-0.18 API: KFold(n, n_folds=...)

# config, train_SVM and measure_FoM are project-local helpers assumed in scope.


def main():
    parser = optparse.OptionParser("[!] usage: python cross_validate_SVM.py -F <data file>")
    parser.add_option("-F", dest="dataFile", type="string",
                      help="specify data file to analyse")
    (options, args) = parser.parse_args()
    dataFile = options.dataFile
    # the config supplies data_path, which is needed below to locate cached classifiers
    cfg = config.Config()
    data_path = cfg.paths['data']
    # TODO: remove -- only for testing
    if False:
        dataFile = data_path + cfg.paths['data_file_standard']
    if dataFile is None:
        print(parser.usage)
        exit(0)
    data = sio.loadmat(dataFile)
    X = data["X"]
    m, n = np.shape(X)
    y = np.squeeze(data["y"])
    kernel_grid = ["rbf"]
    C_grid = [5]
    gamma_grid = [1]
    kf = KFold(m, n_folds=5)
    for kernel in kernel_grid:
        for C in C_grid:
            for gamma in gamma_grid:
                fold = 1
                FoMs = []
                for train, test in kf:
                    print("[*]", fold, kernel, C, gamma)
                    file = data_path + "classifiers/cv/SVM_kernel" + str(kernel) + "_C" + str(C) + \
                           "_gamma" + str(gamma) + "_" + dataFile.split("/")[-1].split(".")[0] + \
                           "_fold" + str(fold) + ".pkl"
                    try:
                        # reuse a cached model for this fold if one exists
                        svm = pickle.load(open(file, "rb"))
                    except IOError:
                        train_x, train_y = X[train], y[train]
                        svm = train_SVM(train_x, train_y, kernel, C, gamma)
                        outputFile = open(file, "wb")
                        pickle.dump(svm, outputFile)
                    FoM, threshold = measure_FoM(X[test], y[test], svm, False)
                    fold += 1
                    FoMs.append(FoM)
                print("[+] mean FoM: %.3lf" % np.mean(np.array(FoMs)))
                print()
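KFold(m, n_folds=5) is the pre-0.18 cross_validation API. A minimal sketch of the drop-in replacement on the modern model_selection API (assuming sklearn >= 0.18, with X and y as loaded above):

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
for train_idx, test_idx in kf.split(X):   # yields index arrays, like the old iterator
    train_x, train_y = X[train_idx], y[train_idx]
    test_x, test_y = X[test_idx], y[test_idx]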
# An earlier variant of the same driver that caches models under cv/;
# train_SVM and measure_FoM are project-local helpers assumed in scope.
def main():
    parser = optparse.OptionParser("[!] usage: python cross_validate_SVM.py -F <data file>")
    parser.add_option("-F", dest="dataFile", type="string",
                      help="specify data file to analyse")
    (options, args) = parser.parse_args()
    dataFile = options.dataFile
    if dataFile is None:
        print(parser.usage)
        exit(0)
    data = sio.loadmat(dataFile)
    X = data["X"]
    m, n = np.shape(X)
    y = np.squeeze(data["y"])
    kernel_grid = ["rbf"]
    C_grid = [5]
    gamma_grid = [1]
    kf = KFold(m, n_folds=5)
    for kernel in kernel_grid:
        for C in C_grid:
            for gamma in gamma_grid:
                fold = 1
                FoMs = []
                for train, test in kf:
                    print("[*]", fold, kernel, C, gamma)
                    file = "cv/SVM_kernel" + str(kernel) + "_C" + str(C) + \
                           "_gamma" + str(gamma) + "_" + dataFile.split("/")[-1].split(".")[0] + \
                           "_fold" + str(fold) + ".pkl"
                    try:
                        svm = pickle.load(open(file, "rb"))
                    except IOError:
                        train_x, train_y = X[train], y[train]
                        svm = train_SVM(train_x, train_y, kernel, C, gamma)
                        outputFile = open(file, "wb")
                        pickle.dump(svm, outputFile)
                    FoM, threshold = measure_FoM(X[test], y[test], svm, False)
                    fold += 1
                    FoMs.append(FoM)
                print("[+] mean FoM: %.3lf" % np.mean(np.array(FoMs)))
                print()
import os

from sklearn import cross_validation, grid_search  # legacy modules, removed in 0.20
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC


def train_svm_classifer(features, labels, model_output_path):
    """
    Train an SVM classifier, save the trained model, and report
    classification performance.

    features: 2D array, one row of input features per sample
    labels: array of string labels classifying each sample
    model_output_path: path for storing the trained svm model
    """
    # hold out 20% of the data for performance evaluation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.2)
    param = [
        {
            "kernel": ["linear"],
            "C": [1, 10, 100, 1000]
        },
        {
            "kernel": ["rbf"],
            "C": [1, 10, 100, 1000],
            "gamma": [1e-2, 1e-3, 1e-4, 1e-5]
        }
    ]
    # request probability estimation
    svm = SVC(probability=True)
    # 10-fold cross-validation; n_jobs=20 lets folds and parameter sets train in parallel
    clf = grid_search.GridSearchCV(svm, param,
                                   cv=10, n_jobs=20, verbose=3)
    clf.fit(X_train, y_train)
    # save the best estimator if the target directory exists
    if os.path.exists(os.path.dirname(model_output_path)):
        joblib.dump(clf.best_estimator_, model_output_path)
    else:
        print("Cannot save trained svm model to {0}.".format(model_output_path))
    print("\nBest parameters set:")
    print(clf.best_params_)
    y_predict = clf.predict(X_test)
    labels = sorted(list(set(labels)))
    print("\nConfusion matrix:")
    print("Labels: {0}\n".format(",".join(labels)))
    print(confusion_matrix(y_test, y_predict, labels=labels))
    print("\nClassification report:")
    print(classification_report(y_test, y_predict))
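The cross_validation and grid_search modules used above were removed in scikit-learn 0.20; a minimal sketch of the same pipeline on the model_selection API (assuming sklearn >= 0.18, reusing features and labels from above):

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)
clf = GridSearchCV(SVC(probability=True),
                   [{"kernel": ["linear"], "C": [1, 10, 100, 1000]},
                    {"kernel": ["rbf"], "C": [1, 10, 100, 1000],
                     "gamma": [1e-2, 1e-3, 1e-4, 1e-5]}],
                   cv=10, n_jobs=-1, verbose=3)   # n_jobs=-1: use all cores
clf.fit(X_train, y_train)
print(clf.best_params_)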