def classification_rbf_svm(self):
    self.signals.PrintInfo.emit("RBF SVM")
    output_dir = self.output_dir + 'rbf_svm_out/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    vectorizer = HashingVectorizer()
    fdata = vectorizer.fit_transform(self.fdata)
    trainingSet = fdata[:self.split]
    testSet = fdata[self.split:]
    classificator = SVC(gamma=2, probability=True, C=self.rbf_svm_c)
    classificator.fit(trainingSet, self.trainingClass)
    results = classificator.predict(testSet)
    proba = classificator.predict_proba(testSet)
    self.write_results_to_file(output_dir + 'results.csv', results, proba, classificator.classes_, self.test_filenames)
    out_text = self.compile_result_string(results, proba, classificator.classes_, self.test_filenames)
    self.signals.PrintInfo.emit(out_text)
Python `SVC` usage examples (source code)
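All of the snippets on this page share the same scikit-learn pattern: construct an `SVC`, `fit` it on training data, then `predict` (and optionally `predict_proba`) on held-out data. A minimal self-contained sketch of that pattern, using synthetic data rather than any project's inputs:

import numpy as np
from sklearn.svm import SVC

# Synthetic two-class data: 100 samples, 5 features.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Fit on the first 80 samples, evaluate on the remaining 20.
clf = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True)
clf.fit(X[:80], y[:80])
print(clf.predict(X[80:]))        # hard labels
print(clf.predict_proba(X[80:]))  # class probabilities (column order: clf.classes_)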
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Logistic:')
    # print(classification_report(self.testLabel, pred_labels))
    self.classifier = SVC()
    self.classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
    # print('SVM:')
    # print(classification_report(self.testLabel, pred_labels))
    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Decision Tree:')
    # print(classification_report(self.testLabel, pred_labels))
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
    return pred_labels
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('Logistic:')
    print(classification_report(self.testLabel, pred_labels))
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('SVM:')
    print(classification_report(self.testLabel, pred_labels))
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('Logistic:')
    print(classification_report(self.testLabel, pred_labels))
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('SVM:')
    print(classification_report(self.testLabel, pred_labels))
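The two methods above share the same gensim preprocessing: tokenized documents go through `doc2bow`, then `corpus2dense` turns the sparse bag-of-words corpus into a dense (documents x terms) matrix that scikit-learn can consume. A minimal sketch of just that pipeline, on toy documents:

from gensim import corpora, matutils

# Toy tokenized corpus (each document is a list of tokens).
docs = [['cat', 'sat', 'mat'], ['dog', 'sat', 'log'], ['cat', 'dog']]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]

# corpus2dense returns (num_terms, num_docs); transpose to (docs, terms)
# so that rows line up with per-document labels.
matrix = matutils.corpus2dense(bow, num_terms=len(dictionary.token2id)).T
print(matrix.shape)  # (3, 5): 3 documents, 5 vocabulary terms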
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Logistic:')
    # print(classification_report(self.testLabel, pred_labels))
    pred_labels = {}
    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    for user in self.testDict:
        pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
    # print('SVM:')
    # print(classification_report(self.testLabel, pred_labels))
    return pred_labels
    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Decision Tree:')
    # print(classification_report(self.testLabel, pred_labels))
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
def buildModel(dataset, method, parameters):
    """
    Build the final model for predicting the real testing data.
    """
    features = dataset.columns[0:-1]
    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf
    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()
    else:
        raise ValueError('Unknown method: %s' % method)
    return clf.fit(dataset[features], dataset['UpDown'])
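A hedged usage sketch: `buildModel` assumes a DataFrame whose feature columns come first and whose binary target column, named `UpDown`, comes last. The DataFrame below is synthetic, and `parameters=[C, gamma]` for the SVM branch is inferred from the function body:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=['f1', 'f2', 'f3', 'f4'])
df['UpDown'] = (df['f1'] + df['f2'] > 0).astype(int)  # target must be the last column

model = buildModel(df, 'SVM', parameters=[1.0, 0.1])  # C=1.0, gamma=0.1
print(model.predict(df[df.columns[0:-1]].iloc[:5]))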
def Training_model():
    # Load the term-count matrix (one document per row)
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()  # skip the header line
    data = np.loadtxt(f)
    # Load the class labels
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()
    # TF-IDF weighting of the raw counts
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()
    # Train the SVM classifier
    clf = svm.SVC()
    clf.fit(data1, leibie)  # train the SVC model
    return clf
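Note that `Training_model` fits its `TfidfTransformer` internally but returns only the classifier, so new documents must be TF-IDF weighted the same way before calling `predict`. A hedged sketch (the file path below is hypothetical; the count-matrix layout is assumed to match `data_count.txt`):

clf = Training_model()

# Hypothetical new data: raw term counts with the same column layout as the
# training counts. Refitting a fresh TfidfTransformer is only equivalent if
# the corpus statistics match; ideally Training_model would return its
# fitted transformer alongside the classifier.
new_counts = np.loadtxt("f://emotion/mysite/weibo_emotion/emotion_file/new_counts.txt")
new_tfidf = TfidfTransformer().fit_transform(new_counts).toarray()
print(clf.predict(new_tfidf))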
def train():
    training_set = []
    training_labels = []
    os.chdir("/Users/muyunyan/Desktop/EC500FINAL/logo/")
    counter = 0
    a = os.listdir(".")
    for i in a:
        os.chdir(i)
        print(i)
        for d in os.listdir("."):
            img = cv2.imread(d)
            res = cv2.resize(img, (250, 250))
            gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
            xarr = np.squeeze(np.array(gray_image).astype(np.float32))
            # Newer OpenCV versions require the mean argument explicitly.
            m, v = cv2.PCACompute(xarr, mean=None)
            arr = np.array(v)
            flat_arr = arr.ravel()
            training_set.append(flat_arr)
            training_labels.append(i)
        os.chdir("..")
    trainData = training_set
    responses = training_labels
    # Renamed from `svm` to avoid shadowing the sklearn `svm` module.
    classifier = svm.SVC()
    classifier.fit(trainData, responses)
    return classifier
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        # Modern sklearn API: the splitter takes no labels at construction.
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_
        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print(len(x), q, sm.accuracy_score(test_y, pred_y))
def grid_retrain_in_x(self):
    gamma_range = np.logspace(-15, 3, 19, base=2)
    param_grid = dict(gamma=gamma_range)
    if len(np.unique(self.y_ex)) < 2:
        return 1, 1
    try:
        cv = StratifiedShuffleSplit(n_splits=5, test_size=.2)
        grid = GridSearchCV(SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
        grid.fit(self.X_ex, self.y_ex)
        rbf_svc2 = grid.best_estimator_
    except ValueError:
        rbf_svc2 = SVC(C=1e5)
        rbf_svc2.fit(self.X_ex, self.y_ex)
    self.set_clf2(rbf_svc2)
    return self.benchmark()
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)
    logger.info('start grid search for poly kernel')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for poly kernel')
    # `grid_scores_` was the pre-0.18 attribute; `cv_results_` replaces it.
    scores = grid.cv_results_['mean_test_score']
    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)
    r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def fit_model(X, y):
    classifier = svm.SVC()
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'], 'degree': [1, 2, 3], 'C': [0.1, 1, 10]}
    f1_scorer = make_scorer(performance_metric, greater_is_better=True)
    clf = GridSearchCV(classifier, param_grid=parameters, scoring=f1_scorer)
    clf.fit(X, y)
    return clf
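`performance_metric` is a project-defined scorer that is not shown on this page; a plausible stand-in (this exact definition is an assumption) wraps the F1 score:

from sklearn.metrics import f1_score

def performance_metric(y_true, y_pred):
    # Assumed: binary labels with the positive class encoded as 1.
    return f1_score(y_true, y_pred, pos_label=1)

# clf = fit_model(X_train, y_train)
# best = clf.best_estimator_  # tuned SVC after the grid search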
# Read student data
def create_model(self, training_articles):
    model = OneVsRestClassifier(svm.SVC(probability=True))
    features = []
    labels = []
    i = 0
    for article in training_articles:
        print("Generating features for article " + str(i) + "...")
        google_cloud_response = self.analyze_text_google_cloud(article["article"])
        relevant_entities = self.get_relevant_entities(google_cloud_response["entities"], article["market"]["entities"], article["market"]["wikipedia_urls"])
        # Only count this article if a relevant entity is present
        if relevant_entities:
            article_features = self.article_features(relevant_entities, article["market"], google_cloud_response, article["article"])
            features.append(article_features)
            labels.append(article["label"])
        else:
            print("Skipping article " + str(i) + "...")
        i += 1
    print("Performing feature scaling...")
    scaler = preprocessing.StandardScaler().fit(features)
    features_scaled = scaler.transform(features)
    print("Fitting model...")
    model.fit(features_scaled, labels)
    print("Saving model...")
    joblib.dump(scaler, "data_analysis/scaler.pkl")
    joblib.dump(model, "data_analysis/model.pkl")
    print("Done!")
# For use in prod
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Compare multiple classifiers and return the predictions of the selected one."""
    utils.print_success("Comparison of different classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def classify(n=50):
    # clf = MultinomialNB(fit_prior=False)
    # clf = SVC(gamma=2, C=1, class_weight={0.0: 0.063829777, 1.0: 1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight={0.0: 0.022, 1.0: 1.0})
    clf.fit(mat[:n], rel[:n])
    return clf
def baseline_svm():
    train_data = pd.read_csv(r"data/train.csv")
    print(u"Training data info:")
    train_data.info()
    print(u"Training data summary:")
    print(train_data.describe())
    # display_data(train_data)  # visualize the raw data
    # display_with_process(train_data)  # visualize the data after preprocessing
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # select the model features
    train_np = train_data.values  # convert to a numpy array (`as_matrix` is deprecated)
    # Train the model
    X = train_np[:, 1:]
    y = train_np[:, 0]
    model = svm.SVC(C=1.0, tol=1e-6).fit(X, y)
    # print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))
    # Predict on the test set
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.values
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].values, 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_svm_result/prediction.csv', index=False)
# Baseline leaderboard score — 0.76077
def baseline_svm_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # select the model features
    train_np = train_data.values  # convert to a numpy array
    # Train the model
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = svm.SVC(kernel='rbf', tol=1e-6).fit(X_train, y_train)
    # print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))
    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.values
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print(np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0]))
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    # error_items = error_items.reset_index(drop=True)
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)
    # Test-set prediction (left commented out):
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.values
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].values, 'Survived': predict.astype(np.int32)})
    result.to_csv(r'svm_result/prediction.csv', index=False)'''
# Baseline cross-validation results
def svc_model(self):
    model = svm.SVC(probability=True, C=0.3, kernel='linear')
    return model
Source: load_feature.py — project EmotiW-2017-Audio-video-Emotion-Recognition (author: xujinchang)
def use_SVM(X_data, y_data):
    p_gamma = 0.1
    p_C = 10
    svm = SVC(kernel='rbf', random_state=0, gamma=p_gamma, C=p_C, probability=True)
    svm.fit(X_data, y_data)
    joblib.dump(svm, "./sklearn_model/svm_trainval1_{param1}_{param2}".format(param1=p_gamma, param2=p_C))
    return svm
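A hedged round-trip sketch: reload the persisted estimator and score new data. The `joblib` import location depends on the scikit-learn version, and `X_new` is a placeholder for feature rows with the training layout:

import joblib  # on older scikit-learn: from sklearn.externals import joblib

model = joblib.load("./sklearn_model/svm_trainval1_0.1_10")
# X_new: array-like of shape (n_samples, n_features), same layout as X_data
# probs = model.predict_proba(X_new)  # available because probability=True
# labels = model.predict(X_new)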
def classifier_train(feature_matrix_0, feature_matrix_1, algorithm='SVM'):
    """
    Trains a binary classifier using the SVM algorithm.
    Arguments
        feature_matrix_0: matrix with examples for class 0
        feature_matrix_1: matrix with examples for class 1
        algorithm: currently only SVM is supported
    Outputs
        classifier: trained classifier (scikit-learn object)
        mu_ft, std_ft: normalization parameters for the data
    """
    # Create vector y (class labels)
    class0 = np.zeros((feature_matrix_0.shape[0], 1))
    class1 = np.ones((feature_matrix_1.shape[0], 1))
    # Concatenate feature matrices and their respective labels
    y = np.concatenate((class0, class1), axis=0)
    features_all = np.concatenate((feature_matrix_0, feature_matrix_1), axis=0)
    # Normalize inputs
    mu_ft = np.mean(features_all)
    std_ft = np.std(features_all)
    X = (features_all - mu_ft) / std_ft
    # Train SVM, using default parameters; ravel() flattens y to the
    # 1-D shape scikit-learn expects.
    classifier = svm.SVC()
    classifier.fit(X, y.ravel())
    return classifier, mu_ft, std_ft
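The returned `mu_ft` and `std_ft` must be reapplied to any new data before prediction, since the classifier was trained on normalized inputs. A minimal sketch on synthetic matrices:

import numpy as np

# Synthetic example: 20 class-0 and 20 class-1 training rows, 5 features,
# plus a small unlabeled batch with the same layout.
rng = np.random.default_rng(42)
feat0 = rng.normal(0.0, 1.0, size=(20, 5))
feat1 = rng.normal(1.0, 1.0, size=(20, 5))
new_data = rng.normal(0.5, 1.0, size=(4, 5))

clf, mu_ft, std_ft = classifier_train(feat0, feat1)
# Apply the *training* normalization before predicting.
print(clf.predict((new_data - mu_ft) / std_ft))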