def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'penalty': ['l1'],
                         'C': np.logspace(-5, 5)},
                        {'penalty': ['l2'],
                         'C': np.logspace(-5, 5)}]

    clf = GridSearchCV(linear_model.LogisticRegression(tol=1e-6), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
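Note that `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the equivalent reporting loop against `cv_results_`, assuming the same fitted `clf` as above:

# scikit-learn >= 0.18 equivalent of the grid_scores_ loop
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))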
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False):
    if use_tree:
        train_clf = Classifier(tree.DecisionTreeClassifier())
    else:
        train_clf = Classifier()
    print(train_clf.clf)
    print('')

    t_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    train_clf.learn(train_features, train_labels)
    t_end = time.perf_counter()

    if save_file:
        train_clf.save_to_file(open(save_file, 'w'))

    p_start = time.perf_counter()
    predicted = train_clf.clf.predict(test_features)
    p_end = time.perf_counter()

    test_labels_t = train_clf.labels.transform(test_labels)
    print(classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_))

    print('Training time: %fs' % (t_end - t_start))
    print('Predicting time: %fs' % (p_end - p_start))
    print('Mean squared error: %f' % mean_squared_error(test_labels_t, predicted))

    return train_clf.score(test_features, test_labels)
def multiclass_classifier(X_train, Y_train, X_val, Y_val, X_test, Y_test, nb_epoch=200, batch_size=10, seed=7):
    clf = softmax_network(X_train.shape[1], Y_train.shape[1])
    clf.fit(X_train, Y_train,
            epochs=nb_epoch,
            batch_size=batch_size,
            shuffle=True,
            validation_data=(X_val, Y_val),
            callbacks=[
                ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.01),
                EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=5, verbose=0, mode='auto'),
            ])
    acc = clf.test_on_batch(X_test, Y_test)[1]
    # confusion matrix and precision-recall
    true = np.argmax(Y_test, axis=1)
    pred = np.argmax(clf.predict(X_test), axis=1)
    print(confusion_matrix(true, pred))
    print(classification_report(true, pred))
    return acc
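`softmax_network` is not defined in this snippet. A minimal sketch of a plausible implementation, assuming a Keras Sequential model with a softmax output and accuracy as the second value returned by `test_on_batch` (the original project may use a deeper architecture):

from keras.models import Sequential
from keras.layers import Dense

def softmax_network(input_dim, nb_classes):
    # Hypothetical reconstruction: a single softmax layer trained with
    # categorical cross-entropy, tracking accuracy as a metric.
    model = Sequential()
    model.add(Dense(nb_classes, input_dim=input_dim, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model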
def metrics_equal():
    dataset_path = dpu.generate_equal_dataset()
    dataset = dpu.load(dataset_path)
    mm = SGDCModelManager()
    mm.x_train, mm.x_test, mm.y_train, mm.y_test = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    mm.train()
    predicts = mm.predict(mm.x_test)
    report = classification_report(mm.y_test, predicts)
    return jsonify(status=200, message=report)
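Since the report here is shipped as a JSON payload, scikit-learn >= 0.20 can return it as a nested dict directly instead of a preformatted string. A minimal sketch, assuming the same `mm` and `predicts` as above:

# scikit-learn >= 0.20: output_dict=True gives a JSON-friendly structure
report = classification_report(mm.y_test, predicts, output_dict=True)
return jsonify(status=200, report=report)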
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': np.logspace(-4, 3, 30),
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
                        {'kernel': ['poly'],
                         'degree': [1, 2, 3, 4],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                         'coef0': np.logspace(-4, 3, 30)},
                        {'kernel': ['linear'],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 60)}]

    clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
ClassificationRandomForest.py — project: AirTicketPredicting, author: junlulocky
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]}]

    # the constructor's n_estimators=30 is a placeholder; the grid above overrides it
    clf = GridSearchCV(RandomForestClassifier(n_estimators=30), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def learn_structure(self, samples):
    X_train, X_train_label, X_test, X_test_label = \
        self._generate_train_test_sets(samples, 0.75)
    logger.info('Training with ' + str(len(X_train)) +
                ' samples; testing with ' + str(len(X_test)) + ' samples.')

    lr_detector = self._get_best_detector(X_train, X_train_label)
    Y_test = lr_detector.predict(X_test)

    num_anomalies = Y_test[Y_test == ANOMALY].size
    logger.info('Found ' + str(num_anomalies) +
                ' anomalies in testing set')

    # the value logged here is a classification report, not a confusion matrix
    logger.info('Classification report: \n{}'.
                format(classification_report(
                    X_test_label,
                    Y_test,
                    target_names=['no', 'yes'])))
    return lr_detector
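`_generate_train_test_sets` is not shown in these snippets. A minimal sketch of one plausible implementation, assuming `samples` is a sequence of (feature_vector, label) pairs and the second argument is the training fraction; the original project's data format may differ:

import numpy as np
from sklearn.model_selection import train_test_split

def _generate_train_test_sets(self, samples, train_size):
    # Hypothetical helper: split labeled samples into train/test
    # features and labels, in the order the callers above unpack them.
    features = np.array([s[0] for s in samples])
    labels = np.array([s[1] for s in samples])
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=train_size, random_state=42)
    return X_train, y_train, X_test, y_test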
def learn_structure(self, samples):
    X_train, X_train_label, X_test, X_test_label = \
        self._generate_train_test_sets(samples, 0.75)
    logger.info('Training with ' + str(len(X_train)) +
                ' samples; testing with ' + str(len(X_test)) + ' samples.')

    svc_detector = self._get_best_detector(X_train, X_train_label)
    Y_test = svc_detector.predict(X_test)

    num_anomalies = Y_test[Y_test == ANOMALY].size
    logger.info('Found ' + str(num_anomalies) +
                ' anomalies in testing set')

    logger.info('Classification report: \n{}'.
                format(classification_report(
                    X_test_label,
                    Y_test,
                    target_names=['no', 'yes'])))
    return svc_detector
def learn_structure(self, samples):
    X_train, X_train_label, X_test, X_test_label = \
        self._generate_train_test_sets(samples, 0.75)
    logger.info('Training with ' + str(len(X_train)) +
                ' samples; testing with ' + str(len(X_test)) + ' samples.')

    dt_detector = self._get_best_detector(X_train, X_train_label)
    Y_test = dt_detector.predict(X_test)

    num_anomalies = Y_test[Y_test == ANOMALY].size
    logger.info('Found ' + str(num_anomalies) +
                ' anomalies in testing set')

    logger.info('Classification report: \n{}'.
                format(classification_report(
                    X_test_label,
                    Y_test,
                    target_names=['no', 'yes'])))
    return dt_detector
def learn_structure(self, samples):
    X_train, X_train_label, X_test, X_test_label = \
        self._generate_train_test_sets(samples, 0.75)
    logger.info('Training with ' + str(len(X_train)) +
                ' samples; testing with ' + str(len(X_test)) + ' samples.')

    rf_detector = self._get_best_detector(X_train, X_train_label)
    Y_test = rf_detector.predict(X_test)

    num_anomalies = Y_test[Y_test == ANOMALY].size
    logger.info('Found ' + str(num_anomalies) +
                ' anomalies in testing set')

    logger.info('Classification report: \n{}'.
                format(classification_report(
                    X_test_label,
                    Y_test,
                    target_names=['no', 'yes'])))
    return rf_detector
def main(log_file, table_file):
    """
    :param log_file: path to the question log to process
    :param table_file: path to the file containing the tables
    :return: None; writes the processed log and a classification report
    """
    tables = read_tables(table_file)
    table_dict = build_table_dict(tables)
    questions = read_log(log_file)
    truth = list()
    prediction = list()
    for q in questions:
        process(q, table_dict[q["tid"]])
        t, p = recalc_index(q)
        truth += t
        prediction += p
    file_base_name = os.path.basename(log_file)
    dirname = os.path.dirname(log_file)
    file = os.path.join(dirname, "processed_" + file_base_name)
    report = classification_report(truth, prediction, target_names=["PAT", "LIT", "TAB", "COL", "CELL"])
    save(questions, report, file)
def score_model(model, data_test, labeler):
    '''
    Print quality metrics for a trained model on the test
    set: feature importances, a classification report, and
    a cross-class report.
    Parameters:
        model - the trained model
        data_test - the test data set
        labeler - LabelEncoder used for the target column
    Returns:
        nothing
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)
    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)
    print(feature_importances_report(model, X_test.columns))
    print("\n" + classification_report(true_labels, predicted_labels))
    print(cross_class_report(true_labels, predicted_labels))
def evaluate(y_test, y_test_proba, nb_classes, path):
    from riddle import roc  # here so np can be seeded before run_pipeline() call
    y_pred = [np.argmax(p) for p in y_test_proba]

    print('Confusion matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()

    print('Classification report:')
    print(classification_report(y_test, y_pred, digits=3))

    print('ROC AUC values:')
    roc_auc, fpr, tpr = roc.compute_roc(y_test, y_test_proba,
                                        nb_classes=nb_classes)
    roc.save_plots(roc_auc, fpr, tpr, nb_classes=nb_classes, path=path)
    for l, r in roc_auc.items():
        print('  {}: {:.5f}'.format(l, r))
    print()
# ---------------------------- PUBLIC FUNCTIONS ------------------------------ #
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Logistic:')
    # print(classification_report(self.testLabel, pred_labels))
    self.classifier = SVC()
    self.classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
    # print('SVM:')
    # print(classification_report(self.testLabel, pred_labels))
    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Decision Tree:')
    # print(classification_report(self.testLabel, pred_labels))
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
    return pred_labels
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('Logistic:')
    print(classification_report(self.testLabel, pred_labels))

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('SVM:')
    print(classification_report(self.testLabel, pred_labels))
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('Logistic:')
    print(classification_report(self.testLabel, pred_labels))

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print('SVM:')
    print(classification_report(self.testLabel, pred_labels))
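The same TF-IDF pipeline can be expressed with scikit-learn alone, avoiding the gensim dense-matrix round trip. A minimal sketch, assuming `trainingSet` and `testSet` are lists of already-tokenized documents:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def fit_and_predict_sklearn(training_set, training_label, test_set, test_label):
    # Documents are pre-tokenized, so pass them through unchanged.
    vectorizer = TfidfVectorizer(analyzer=lambda doc: doc)
    X_train = vectorizer.fit_transform(training_set)
    X_test = vectorizer.transform(test_set)
    classifier = LogisticRegression()
    classifier.fit(X_train, training_label)
    pred_labels = classifier.predict(X_test)
    print('Logistic:')
    print(classification_report(test_label, pred_labels))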
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Logistic:')
    # print(classification_report(self.testLabel, pred_labels))
    pred_labels = {}
    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    for user in self.testDict:
        pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
    # print('SVM:')
    # print(classification_report(self.testLabel, pred_labels))
    return pred_labels
    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print('Decision Tree:')
    # print(classification_report(self.testLabel, pred_labels))
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
def test_model(self, n_folds=10):
    """Test the model with stratified K-folds cross-validation.
    """
    logging.debug("testing model with {}-folds CV".format(n_folds))
    model = self.init_model()
    X = self.data.data
    y = self.data.target
    cv = cross_validation.StratifiedKFold(y, n_folds=n_folds, random_state=42)

    t0 = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=-1, cv=cv)
    t = time() - t0

    print("=" * 52)
    print("time cost: {}".format(t))
    print()
    print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
    print()
    print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(metrics.classification_report(y, y_pred))
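The `sklearn.cross_validation` module was removed in scikit-learn 0.20. A minimal sketch of the equivalent setup under `sklearn.model_selection`, assuming the same `model`, `X`, `y`, and `n_folds` as above (the labels move out of the StratifiedKFold constructor, and `n_folds` becomes `n_splits`):

from sklearn.model_selection import StratifiedKFold, cross_val_predict

# shuffle=True is required for random_state to have an effect in newer versions
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
y_pred = cross_val_predict(model, X=X, y=y, n_jobs=-1, cv=cv)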
def test(self):
    lenW = len(self.vectorizer.vocabulary_)
    W = 3 * lenW
    Y_true = []
    Y_pred = []
    for i, line in enumerate(self.test_lines):
        if line['type'] == 'q':
            r = line['answer']
            id = line['id'] - 1
            indices = [idx for idx in range(i - id, i + 1)]
            memory_list = self.L_test[indices]

            m_o1 = O_t([id], memory_list, self.s_Ot)
            m_o2 = O_t([id, m_o1], memory_list, self.s_Ot)

            bestVal = None
            best = None
            for w in self.vectorizer.vocabulary_:
                val = self.sR([id, m_o1, m_o2], self.H[w], memory_list, self.V)
                if bestVal is None or val > bestVal:
                    bestVal = val
                    best = w
            Y_true.append(r)
            Y_pred.append(best)
    print(metrics.classification_report(Y_true, Y_pred))
def MyEvaluation(y_test, predicted):
    def norm_me(x):
        if str(type(x)).find("int") > -1:
            return x
        zix = np.argmax(x)
        x1 = [0] * len(x)
        x1[zix] = 1
        return x1
    predicted = [norm_me(x) for x in predicted]
    predicted = np.array(predicted, dtype="uint8")

    target_names = ['normal', 'malware']
    inv_map = {v: k for k, v in KLABEL.items()}
    target_names = [inv_map[x] for x in range(WORKING_KLABEL)]
    result = classification_report(y_test, predicted, target_names=target_names)
    print(result)

    averagelabel = 'binary'
    if B_MULTICLASS:
        averagelabel = "macro"  # fixed typo: was 'averaegelabel', which silently left the binary default in place
    v_precision = precision_score(y_test, predicted, average=averagelabel)
    v_recall = recall_score(y_test, predicted, average=averagelabel)

    (TP, FP, TN, FN) = perf_measure(y_test, predicted, KLABEL["malicious"])
    return v_precision, v_recall, TP, FP, TN, FN
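`perf_measure` is not defined in this snippet. A minimal sketch of a plausible implementation, assuming label-encoded (non-one-hot) inputs and a single positive class label; the original project's input format may differ:

def perf_measure(y_true, y_pred, positive_label):
    # Hypothetical helper: tally TP/FP/TN/FN against one positive class.
    TP = FP = TN = FN = 0
    for t, p in zip(y_true, y_pred):
        if p == positive_label:
            if t == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if t == positive_label:
                FN += 1
            else:
                TN += 1
    return TP, FP, TN, FN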
def classification_report(y_pred, y_true, labels):
    """
    Parameters
    ----------
    y_pred : array of predicted labels
    y_true : array of ground-truth labels
    labels : tuple or list of label names, in index order

    Return
    ------
    Classification report in form of string
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    # ====== validate labels ====== #
    labels = as_tuple(labels)
    target_names = [str(i) for i in labels]
    labels = list(range(0, len(labels)))
    # ====== create report ====== #
    s = ""
    s += "Accuracy: %f\n" % accuracy_score(y_true, y_pred, normalize=True)
    s += "Confusion matrix:\n"
    s += str(confusion_matrix(y_true, y_pred, labels=labels)) + '\n'
    s += "Report:\n"
    s += str(classification_report(y_true, y_pred, labels=labels, digits=3,
                                   target_names=target_names))
    return s
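A quick usage sketch for the wrapper above. Note the (y_pred, y_true) argument order, reversed from scikit-learn's convention, and the assumption that labels are integer-encoded in the order of the given names:

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 2, 0, 0]
print(classification_report(y_pred, y_true, labels=('neg', 'neu', 'pos')))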
def splitValidateModel(self, visualizePredictions=False):
    (label_vector, input_vector) = loadData(self.featureFile)

    indexArray = range(0, len(input_vector))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
        cross_validation.train_test_split(input_vector, label_vector, indexArray, test_size=(1.0 - self.percentSplit))

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(trainData, trainLabels)
    predictedLabels = kNNClassifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n"
          % ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
svm_clusterization_test.py — project: rbm_based_autoencoders_with_tensorflow, author: ikhlestov
def test_svm_estimator(estimator, notes, encodings_train, labels_train,
                       encodings_test, labels_test):
    t0 = time()
    estimator.fit(encodings_train, labels_train)
    print("Time cons: %.2fs, type: %s" % (time() - t0, notes))
    predicted = estimator.predict(encodings_test)
    accuracy = metrics.accuracy_score(labels_test, predicted)
    print("Accuracy: %.5f" % accuracy)
    report = metrics.classification_report(labels_test, predicted)
    print(report)
    prec_recall_f_score = metrics.precision_recall_fscore_support(
        labels_test, predicted)
    print('-' * 10)
    prec_recall_f_score_dict = {
        'prec': np.mean(prec_recall_f_score[0]),
        'recall': np.mean(prec_recall_f_score[1]),
        'f_score': np.mean(prec_recall_f_score[2])
    }
    return accuracy, prec_recall_f_score_dict
def classify(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def evaluate(self, x_test, y_test, batch_size=256):
    """Evaluate classifier

    Args:
        x_test (np.array): 3D numpy array (n_samples, embedding_dim, tokenizer.max_sequence_length)
        y_test (np.array): 2D numpy array (n_samples, len(self.category_map))
        batch_size (int): Training batch size
    """
    print('Evaluating...')
    predictions_last_epoch = self.model.predict(x_test, batch_size=batch_size, verbose=1)
    predicted_classes = np.argmax(predictions_last_epoch, axis=1)
    target_names = [''] * len(self.category_map)
    for category in self.category_map:
        target_names[self.category_map[category]] = category
    y_val = np.argmax(y_test, axis=1)
    print(classification_report(y_val, predicted_classes, target_names=target_names, digits=6))
def evaluate(args, model, data):
    train_predict = model.predict(data.trainX)
    print("TRAINING RESULTS")
    print(classification_report(
        [e[1] for e in data.trainY],
        [utils.get_sentiment(e[1]) for e in train_predict],
    ))
    print()

    test_predict = model.predict(data.valX)
    print("DEV RESULTS")
    print(classification_report(
        [e[1] for e in data.valY],
        [utils.get_sentiment(e[1]) for e in test_predict],
    ))
    print()

    if args['--evaluate-test']:
        test_predict = model.predict(data.testX)
        print("TEST RESULTS")
        print(classification_report(
            [e[1] for e in data.testY],
            [utils.get_sentiment(e[1]) for e in test_predict],
        ))
        print()
def eval_model(name, model, data):
    print('=' * 20)
    print(name, 'training')
    model.fit(data, train.target, sample_weight=sample_weights)
    print(name, 'trained')
    predictions = model.predict(processed_test_data)
    print(name, 'accuracy', np.mean(predictions == test.target))
    print(metrics.classification_report(test.target, predictions))
    print(metrics.confusion_matrix(test.target, predictions))
    print(name, 'f1 cross validation', cross_validation.cross_val_score(model, grammar_processed_data, train.target, scoring='f1'))
    print(name, 'precision cross validation', cross_validation.cross_val_score(
        model, grammar_processed_data, train.target, scoring='precision'
    ))
    return model, predictions

# SVMs need balanced input features: comparable ranges and variances
def bio_classification_report(y_gold, y_pred):
    # y_gold: [[],[],[]]
    # y_pred:
    lb = LabelBinarizer()
    y_gold_combined = lb.fit_transform(list(chain.from_iterable(y_gold)))
    # transform (not fit_transform): reuse the binarizer fitted on the gold
    # labels so class indices stay consistent between the two encodings
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_gold_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
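For BIO-tagged sequences, the seqeval package computes entity-level (rather than token-level) metrics with a simpler call shape, which is usually what sequence-labeling evaluation wants. A minimal sketch, assuming lists of tag sequences:

from seqeval.metrics import classification_report

y_gold = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'B-LOC', 'O']]
print(classification_report(y_gold, y_pred))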
def classify():
    reader = DbdReader(DATA_DIR, TRAIN_PATH, target_for_vocabulary=TARGET_PATH, max_vocabulary_size=_vocab_size_, filter="140", threshold=0.6, clear_when_exit=False)
    reader.init()
    dataset, user_vocab, system_vocab = reader.get_dataset()
    labels = reader.get_labels()

    model = make_model(user_vocab, system_vocab)
    model_if = model.create_interface(_buckets_, TRAIN_DIR)

    train_x, test_x, train_t, test_t = train_test_split(dataset, labels, test_size=0.2, random_state=42)

    with tf.Session() as sess:
        detector = Detector(sess, model_if)
        detector.train(sess, train_x, train_t)
        y = [detector.predict(sess, p) for p in test_x]
        y = [lb for lb, prob in y]
        report = classification_report([lb.label for lb in test_t], y, target_names=DbdReader.get_label_names())
        print(report)