def train_detector(x_train, y_train, x_val, y_val):
    fpr, tpr, thresholds = roc_curve(y_train, x_train)
    accuracy = [sklearn.metrics.accuracy_score(y_train, x_train > threshold, normalize=True, sample_weight=None) for threshold in thresholds]
    roc_auc = auc(fpr, tpr)
    # pick the threshold that maximizes training accuracy
    idx_best = np.argmax(accuracy)
    print("Best training accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy[idx_best], tpr[idx_best], fpr[idx_best], thresholds[idx_best]))
    print("ROC_AUC: %.4f" % roc_auc)
    # evaluate the same thresholds on the validation set
    accuracy_val = [sklearn.metrics.accuracy_score(y_val, x_val > threshold, normalize=True, sample_weight=None) for threshold in thresholds]
    tpr_val, fpr_val = zip(*[get_tpr_fpr(y_val, x_val, threshold) for threshold in thresholds])
    # roc_auc_val = auc(fpr_val, tpr_val)
    print("Validation accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy_val[idx_best], tpr_val[idx_best], fpr_val[idx_best], thresholds[idx_best]))
    return thresholds[idx_best], accuracy_val, fpr_val, tpr_val
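# A minimal usage sketch for train_detector, assuming the snippet's missing imports and a
# simple get_tpr_fpr helper (not shown in the original, so this version is an assumption):
import numpy as np
import sklearn.metrics
from sklearn.metrics import roc_curve, auc

def get_tpr_fpr(y_true, scores, threshold):
    # assumed helper: TPR and FPR of the binary decision scores > threshold
    y_pred = scores > threshold
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    return tp / float(tp + fn), fp / float(fp + tn)

rng = np.random.RandomState(0)
# synthetic detection scores: positives score higher on average
y_tr = rng.randint(0, 2, 200)
s_tr = y_tr + rng.normal(scale=0.8, size=200)
y_va = rng.randint(0, 2, 100)
s_va = y_va + rng.normal(scale=0.8, size=100)
best_thr, acc_val, fpr_val, tpr_val = train_detector(s_tr, y_tr, s_va, y_va)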
def build_lstm(output_dim, embeddings):
    loss_function = "categorical_crossentropy"
    # this is the placeholder tensor for the input sequences
    sequence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    # this embedding layer will transform the sequences of integers
    embedded = Embedding(embeddings.shape[0], embeddings.shape[1], input_length=MAX_SEQUENCE_LENGTH, weights=[embeddings], trainable=True)(sequence)
    # 4 convolution layers (each with 1000 filters)
    cnn = [Convolution1D(filter_length=filters, nb_filter=1000, border_mode="same") for filters in [2, 3, 5, 7]]
    # concatenate the outputs of the convolution layers
    merged_cnn = merge([conv(embedded) for conv in cnn], mode="concat")
    # create the attention vector from the max-pooled convolution output
    maxpool = Lambda(lambda x: keras_backend.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    attention_vector = maxpool(merged_cnn)
    forwards = AttentionLSTM(64, attention_vector)(embedded)
    backwards = AttentionLSTM(64, attention_vector, go_backwards=True)(embedded)
    # concatenate the outputs of the 2 LSTM layers
    bi_lstm = merge([forwards, backwards], mode="concat", concat_axis=-1)
    after_dropout = Dropout(0.5)(bi_lstm)
    # softmax output layer
    output = Dense(output_dim=output_dim, activation="softmax")(after_dropout)
    # the complete model
    model = Model(input=sequence, output=output)
    # try using different optimizers and different optimizer configs
    model.compile("adagrad", loss_function, metrics=["accuracy"])
    return model
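# A hedged data-preparation sketch for the model above: build_lstm expects integer-encoded
# sequences padded to MAX_SEQUENCE_LENGTH and one-hot labels. MAX_SEQUENCE_LENGTH, the
# vocabulary size and the embedding size below are assumptions; the custom AttentionLSTM
# layer and the Keras 1.x imports used inside build_lstm must come from the original project.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 100                      # assumed constant from the original module
vocab_size, embedding_dim, n_classes = 5000, 300, 3
embedding_matrix = np.random.rand(vocab_size, embedding_dim).astype("float32")

token_ids = [[12, 7, 256, 3], [45, 2, 891]]    # toy integer-encoded sentences
x = pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)
y = np.eye(n_classes)[[0, 2]]                  # one-hot labels built with plain numpy

# model = build_lstm(output_dim=n_classes, embeddings=embedding_matrix)
# model.fit(x, y, batch_size=32, nb_epoch=5, validation_split=0.1)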
def to_dict_w_opt(model, metrics=None):
    """Serializes a sklearn model. Saves the parameters,
    not the attributes.
    Args:
        model(sklearn.BaseEstimator): the model to serialize,
            must be in SUPPORTED
        metrics(list, optional): a list of metrics to monitor
    Returns:
        a dictionary of the serialized model
    """
    config = dict()
    typestring = str(type(model))[8:][:-2]
    config['config'] = typestring
    attr = model.__dict__
    for k, v in attr.items():
        # check if parameter or attribute
        if k[-1:] == '_':
            # do not store fitted attributes (sklearn marks them with a trailing underscore)
            pass
        else:
            config[k] = typeconversion(v)
    # to be discussed:
    # we add the metrics to the config even if it doesn't
    # make sense for a sklearn model;
    # the metrics are then caught in model_from_dict_w_opt
    if metrics is not None:
        config['metrics'] = []
        for m in metrics:
            config['metrics'].append(m)
    return config
def model_from_dict_w_opt(model_dict, custom_objects=None):
    """Builds a sklearn model from a model serialized with `to_dict_w_opt`.
    Args:
        model_dict(dict): a serialized sklearn model
        custom_objects(dict, optional): a dictionary mapping custom object
            names to custom objects (callables, etc.)
    Returns:
        A new sklearn.BaseEstimator (in SUPPORTED) instance. The attributes
        are not loaded.
    """
    if custom_objects is None:
        custom_objects = dict()
    # custom_objects = {k: deserialize(k, custom_objects[k])
    #                   for k in custom_objects}
    # safety check
    if model_dict['config'] not in keyval:
        raise NotImplementedError("sklearn model not supported.")
    # load the metrics
    if 'metrics' in model_dict:
        metrics = model_dict.pop('metrics')
    else:
        metrics = None
    # create a new instance of the appropriate model type
    model = copy.deepcopy(keyval[model_dict['config']])
    # load the parameters
    for k, v in model_dict.items():
        if isinstance(v, list):  # pragma: no cover
            setattr(model, k, np.array(v))
        else:
            setattr(model, k, v)
    return model, metrics
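# A minimal round-trip sketch for the two helpers above. The original project's SUPPORTED
# registry (`keyval`) and `typeconversion` helper are not shown in this snippet, so the
# stand-ins below are assumptions used only to make the example runnable:
import copy
import numpy as np
from sklearn.linear_model import LogisticRegression

# map the type string produced by to_dict_w_opt back to a template estimator
keyval = {str(type(LogisticRegression()))[8:][:-2]: LogisticRegression()}

def typeconversion(v):
    # assumed helper: make parameter values JSON-friendly
    return v.tolist() if isinstance(v, np.ndarray) else v

clf = LogisticRegression(C=0.5)
serialized = to_dict_w_opt(clf, metrics=["accuracy"])
restored, restored_metrics = model_from_dict_w_opt(serialized)
print(restored.C, restored_metrics)  # -> 0.5 ['accuracy']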
def select_model(training_data, method='logistic',
                 do_segment_split=True,
                 processes=1,
                 cv_verbosity=2,
                 model_params=None,
                 random_state=None):
    """
    Fits a model given by *method* to the training data.
    :param training_data: The training data to fit the model with
    :param method: A string which specifies the model to use.
    :param do_segment_split: If True, the training data will be split by segment.
    :param processes: The number of processes to use for the grid search.
    :param cv_verbosity: The verbosity level of the grid search. 0 is silent, 2 is maximum verbosity.
    :param model_params: An optional dictionary with keyword arguments to tune the grid search.
    :param random_state: A constant which will seed the random number generator if given.
    :return: The fitted grid search object.
    """
    logging.info("Training a {} model".format(method))
    training_data_x = training_data.drop('Preictal', axis=1)
    training_data_y = training_data['Preictal']
    cv = get_cv_generator(training_data, do_segment_split=do_segment_split, random_state=random_state)
    scorer = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score, average='weighted')
    model_dict = get_model(method,
                           training_data_x,
                           training_data_y,
                           model_params=model_params,
                           random_state=random_state)
    common_cv_kwargs = dict(cv=cv,
                            scoring=scorer,
                            n_jobs=processes,
                            pre_dispatch='2*n_jobs',
                            refit=True,
                            verbose=cv_verbosity,
                            iid=False)
    cv_kwargs = dict(common_cv_kwargs)
    cv_kwargs.update(model_dict)
    logging.info("Running grid search using the parameters: {}".format(model_dict))
    clf = GridSearchCV(**cv_kwargs)
    clf.fit(training_data_x, training_data_y)
    return clf
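# Hedged sketch of the grid-search setup above, without the project's own get_model /
# get_cv_generator helpers (the estimator and parameter grid below are assumptions
# standing in for what `model_dict` would provide):
import sklearn.metrics
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
scorer = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score, average='weighted')
cv_kwargs = dict(cv=StratifiedKFold(n_splits=5),
                 scoring=scorer,
                 n_jobs=1,
                 refit=True,
                 estimator=LogisticRegression(max_iter=1000),
                 param_grid={'C': [0.1, 1.0, 10.0]})
clf = GridSearchCV(**cv_kwargs)
clf.fit(X, y)
print(clf.best_params_, clf.best_score_)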
def classification_report(y_true, y_pred, labels=None, sample_weight=None, digits=4, threshold=None):
    # this function is copied from https://github.com/scikit-learn/scikit-learn/blob/412996f/sklearn/metrics/classification.py#L1341 (c) respective authors
    # I pulled it here to fix a formatting bug.
    from sklearn.metrics import precision_recall_fscore_support, accuracy_score
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if labels is None:
        from sklearn.utils.multiclass import unique_labels
        if threshold is not None:
            y_true = y_true > threshold
            y_pred = y_pred > threshold
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)
    last_line_heading = 'avg / total'
    target_names = ['%s' % l for l in labels]
    results = [["", "precision", "recall", "f1-score", "support", "accuracy"]]
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight)
    # one row per class
    for i, label in enumerate(labels):
        values = [target_names[i]]
        for v in (p[i], r[i], f1[i]):
            values += ["{0:0.{1}f}".format(v, digits)]
        values += ["{0}".format(s[i])]
        accuracy = accuracy_score(y_true == label, y_pred == label, sample_weight=sample_weight)
        values += ["{0:0.{1}f}".format(accuracy, digits)]
        results.append(values)
    # support-weighted averages in the last row
    values = [last_line_heading]
    for v in (np.average(p, weights=s),
              np.average(r, weights=s),
              np.average(f1, weights=s)):
        values += ["{0:0.{1}f}".format(v, digits)]
    values += ['{0}'.format(np.sum(s))]
    accuracy = accuracy_score(y_true, y_pred, sample_weight=sample_weight)
    values += ["{0:0.{1}f}".format(accuracy, digits)]
    results.append(values)
    return results
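# A small usage sketch for the report variant above (toy labels, purely illustrative):
import numpy as np

y_true = [0, 0, 1, 1, 2, 2, 2]
y_pred = [0, 1, 1, 1, 2, 2, 0]
rows = classification_report(y_true, y_pred, digits=3)
for row in rows:
    # each row is a list of strings: name, precision, recall, f1-score, support, accuracy
    print("{:>12} {:>10} {:>10} {:>10} {:>8} {:>9}".format(*row))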
def roc_plot_from_thresholds(roc_thresholds_by_model, save=False, debug=False):
    """
    From a given dictionary of thresholds by model, create a ROC curve for each model.
    Args:
        roc_thresholds_by_model (dict): A dictionary of ROC thresholds by model name.
        save (bool): False to display the image (default) or True to save it (but not display it)
        debug (bool): verbose output.
    """
    # TODO consolidate this and the PR plotter into 1 function
    # TODO make the colors randomly generated from rgb values
    # Cycle through the colors list
    color_iterator = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
    # Initialize plot
    plt.figure()
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot([0, 1], [0, 1], linestyle=DIAGONAL_LINE_STYLE, color=DIAGONAL_LINE_COLOR)
    # Calculate and plot for each model
    for color, (model_name, metrics) in zip(color_iterator, roc_thresholds_by_model.items()):
        # Extract model name and metrics from dictionary
        roc_auc = metrics['roc_auc']
        tpr = metrics['true_positive_rates']
        fpr = metrics['false_positive_rates']
        best_true_positive_rate = metrics['best_true_positive_rate']
        best_false_positive_rate = metrics['best_false_positive_rate']
        if debug:
            print('{} model:'.format(model_name))
            print(pd.DataFrame({'FPR': fpr, 'TPR': tpr}))
        # plot the line
        label = '{} (ROC AUC = {})'.format(model_name, round(roc_auc, 2))
        plt.plot(fpr, tpr, color=color, label=label)
        plt.plot([best_false_positive_rate], [best_true_positive_rate], marker='*', markersize=10, color=color)
    plt.legend(loc="lower right")
    if save:
        plt.savefig('ROC.png')
        source_path = os.path.dirname(os.path.abspath(__file__))
        print('\nROC plot saved in: {}'.format(source_path))
    plt.show()
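# Hedged sketch of how a `roc_thresholds_by_model` dictionary with the keys read above
# could be assembled from sklearn.metrics (the construction itself is an assumption;
# only the key names come from the plotting function):
import numpy as np
from sklearn.metrics import roc_curve, auc

def make_roc_entry(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    best_idx = int(np.argmax(tpr - fpr))  # Youden's J statistic as one way to pick a point
    return {'roc_auc': auc(fpr, tpr),
            'true_positive_rates': tpr,
            'false_positive_rates': fpr,
            'best_true_positive_rate': tpr[best_idx],
            'best_false_positive_rate': fpr[best_idx]}

# roc_plot_from_thresholds({'my_model': make_roc_entry(y_test, probas)}, save=False)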
def pr_plot_from_thresholds(pr_thresholds_by_model, save=False, debug=False):
    """
    From a given dictionary of thresholds by model, create a PR curve for each model.
    Args:
        pr_thresholds_by_model (dict): A dictionary of PR thresholds by model name.
        save (bool): False to display the image (default) or True to save it (but not display it)
        debug (bool): verbose output.
    """
    # TODO consolidate this and the ROC plotter into 1 function
    # TODO make the colors randomly generated from rgb values
    # Cycle through the colors list
    color_iterator = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
    # Initialize plot
    plt.figure()
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall (PR)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot([0, 1], [1, 0], linestyle=DIAGONAL_LINE_STYLE, color=DIAGONAL_LINE_COLOR)
    # Calculate and plot for each model
    for color, (model_name, metrics) in zip(color_iterator, pr_thresholds_by_model.items()):
        # Extract model name and metrics from dictionary
        pr_auc = metrics['pr_auc']
        precision = metrics['precisions']
        recall = metrics['recalls']
        best_recall = metrics['best_recall']
        best_precision = metrics['best_precision']
        if debug:
            print('{} model:'.format(model_name))
            print(pd.DataFrame({'Recall': recall, 'Precision': precision}))
        # plot the line
        label = '{} (PR AUC = {})'.format(model_name, round(pr_auc, 2))
        plt.plot(recall, precision, color=color, label=label)
        plt.plot([best_recall], [best_precision], marker='*', markersize=10, color=color)
    plt.legend(loc="lower left")
    if save:
        plt.savefig('PR.png')
        source_path = os.path.dirname(os.path.abspath(__file__))
        print('\nPR plot saved in: {}'.format(source_path))
    plt.show()
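# Companion sketch for the PR plotter, analogous to the ROC entry above; again the
# construction is an assumption, only the dictionary keys come from the function:
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score

def make_pr_entry(y_true, y_score):
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    best_idx = int(np.argmax(2 * precision * recall / np.maximum(precision + recall, 1e-12)))  # best F1 point
    return {'pr_auc': average_precision_score(y_true, y_score),
            'precisions': precision,
            'recalls': recall,
            'best_precision': precision[best_idx],
            'best_recall': recall[best_idx]}

# pr_plot_from_thresholds({'my_model': make_pr_entry(y_test, probas)}, save=False)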
# Source: machine_learning.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def decis_tree(wine_set):
    # remember whether wine_set is the red or the white data set
    w = wine_set
    # subset data for better tree visibility
    # wine_set = wine_set[:100]
    # recode quality (response variable) into 2 groups: 0:{3,4,5}, 1:{6,7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)
    # round explanatory data for an easier tree
    # wine_set["residual_sugar"] = wine_set["residual_sugar"].round()
    # wine_set["alcohol"] = wine_set["alcohol"].round()
    # split into training and testing sets
    predictors = wine_set[["residual_sugar", 'alcohol']]
    targets = wine_set.quality_c
    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
    # build model on training data
    classifier = DecisionTreeClassifier()
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    # print the confusion matrix and accuracy of the model
    print(sklearn.metrics.confusion_matrix(tar_test, predictions))
    print(sklearn.metrics.accuracy_score(tar_test, predictions))
    # export the tree for viewing
    if w.equals(red):
        export_graphviz(classifier, out_file="red_decision_tree.dot")
    else:
        export_graphviz(classifier, out_file="white_decision_tree.dot")
    # to view the decision tree, create a .pdf file from the generated .dot file
    # by typing in the terminal from this directory: dot -Tpdf decision_tree.dot -o decision_tree.pdf
# print('----------------Decision Tree------------------------')
# call(decis_tree)
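# The wine data and the global `red` DataFrame are not part of this snippet, so here is a
# self-contained sketch of the same evaluate-and-export steps on a toy dataset (the
# dataset and output file name below are assumptions):
import sklearn.metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=0)
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
pred = clf.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, pred))
print(sklearn.metrics.accuracy_score(y_test, pred))
export_graphviz(clf, out_file="toy_decision_tree.dot")  # render with: dot -Tpdf toy_decision_tree.dot -o toy_decision_tree.pdf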
# ____________________________________Random Forests________________
# Source: machine_learning.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def random_forests(wine_set):
    # recode quality (response variable) into 2 groups: 0:{3,4,5}, 1:{6,7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)
    # split into training and testing sets
    predictors = wine_set[["density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity', 'chlorides', 'fixed_acidity',
                           'citric_acid', 'residual_sugar', 'free_sulfur_dioxide', 'total_sulfur_dioxide']]
    targets = wine_set.quality_c
    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
    # build model on training data
    classifier = RandomForestClassifier(n_estimators=25)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    # print the confusion matrix and accuracy of the model
    print('confusion matrix:\n', sklearn.metrics.confusion_matrix(tar_test, predictions))
    print('\naccuracy:', sklearn.metrics.accuracy_score(tar_test, predictions))
    # display the relative importance of each predictive variable
    model = ExtraTreesClassifier()
    model.fit(pred_train, tar_train)
    print('importance of predictors:')
    dct = dict()
    for c in range(len(predictors.columns)):
        dct[predictors.columns[c]] = model.feature_importances_[c]
    print(sorted(dct.items(), key=operator.itemgetter(1), reverse=True))
    # run different numbers of trees to see the effect of the number on the accuracy of the prediction
    n = 100
    accuracy = [0] * n
    for i in range(n):
        classifier = RandomForestClassifier(n_estimators=i + 1)
        classifier = classifier.fit(pred_train, tar_train)
        predictions = classifier.predict(pred_test)
        accuracy[i] = sklearn.metrics.accuracy_score(tar_test, predictions)
    plt.plot(range(1, n + 1), accuracy)
    plt.xlabel("Number of trees")
    plt.ylabel("Accuracy of prediction")
    plt.title("Effect of the number of trees on the prediction accuracy")
    plt.show()
    print(accuracy)
# print('----------------Random Forests------------------------')
# call(random_forests)
# ________________________________Lasso Regression__________________________________
def do_system_evaluation(dataset, dataset_evaluation_mode, result_path):
    # Set warnings off, sklearn metrics will trigger a warning for classes without
    # predicted samples in F1-scoring. This is just to keep the printing clean.
    # warnings.simplefilter("ignore")
    fold_wise_class_eer = numpy.zeros((len(dataset.folds(mode=dataset_evaluation_mode)), dataset.audio_tag_count))
    for fold in dataset.folds(mode=dataset_evaluation_mode):
        class_wise_eer = numpy.zeros((dataset.audio_tag_count))
        results = []
        result_filename = get_result_filename(fold=fold, path=result_path)
        if os.path.isfile(result_filename):
            with open(result_filename, 'rt') as f:
                for row in csv.reader(f, delimiter=','):
                    results.append(row)
        else:
            raise IOError("Result file not found [%s]" % result_filename)
        for tag_id, tag in enumerate(dataset.audio_tags):
            y_true_binary = []
            y_true_file = []
            y_score = []
            for result in results:
                if tag == result[1]:
                    relative_path = dataset.package_list[0]['local_audio_path'].replace(dataset.local_path, '')[1:] + os.path.sep + result[0]
                    y_true_file.append(result[0])
                    if tag in dataset.file_meta(relative_path)[0]['tags']:
                        y_true_binary.append(1)
                    else:
                        y_true_binary.append(0)
                    y_score.append(float(result[2]))
            if numpy.any(y_true_binary):
                class_wise_eer[tag_id] = compute_eer(result_filename, tag, dict(zip(y_true_file, y_true_binary)))
            else:
                class_wise_eer[tag_id] = None
        fold_wise_class_eer[fold - 1 if fold > 0 else fold, :] = class_wise_eer
    print(" File-wise evaluation, over %d folds" % (dataset.fold_count))
    print(" {:20s} | {:8s}".format('Tag', 'EER'))
    print(" ===============================================")
    labels = numpy.array([dataset.tagcode_to_taglabel(t) for t in dataset.audio_tags])
    for i in numpy.argsort(labels):
        print(" {:20s} | {:3.3f} ".format(labels[i], numpy.nanmean(fold_wise_class_eer[:, i])))
    print(" ===============================================")
    print(" {:20s} | {:3.3f} ".format('Mean error', numpy.mean(numpy.nanmean(fold_wise_class_eer))))
    # Restore warnings to default settings
    warnings.simplefilter("default")
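# `compute_eer` above is provided elsewhere in the original project and reads the result
# file itself; the snippet below is only a generic sketch of how an equal error rate can
# be derived from scores with sklearn.metrics (it is not the project's compute_eer):
import numpy
from sklearn.metrics import roc_curve

def eer_from_scores(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    fnr = 1.0 - tpr
    # the EER is (approximately) where the false positive and false negative rates cross
    idx = numpy.nanargmin(numpy.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2.0

print(eer_from_scores([0, 0, 1, 1, 1, 0], [0.1, 0.4, 0.35, 0.8, 0.7, 0.6]))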
def rand_forest_train(self):
    # read the user data
    users = pd.read_csv('names.csv')
    # use similarity, platform, reputation and entropy as the explanatory features
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']
    # hold out 25% of the samples for testing
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    # vectorize the feature dictionaries
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))
    # train a single decision tree and predict on the test set
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)
    # train a random forest classifier and predict on the test set
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)
    # train a gradient boosting classifier and predict on the test set
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)
    from sklearn.metrics import classification_report
    # print the accuracy plus the precision / recall / F1 report for the decision tree
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(y_test, dtc_y_pred))
    # print the accuracy plus the precision / recall / F1 report for the random forest
    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(y_test, rfc_y_pred))
    # print the accuracy plus the precision / recall / F1 report for the gradient boosting classifier
    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(y_test, gbc_y_pred))
    users = pd.read_csv('values.csv')
    # predict on the new, unlabeled data with the random forest
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))
    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
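# Hedged usage sketch for the training method above. The column layout of names.csv /
# values.csv is an assumption inferred from the code (similarity, platform, reputation,
# entropy plus a human_or_machine label), and the wrapper class is purely illustrative:
import pandas as pd

class BotDetector(object):
    rand_forest_train = rand_forest_train  # reuse the method defined above

detector = BotDetector()
detector.rand_forest_train()  # expects names.csv and values.csv in the working directory
# the fitted models are then available as detector.dtc, detector.rfc and detector.gbc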