def _check_binary_probabilistic_predictions(y_true, y_prob):
    """Check that y_true is binary and y_prob contains valid probabilities."""
    check_consistent_length(y_true, y_prob)

    labels = np.unique(y_true)

    if len(labels) != 2:
        raise ValueError("Only binary classification is supported. "
                         "Provided labels %s." % labels)

    if y_prob.max() > 1:
        raise ValueError("y_prob contains values greater than 1.")

    if y_prob.min() < 0:
        raise ValueError("y_prob contains values less than 0.")

    return label_binarize(y_true, labels)[:, 0]
Python label_binarize() example source code
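Before the per-project snippets, here is a minimal sketch of what scikit-learn's label_binarize itself returns (expected output shown as comments; assumes a reasonably recent scikit-learn):

# Minimal label_binarize sketch: multiclass labels become a one-hot indicator
# matrix with one column per entry in `classes`.
from sklearn.preprocessing import label_binarize

print(label_binarize([1, 6, 4, 2], classes=[1, 2, 4, 6]))
# [[1 0 0 0]
#  [0 0 0 1]
#  [0 0 1 0]
#  [0 1 0 0]]

# With exactly two classes the result is a single 0/1 column, which is why
# several snippets below index [:, 0] or hstack a complementary column.
print(label_binarize(['no', 'yes', 'yes'], classes=['no', 'yes']))
# [[0]
#  [1]
#  [1]]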
def test_precision_recall_f_ignored_labels():
    # Test a subset of labels may be requested for PRF
    y_true = [1, 1, 2, 3]
    y_pred = [1, 3, 3, 3]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
    data = [(y_true, y_pred),
            (y_true_bin, y_pred_bin)]

    for i, (y_true, y_pred) in enumerate(data):
        recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3])
        recall_all = partial(recall_score, y_true, y_pred, labels=None)

        assert_array_almost_equal([.5, 1.], recall_13(average=None))
        assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro'))
        assert_almost_equal((.5 * 2 + 1. * 1) / 3,
                            recall_13(average='weighted'))
        assert_almost_equal(2. / 3, recall_13(average='micro'))

        # ensure the above were meaningful tests:
        for average in ['macro', 'weighted', 'micro']:
            assert_not_equal(recall_13(average=average),
                             recall_all(average=average))
def multilabel_precision_recall(y_score, y_test, clf_target_ids, clf_target_names):
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import average_precision_score
    from sklearn.preprocessing import label_binarize

    # Compute Precision-Recall and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()

    # Find indices that have non-zero detections
    clf_target_map = {k: v for k, v in zip(clf_target_ids, clf_target_names)}
    id2ind = {tid: idx for (idx, tid) in enumerate(clf_target_ids)}

    # Only handle the targets encountered
    unique = np.unique(y_test)
    nzinds = np.int64([id2ind[target] for target in unique])

    # Binarize and create precision-recall curves
    y_test_multi = label_binarize(y_test, classes=unique)
    for i, target in enumerate(unique):
        index = id2ind[target]
        name = clf_target_map[target]
        precision[name], recall[name], _ = precision_recall_curve(y_test_multi[:, i],
                                                                   y_score[:, index])
        average_precision[name] = average_precision_score(y_test_multi[:, i], y_score[:, index])

    # Compute micro-average precision-recall curve and average precision
    precision["average"], recall["average"], _ = precision_recall_curve(y_test_multi.ravel(),
                                                                        y_score[:, nzinds].ravel())
    average_precision["micro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="micro")
    average_precision["macro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="macro")
    return precision, recall, average_precision
def binarize_labels(actual):
    return label_binarize(actual, list(set(actual)))
def roc_auc(actual, predictions, average='weighted'):
    class_names = list(set(actual))
    # use binarized values for AUC score calculation
    return roc_auc_score(label_binarize(actual, class_names),
                         label_binarize(predictions, class_names),
                         average=average)
def generate_prec_recall_points(clf, test_examples, test_labels, pk_file):
    # Generate precision-recall points and store them in a pickle file.
    precision = dict()
    recall = dict()
    average_precision = dict()
    thresholds = dict()

    n_classes = len(clf.model.classes_)
    y_test = label_binarize(test_labels, clf.model.classes_)
    y_score = clf.predict_raw_prob(test_examples)
    # It only outputs 1 column of positive probability.
    y_score = y_score[:, 1:]

    for i in range(n_classes - 1):
        precision[i], recall[i], thresholds[i] = precision_recall_curve(
            y_test[:, i],
            y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i],
                                                       y_score[:, i])

    # Compute micro-average precision-recall curve and average precision
    precision["micro"], recall["micro"], thresholds['micro'] = \
        precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score,
                                                         average="micro")

    if pk_file is not None:
        with open(pk_file, 'wb') as f:
            pickle.dump((precision, recall, average_precision, thresholds), f)
def roc(y_true, y_pred, classes=[0, 1, 2, 3, 4]):
    y_true = label_binarize(y_true, classes=classes)
    y_pred = label_binarize(y_pred, classes=classes)
    n_classes = len(classes)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return roc_auc
def _marg_rounded(self, x, y):
    y_node = y.nodes
    y_link = y.links
    Y_node = label_binarize(y_node, self.prop_encoder_.classes_)
    Y_link = label_binarize(y_link, self.link_encoder_.classes_)

    # XXX can this be avoided?
    Y_node, Y_link = map(_binary_2d, (Y_node, Y_link))

    src_type = Y_node[x.link_to_prop[:, 0]]
    trg_type = Y_node[x.link_to_prop[:, 1]]

    if self.compat_features:
        pw = np.einsum('...j,...k,...l->...jkl',
                       src_type, trg_type, Y_link)
        compat = np.tensordot(x.X_compat.T, pw, axes=[1, 0])
    else:
        # equivalent to compat_features == np.ones(n_links)
        compat = np.einsum('ij,ik,il->jkl', src_type, trg_type, Y_link)

    second_order = []
    if self.coparents_ or self.grandparents_ or self.siblings_:
        link = {(a, b): k for k, (a, b) in enumerate(x.link_to_prop)}
        if self.coparents_:
            second_order.extend(y_link[link[a, b]] & y_link[link[c, b]]
                                for a, b, c in x.second_order)
        if self.grandparents_:
            second_order.extend(y_link[link[a, b]] & y_link[link[b, c]]
                                for a, b, c in x.second_order)
        if self.siblings_:
            second_order.extend(y_link[link[b, a]] & y_link[link[b, c]]
                                for a, b, c in x.second_order)
    second_order = np.array(second_order)

    return Y_node, Y_link, compat, second_order
def compute_roc(y_test, y_test_proba, nb_classes):
    y_test = label_binarize(y_test, classes=range(0, nb_classes))

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(nb_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_test_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                              y_test_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return roc_auc, fpr, tpr
def load_abalone_data(proportion=1044. / 4177):
    from sklearn import datasets
    from sklearn import preprocessing
    from sklearn import cross_validation

    abalone = datasets.fetch_mldata('regression-datasets abalone')
    X_cate = np.array([abalone.target[i].tolist()
                       for i in range(abalone.target.shape[0])])
    X_cate = preprocessing.label_binarize(X_cate, np.unique(X_cate))
    X = np.hstack((X_cate, abalone.data))
    y = abalone.int1[0].T.astype(np.float64)
    y = y[:, None]
    X = X.astype(np.float64)
    X_train, X_test, y_train, y_test = \
        cross_validation.train_test_split(X, y, test_size=proportion)
    return X_train, y_train, X_test, y_test
def _score_micro_average(self, y, y_pred, classes, n_classes):
    """
    Compute the micro average scores for the ROCAUC curves.
    """
    # Convert y to binarized array for micro and macro scores
    y = label_binarize(y, classes=classes)
    if n_classes == 2:
        y = np.hstack((1 - y, y))

    # Compute micro-average
    self.fpr[MICRO], self.tpr[MICRO], _ = roc_curve(y.ravel(), y_pred.ravel())
    self.roc_auc[MICRO] = auc(self.fpr[MICRO], self.tpr[MICRO])
def test_precision_recall_curve():
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Binarize the labels for one-vs-rest evaluation
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Add noisy features to make the problem harder
    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5, random_state=0)
    clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=0))
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    precision = dict()
    recall = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
                                                            y_score[:, i])
        ax.plot(recall[i], precision[i], label="target=%s" % i)
    ax.set_xlabel("Recall Score")
    ax.set_ylabel("Precision Score")
    ax.set_title("P-R")
    ax.legend(loc='best')
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
def test_roc_auc_score():
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Binarize the labels for one-vs-rest evaluation
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Add noisy features to make the problem harder
    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5, random_state=0)
    clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=0))
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        # roc_auc_score expects labels and scores, not the ROC curve points
        roc_auc[i] = roc_auc_score(y_test[:, i], y_score[:, i])
        ax.plot(fpr[i], tpr[i], label="target=%s,auc=%s" % (i, roc_auc[i]))
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title("ROC")
    ax.legend(loc="best")
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
def ensemble_classify():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,)

    ## do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def lin_svc():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,)

    ## do transformation into vector
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')
    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def test_precision_recall_f_extra_labels():
    # Test handling of explicit additional (not in input) labels to PRF
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
    data = [(y_true, y_pred),
            (y_true_bin, y_pred_bin)]

    for i, (y_true, y_pred) in enumerate(data):
        # No average: zeros in array
        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
                              average=None)
        assert_array_almost_equal([0., 1., 1., .5, 0.], actual)

        # Macro average is changed
        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
                              average='macro')
        assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual)

        # No effect otherwise
        for average in ['micro', 'weighted', 'samples']:
            if average == 'samples' and i == 0:
                continue
            assert_almost_equal(recall_score(y_true, y_pred,
                                             labels=[0, 1, 2, 3, 4],
                                             average=average),
                                recall_score(y_true, y_pred, labels=None,
                                             average=average))

    # Error when introducing invalid label in multilabel case
    # (although it would only affect performance if average='macro'/None)
    for average in [None, 'macro', 'micro', 'samples']:
        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
                      labels=np.arange(6), average=average)
        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
                      labels=np.arange(-1, 4), average=average)
def test_matthews_corrcoef():
    rng = np.random.RandomState(0)
    y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)]

    # corrcoef of same vectors must be 1
    assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)

    # corrcoef, when the two vectors are opposites of each other, should be -1
    y_true_inv = ["b" if i == "a" else "a" for i in y_true]
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1)

    y_true_inv2 = label_binarize(y_true, ["a", "b"]) * -1
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1)

    # For the zero vector case, the corrcoef cannot be calculated and should
    # result in a RuntimeWarning
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0])

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # And also for any other vector with 0 variance
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, y_true,
                               rng.randint(-100, 100) * np.ones(20, dtype=int))

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # These two vectors have 0 correlation and hence mcc should be 0
    y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
    y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.)

    # Check that sample weight is able to selectively exclude
    mask = [1] * 10 + [0] * 10
    # Now the first half of the vector elements are alone given a weight of 1
    # and hence the mcc will not be a perfect 0 as in the previous case
    assert_raises(AssertionError, assert_almost_equal,
                  matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.)
def plot_roc(y_score, y_test, target_map, title='ROC curve'):
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc, precision_recall_curve
    from sklearn.preprocessing import label_binarize

    # Compute ROC curve and area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    target_ids = list(target_map.keys())
    target_names = list(target_map.values())
    print(target_names)

    y_test_multi = label_binarize(y_test, classes=target_ids)
    N, n_classes = y_score.shape[:2]
    for i, name in enumerate(target_names):
        fpr[name], tpr[name], _ = roc_curve(y_test_multi[:, i], y_score[:, i])
        roc_auc[name] = auc(fpr[name], tpr[name])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_multi.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot the micro-averaged ROC curve
    plt.clf()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr["micro"], tpr["micro"],
             label='ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]), linewidth=3)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower right")
    plt.show()

    # Plot per-class ROC curves
    for i, name in enumerate(target_names):
        plt.plot(fpr[name], tpr[name],
                 label='{0}'.format(name.title().replace('_', ' ')))
        # label='{0} (area = {1:0.2f})'
        #       ''.format(name.title().replace('_', ' '), roc_auc[name]))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show(block=False)
def _as_dmatrix(self):
    kwargs = dict(label=self.records['label'])
    kwargs['feature_names'] = self.feature_names
    featdat = self.records[self.basic_feat_cols]
    featdat = featdat.view(fields.dtype).reshape(len(featdat), -1)

    if self.hps.embedding_tag:
        embs = cache_embeddings.load_embeddings(self.hps.embedding_tag)
        npids, embsize = embs.shape
        assert embsize == self.hps.embedding_dimension
        logging.info('Loaded {}-d embeddings from rnn model {}'.format(
            embsize, self.hps.embedding_tag))
        pids = self.records['pid']
        # NB: pids are 1-indexed
        pidxs = (pids - 1).astype(np.int32)
        lookuped = embs[pidxs]
        orig_shape = featdat.shape
        featdat = np.hstack((featdat, lookuped))
        logging.info('Shape went from {} to {} after adding pid embeddings'.format(
            orig_shape, featdat.shape))

    onehot_matrices = []
    for onehot_var in self.onehot_vars:
        onehot = label_binarize(self.records[onehot_var],
                                classes=range(1, self.FIELD_TO_NVALUES[onehot_var] + 1),
                                sparse_output=True).astype(fields.dtype)
        onehot_matrices.append(onehot)
    if onehot_matrices:
        # TODO: There are some perf issues with this. Look into this workaround:
        # https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices/33259578#33259578
        featdat = scipy.sparse.hstack([featdat, ] + onehot_matrices, format='csr')

    logging.info('Made dmatrix with feature data having shape {}'.format(featdat.shape))

    # https://github.com/dmlc/xgboost/issues/2554
    if not kwargs['label'].flags.c_contiguous:
        logging.info('Contiguizing labels')
        kwargs['label'] = np.ascontiguousarray(kwargs['label'])
        logging.info('Contiguized')
    if isinstance(featdat, np.ndarray) and not featdat.flags.c_contiguous:
        logging.info('Contiguizing feature data')
        featdat = np.ascontiguousarray(featdat)

    if FTYPES:
        kwargs['feature_types'] = self.feature_types

    return xgb.DMatrix(featdat, **kwargs)
def main():
    plt.figure()
    for j in range(1, 6):
        random_state = np.random.RandomState(0)
        X, y = load_file(file_name, j)
        k = 2
        # y = label_binarize(y, classes=[0, 1, 2])
        # n_classes = y.shape[1]
        # print n_classes
        n_classes = 2
        ylabel, ave = transformtolabel(y, k)
        ylabel = np.array(ylabel)
        # ylabel = np.transpose(ylabel)

        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, ylabel, test_size=.5,
                                                            random_state=0)

        # Learn to predict each class against the other
        classifier = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True,
                                                 random_state=random_state))
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            # print y_test[i]
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # print fpr[1]

        # Plot the ROC curve for the positive class at this feature count
        # plt.figure()
        # plt.plot(fpr[0], tpr[0], label='CO below %0.2f' % ave + ' (area = %0.2f)' % roc_auc[0])
        plt.plot(fpr[1], tpr[1], label='O3 prediction (area = %0.2f)' % roc_auc[1] + ' (%0.0f' % j + ' features)')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for SVM')
    plt.legend(loc="lower right")
    plt.show()
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')