def plot_significance_score(model, X, Y):
    from sklearn.model_selection import permutation_test_score, StratifiedKFold
    cv = StratifiedKFold(10)
    # permutation_test_score expects numpy arrays, not pandas DataFrames,
    # hence the .values conversion below
    score, permutation_scores, pvalue = permutation_test_score(
        model, X.values, Y.values, scoring="roc_auc", cv=cv,
        n_permutations=100)
    print("Classification Score %s (p-value: %s)" % (score, pvalue))
Example source code for Python's StratifiedKFold()
def setUp(self):
    bl1 = RandomForestClassifier(random_state=8)
    bl2 = LogisticRegression()
    bl3 = RandomForestClassifier(max_depth=10, random_state=10)
    meta_est = LogisticRegression()
    # shuffle=True is required when random_state is set (recent scikit-learn
    # raises a ValueError otherwise)
    skf = StratifiedKFold(shuffle=True, random_state=8).split
    self.stacked_ensemble = stacker.XcessivStackedEnsemble(
        [bl1, bl2, bl3],
        ['predict', 'predict_proba', 'predict_proba'],
        meta_est,
        skf
    )
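For readers without Xcessiv installed, roughly the same stacking arrangement can be sketched with scikit-learn's own StackingClassifier; this is an illustrative equivalent under that assumption, not the XcessivStackedEnsemble API:

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

stack = StackingClassifier(
    estimators=[('rf1', RandomForestClassifier(random_state=8)),
                ('lr', LogisticRegression(max_iter=1000)),
                ('rf2', RandomForestClassifier(max_depth=10, random_state=10))],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=StratifiedKFold(shuffle=True, random_state=8),
    stack_method='auto',  # each base learner uses predict_proba when available
)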
def get_sample_dataset(dataset_properties):
    """Returns a sample dataset.

    Args:
        dataset_properties (dict): Dictionary describing the properties of the
            dataset used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): An iterator that yields train/test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    # shuffle=True is required whenever random_state is set (recent
    # scikit-learn raises a ValueError otherwise)
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, shuffle=True,
                                                     random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, shuffle=True,
                                                 random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, shuffle=True,
                                                 random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, shuffle=True,
                                                 random_state=8).split(X, y)
    elif data_type == 'boston':
        # note: load_boston was removed in scikit-learn 1.2
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, shuffle=True,
                                       random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, shuffle=True,
                                       random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
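A short driver for get_sample_dataset, assuming the module-level imports the snippet relies on (datasets, model_selection, exceptions) are in place:

X, y, splits = get_sample_dataset({'type': 'iris'})
for train_idx, test_idx in splits:
    print(len(train_idx), len(test_idx))  # two stratified halves of the iris data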
preprocess.py (file source)
Project: Sentence-Classification-with-RNN-in-TensorFlow
Author: zakizhou
def build_corpus():
    positive_sentences = codecs.open("rt-polaritydata/rt-polarity.pos").readlines()
    negative_sentences = codecs.open("rt-polaritydata/rt-polarity.neg").readlines()
    num_positive = len(positive_sentences)
    num_negative = len(negative_sentences)
    labels = [1] * num_positive + [0] * num_negative
    sentences = positive_sentences + negative_sentences
    clean = [word_tokenize(clean_sentence(sentence)) for sentence in sentences]
    # flatten the tokenized sentences into one word list
    # (in Python 3, reduce lives in functools)
    total = reduce(lambda sent1, sent2: sent1 + sent2, clean)
    counter = collections.Counter(total)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word2id = dict(zip(words, range(3, len(words) + 3)))
    word2id["<pad>"] = 0
    word2id["<sos>"] = 1
    word2id["<eos>"] = 2
    inputs = []
    for sent in clean:
        standard_sent = [1] + [word2id[word] for word in sent] + [2]
        inputs.append(standard_sent)
    skf = StratifiedKFold(n_splits=5)
    # dtype=object because the encoded sentences have variable length
    inputs_array = np.array(inputs, dtype=object)
    labels_array = np.array(labels)
    # take the first of the five stratified splits (generators have no .next()
    # in Python 3; use the next() builtin)
    train_index, validation_index = next(skf.split(inputs_array, labels_array))
    np.random.shuffle(train_index)
    np.random.shuffle(validation_index)
    train_X, train_y = inputs_array[train_index], labels_array[train_index]
    valid_X, valid_y = inputs_array[validation_index], labels_array[validation_index]
    return word2id, train_X, train_y, valid_X, valid_y
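The reason build_corpus reaches for StratifiedKFold rather than a plain split is that each fold preserves the positive/negative ratio. A tiny self-contained check, independent of the corpus files above:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y_demo = np.array([1] * 80 + [0] * 20)   # 80/20 class balance
X_demo = np.zeros((100, 1))              # features are irrelevant here
tr, va = next(StratifiedKFold(n_splits=5).split(X_demo, y_demo))
print(y_demo[va].mean())                 # ~0.8, same ratio as the full set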
def _get_cv_splits(self, df):
    if self._cv_method is None:
        self._cv_method = StratifiedKFold(n_splits=self._kfolds, shuffle=self._shuffle)
    for train, test in self._cv_method.split(df, df[self._class_col]):
        # .iloc replaces the long-removed .ix indexer
        yield df.iloc[train], df.iloc[test]
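A sketch of driving _get_cv_splits, with a hypothetical host class standing in for whatever the surrounding code sets up (the attribute names are taken from the method body):

import pandas as pd
from sklearn.model_selection import StratifiedKFold

class Demo:
    _cv_method = None
    _kfolds = 3
    _shuffle = True
    _class_col = 'label'
    _get_cv_splits = _get_cv_splits  # reuse the module-level generator above

df = pd.DataFrame({'x': range(12), 'label': [0, 1] * 6})
for train_df, test_df in Demo()._get_cv_splits(df):
    print(len(train_df), len(test_df))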
def _create_stratified_split(csv_filepath, n_splits):
    """
    Create a stratified split for the classification task.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    n_splits : int
        Number of splits to make
    """
    from sklearn.model_selection import StratifiedKFold
    data = _load_csv(csv_filepath)
    labels = [el['symbol_id'] for el in data]
    # model_selection.StratifiedKFold takes n_splits and exposes .split();
    # the old cross_validation signature StratifiedKFold(labels, n_folds=...)
    # no longer exists
    skf = StratifiedKFold(n_splits=n_splits)
    i = 1
    kdirectory = 'classification-task'
    if not os.path.exists(kdirectory):
        os.makedirs(kdirectory)
    for train_index, test_index in skf.split(labels, labels):
        print("Create fold %i" % i)
        directory = "%s/fold-%i" % (kdirectory, i)
        if not os.path.exists(directory):
            os.makedirs(directory)
        else:
            print("Directory '%s' already exists. Please remove it." %
                  directory)
        i += 1
        train = [data[el] for el in train_index]
        test_ = [data[el] for el in test_index]
        for dataset, name in [(train, 'train'), (test_, 'test')]:
            # text mode with newline='' is the Python 3 way to write CSV
            with open("%s/%s.csv" % (directory, name), 'w', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
                for el in dataset:
                    csv_writer.writerow(("../../%s" % el['path'],
                                         el['symbol_id'],
                                         el['latex'],
                                         el['user_id']))
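Calling the fixed version is a one-liner; the CSV path is hypothetical and must contain the path/symbol_id/latex/user_id columns that _load_csv is assumed to return:

_create_stratified_split('labels.csv', n_splits=10)
# creates classification-task/fold-1 ... fold-10, each with train.csv/test.csv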
def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                           max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeatures with @clf and collect predictions for the test set.
    Assumes that @clf is a classifier.
    '''
    n = X.shape[0]
    print(clf)
    np.random.seed(seed)
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        # pick a random subset of the features for this epoch
        np.random.shuffle(feature_index)
        new_index = feature_index[:int(max_features * len(feature_index))]
        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:, new_index]
                X_te = X[ind_te].tocsc()[:, new_index]
            else:
                X_tr = X[ind_tr][:, new_index]
                X_te = X[ind_te][:, new_index]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            # out-of-fold predictions for train, accumulated predictions for test
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:, new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random_r.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random_r.pkl' % (name, epoch))
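A sketch of how make_mf_classification might be driven, with the module-level imports the snippet assumes spelled out; the random-forest base learner and the train/test split are illustrative:

import numpy as np
import pandas as pd
import scipy.sparse as ssp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification

X_all, y_all = make_classification(n_samples=300, n_features=20, random_state=1)
make_mf_classification(X_all[:200], y_all[:200], RandomForestClassifier(),
                       X_all[200:], n_folds=3, nb_epoch=1, name='rf', path='')
# writes X_mf_rf_0_random_r.pkl / X_t_mf_rf_0_random_r.pkl under path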
def make_mf_lsvc_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                                max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeatures with @clf and collect predictions for the test set.
    Assumes that @clf is a classifier.
    '''
    n = X.shape[0]
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te
            mf_te += clf.predict_proba(X_test).ravel()
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr.reshape(-1, 1), path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te.reshape(-1, 1), path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def make_mf_regression(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                       max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeatures with @clf and collect predictions for the test set.
    Assumes that @clf is a classifier.
    '''
    n = X.shape[0]
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te
            # predict the test set in chunks of 600000 rows to bound memory use
            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l * batch:l * (batch + 1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def tune_num_estimators(metric: str,
                        label: np.ndarray,
                        params: dict,
                        strat_folds: StratifiedKFold,
                        train) -> Tuple[int, float]:
    """
    Uses xgboost's cross-validation method to tune the number of estimators
    and returns it along with the best CV score achieved.

    :param metric:
        Evaluation metric that is monitored during cross-validation -
        e.g. 'logloss' or 'rmse'.
    :param label:
        An array-like containing the labels of the classification or
        regression problem.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross-validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :return:
        A tuple containing the tuned number of estimators along with the best
        CV score achieved.
    """
    eval_hist = xgb.cv(
        dtrain=xgb.DMatrix(train, label=label),
        early_stopping_rounds=50,
        folds=strat_folds,
        metrics=metric,
        num_boost_round=10000,
        params=params,
        verbose_eval=True
    )
    # with early stopping, the last row of the history corresponds to the
    # best boosting round
    num_trees = eval_hist.shape[0]
    best_score = eval_hist.values[num_trees - 1, 0]
    return num_trees, best_score
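A hedged driver for tune_num_estimators; the parameter dictionary is illustrative rather than tuned, and xgb.cv accepts a StratifiedKFold instance for its folds argument:

import numpy as np
import xgboost as xgb
from typing import Tuple
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

X_tr, y_tr = make_classification(n_samples=500, random_state=0)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
n_trees, best = tune_num_estimators(
    metric='logloss',
    label=y_tr,
    params={'objective': 'binary:logistic', 'max_depth': 3, 'eta': 0.1},
    strat_folds=folds,
    train=X_tr,
)
print(n_trees, best)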
def split_kfold(self, features, labels=None, n_folds=10):
    skf = StratifiedKFold(n_folds)
    for train_index, test_index in skf.split(features, labels):
        yield train_index, test_index
    # old API equivalent: cross_validation.StratifiedKFold(labels, n_folds)
def __init__(self, models, meta_model, cv=model_selection.StratifiedKFold(n_splits=3),
             use_base_features=True, use_proba=True):
    # note: the default cv object is created once at definition time and is
    # shared by every instance that does not pass its own splitter
    super().__init__(
        models=models,
        meta_model=meta_model,
        cv=cv,
        use_base_features=use_base_features,
        use_proba=use_proba,
    )
def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params={},
                  kfolds=None, pool=None, use_tqdm=True):
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(x, **kwargs):
            return x

    le = preprocessing.LabelEncoder().fit(y)
    y = np.array(le.transform(y))
    # np.int was removed from numpy; use the builtin int
    hy = np.zeros(len(y), dtype=int)
    if kfolds is None:
        kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                 random_state=seed).split(X, y)
    args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
    if pool is not None:
        if use_tqdm:
            res = [x for x in tqdm(pool.imap_unordered(cls.train_predict_pool, args),
                                   desc='Params', total=len(args))]
        else:
            res = [x for x in pool.imap_unordered(cls.train_predict_pool, args)]
    else:
        if use_tqdm:
            args = tqdm(args)
        res = [cls.train_predict_pool(x) for x in args]
    for ts, _hy in res:
        hy[ts] = _hy
    return le.inverse_transform(hy)
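predict_kfold is written as a classmethod: cls must provide a train_predict_pool callable that fits on the train indices and returns (test_indices, predictions). A minimal hypothetical host class, just to show that contract; the logistic-regression stand-in is not the original text model:

import numpy as np
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

class DummyModel:
    @staticmethod
    def train_predict_pool(arg):
        X, y, tr, ts, _params = arg
        m = LogisticRegression(max_iter=1000).fit(X[tr], y[tr])
        return ts, m.predict(X[ts])

    predict_kfold = classmethod(predict_kfold)  # reuse the function above

X_d, y_d = make_classification(n_samples=120, random_state=0)
print(DummyModel.predict_kfold(X_d, y_d, n_folds=3, use_tqdm=False)[:10])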
def __init__(self, X, y, score, n_folds, cls, seed=0, pool=None):
    self.n_folds = n_folds
    self.score = score
    self.X = X
    self.le = le = preprocessing.LabelEncoder().fit(y)
    self.y = np.array(le.transform(y))
    self.cls = cls
    self.pool = pool
    np.random.seed(seed)
    self.kfolds = [x for x in StratifiedKFold(n_splits=n_folds, shuffle=True,
                                              random_state=seed).split(np.zeros(self.y.shape[0]),
                                                                       self.y)]
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.03,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)  # built once; it does not change across folds
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            # .iloc replaces the long-removed .ix indexer
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
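The xgb1/xgb2/xgb3 functions above assume two project helpers that are not shown: now(), a timestamp printer, and pconvert(), a transform applied to predictions before averaging. A hedged harness with simple stand-ins (identity pconvert is an assumption; the real project may rank- or logit-transform):

import sys
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics, model_selection
from sklearn.datasets import make_classification

def now():
    return datetime.datetime.now().isoformat(timespec='seconds')

def pconvert(p):  # identity stand-in for the project's transform
    return p

X_np, y_np = make_classification(n_samples=400, random_state=0)
train2 = pd.DataFrame(X_np[:300]); test2 = pd.DataFrame(X_np[300:])
y = y_np[:300]
v = pd.DataFrame(index=train2.index)  # out-of-fold predictions per model
z = pd.DataFrame(index=test2.index)   # averaged test predictions per model
xgb1(train2, y, test2, v, z)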
def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=4,
        learning_rate=0.03,
        subsample=0.7,
        # colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=4,
        learning_rate=0.03,
        subsample=0.8,
        # colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.02,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)  # built once; it does not change across folds
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=4,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds