def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    """Grow the labelled training set by self-training on bootstrap data.

    Featurizes the labelled set together with the unlabelled bootstrap set,
    trains a model on the labelled portion, and appends every bootstrap
    sample whose maximum predicted class probability exceeds ``th_bs``
    (with its predicted label) to the returned training set.

    Args:
        trainSet: labelled training samples.
        y_train: labels aligned with ``trainSet``.
        bootstrap_data: unlabelled candidate samples.
        es: experiment settings (reads ``weighInterAnnot`` and ``scaleData``).
        estimator: estimator passed through to ``train``.
        th_bs: confidence threshold in (0, 1) for accepting a prediction.

    Returns:
        Tuple ``(new_train_set, new_y_train)`` — originals plus the
        confidently-labelled bootstrap samples. Inputs are not mutated.
    """
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    # Featurize labelled + bootstrap samples together so data-driven
    # features are computed over a consistent vocabulary.
    trainAndBSData = trainSet + bootstrap_data
    generateDataDrivenFeats(trainSet, trainAndBSData, es)
    featurized = featurize(trainAndBSData)

    # First len(trainSet) entries are the labelled set, the rest bootstrap.
    train_feats = featurized[:len(trainSet)]
    test_feats = featurized[len(trainSet):]

    # Feature selection on train data only (no leakage from bootstrap set).
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(
        train_feats, y_train, list(range(len(trainSet))), es)

    # Inter-annotator-agreement weighting for the retained samples.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        # Fit the scaler on train data only; apply the same transform to test.
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)
    y_pred_prob = model.predict_proba(x_test)

    # BUG FIX: predict_proba columns are ordered by model.classes_, which
    # equals the raw column index only when labels happen to be 0..k-1.
    # Map the argmax column back to the actual class label when possible;
    # fall back to the old behaviour if the model exposes no classes_.
    classes = getattr(model, "classes_", None)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            best = int(np.argmax(cur_y))
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(classes[best] if classes is not None else best)

    return (new_train_set, new_y_train)  # TODO: also return confidence vector
# NOTE(review): removed scraped web-page residue that was pasted into this
# file ("评论列表" = comment list, "文章目录" = article table of contents) —
# it was bare non-Python text and made the module unparseable.