def _get_child_predict(self, clf, X, index=None):
    if self.stack_by_proba and hasattr(clf, 'predict_proba'):
        if self.save_stage0 and index is not None:
            proba = util.saving_predict_proba(clf, X, index)
        else:
            proba = clf.predict_proba(X)
        # Drop the first column: probabilities sum to 1, so it is redundant.
        return proba[:, 1:]
    elif hasattr(clf, 'predict'):
        predict_result = clf.predict(X)
        if isinstance(clf, ClassifierMixin):
            # One-hot encode classifier outputs for the next stage.
            lb = LabelBinarizer()
            return lb.fit_transform(predict_result)
        else:
            return predict_result.reshape((predict_result.size, 1))
    else:
        return clf.fit_transform(X)
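A quick aside on the proba[:, 1:] slice above: class probabilities sum to 1, so one column is linearly redundant, and dropping the first column keeps the stacked feature matrix smaller without losing information. A minimal sketch with synthetic probabilities:

import numpy as np

# Synthetic 3-class probabilities; each row sums to 1.
proba = np.array([[0.2, 0.5, 0.3],
                  [0.7, 0.1, 0.2]])

# Dropping the first column loses no information: it can be
# recovered as 1 minus the sum of the remaining columns.
stacked = proba[:, 1:]
print(stacked.shape)              # (2, 2)
print(1.0 - stacked.sum(axis=1))  # recovers column 0: [0.2 0.7]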
Python LabelBinarizer() usage examples
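Before the individual project snippets, a minimal round-trip showing the core API (standard scikit-learn behavior):

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
Y = lb.fit_transform(['cat', 'dog', 'fish', 'dog'])
print(lb.classes_)              # ['cat' 'dog' 'fish']
print(Y)                        # one row per sample, one column per class
print(lb.inverse_transform(Y))  # ['cat' 'dog' 'fish' 'dog']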
def fit(self, X, y, check_input=True):
    self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    Y = self._label_binarizer.fit_transform(y)
    if self._label_binarizer.y_type_.startswith('multilabel'):
        # we don't (yet) support multi-label classification in ENet
        raise ValueError(
            "%s doesn't support multi-label classification" % (
                self.__class__.__name__))
    # Y = column_or_1d(Y, warn=True)
    super(ElasticNetClassifier, self).fit(X, Y)
    if self.classes_.shape[0] > 2:
        ndim = self.classes_.shape[0]
    else:
        ndim = 1
    self.coef_ = self.coef_.reshape(ndim, -1)
    return self
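As a hedged illustration of what the pos_label=1, neg_label=-1 binarizer above produces for a binary target (toy labels, not the project's data):

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(pos_label=1, neg_label=-1)
# Binary targets become a single +/-1 column, which suits a
# regression-style solver such as elastic net.
print(lb.fit_transform(['spam', 'ham', 'ham', 'spam']).ravel())
# -> [ 1 -1 -1  1]  (classes_ order: ['ham' 'spam'])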
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment. This is basically a bag of items that makes it
    easy to serialize and deserialize everything as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you don't want
            to serialize this value when you save the dataset.
        labels: The raw output labels.
        test_indices: The optional test indices to use. Ideally, these should be generated once and reused
            across experiments to make results comparable. `generate_test_indices` can be used to generate
            them the first time.
        **kwargs: Additional key-value items to store.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)
    for key, value in kwargs.items():
        setattr(self, key, value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    self.label_encoder = MultiLabelBinarizer() if self.is_multi_label else LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
cnn.py (project: Nature-Conservancy-Fish-Image-Prediction, author: Brok-Bucholtz)
def preprocess(image_shape, image_paths, labels=[]):
    features = []
    for image_path in tqdm(image_paths):
        image_data = list(Image.open(image_path).resize(image_shape[:2]).getdata())
        image_data = np.asarray(image_data).reshape(image_shape)
        features.append(image_data)

    # Normalize pixel values to [0, 1]
    features = np.asarray(features)
    features = features / 255.0

    if labels:
        # One-hot encode the labels
        label_binarizer = LabelBinarizer()
        labels = label_binarizer.fit_transform(labels)

        # Shuffle features and labels together
        features, labels = shuffle(features, labels)
    return features, labels
def get_one_hot(in_matrix):
    """
    Reformat truth matrix to the same size as the output of the dense network.

    Args:
        in_matrix: the categorized 1D matrix

    Returns:
        A one-hot matrix representing the categorized matrix.
    """
    if in_matrix.dtype.name == 'category':
        custom_array = in_matrix.cat.codes
    elif isinstance(in_matrix, np.ndarray):
        custom_array = in_matrix
    else:
        raise ValueError("Input matrix cannot be converted.")

    lb = LabelBinarizer()
    return np.array(lb.fit_transform(custom_array), dtype='float32')
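One caveat worth keeping in mind for helpers like get_one_hot: LabelBinarizer does not produce a two-column matrix for binary targets. A small sketch of the behavior:

from sklearn.preprocessing import LabelBinarizer
import numpy as np

lb = LabelBinarizer()
# Two classes collapse to a single indicator column ...
print(lb.fit_transform(np.array([0, 1, 1, 0])).shape)  # (4, 1)
# ... while three or more classes give a full one-hot matrix.
print(lb.fit_transform(np.array([0, 1, 2, 0])).shape)  # (4, 3)

If the downstream network expects two output units for two classes, that single column needs to be expanded by hand.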
def extract_data(path):
    global CLASSES
    images, labels = traverse_dir(path)
    images = np.array(images)

    # Convert labels to one-hot vectors
    one_hot = preprocessing.LabelBinarizer()
    one_hot.fit(labels)
    nb_classes = len(one_hot.classes_)
    with open(os.path.join(path, 'labels.txt'), 'w') as f:  # portable path join (needs `import os`)
        for label in one_hot.classes_:
            f.write(label + '\n')

    one_hots = one_hot.transform(labels)
    one_hots = np.reshape(np.asarray(one_hots), (images.shape[0], nb_classes))
    return images, one_hots, nb_classes
def encode_bond_features(self, bond_set):
    """
    We break out this function for encoding bond types because it is
    reused and occupies several lines.

    Parameters:
    ===========
    - bond_set: (set or list) of bonds.
    """
    bond_lb = LabelBinarizer()
    bond_lb.fit(BOND_TYPES)

    bonds = np.zeros(len(BOND_TYPES))
    if len(bond_set) > 0:
        # Sum the one-hot rows to get a count vector over bond types.
        bond_array = bond_lb.transform(list(bond_set))
        bonds = bonds + bond_array.sum(axis=0)
    return bonds
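To make the bag-of-bonds idea concrete, here is a sketch with a hypothetical BOND_TYPES vocabulary (the real list comes from the project's constants):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

BOND_TYPES = ['aromatic', 'double', 'single', 'triple']  # hypothetical vocabulary

bond_lb = LabelBinarizer()
bond_lb.fit(BOND_TYPES)

# Summing one-hot rows yields a count vector over bond types.
bonds = bond_lb.transform(['single', 'single', 'double']).sum(axis=0)
print(bonds)  # [0 1 2 0], in lb.classes_ (alphabetical) order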
def load_data_labels(data_file, labels_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    data = []
    labels = []
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
    data = [clean_str(s) for s in data]
    with open(labels_file, 'r') as f:
        labels.extend([s.strip() for s in f.readlines()])
    labels = [label.split(',')[1].strip() for label in labels]

    lb = LabelBinarizer()
    y = lb.fit_transform(labels)

    # max_document_length = max([len(x.split(" ")) for x in data])
    # print(max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(1000)
    x = np.array(list(vocab_processor.fit_transform(data)))
    return x, y, vocab_processor
def _check_X_y(self, X, y):
    # helpful error message for sklearn < 0.17
    is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2
    if is_2d or type_of_target(y) != 'binary':
        raise TypeError("Only binary targets supported. For training "
                        "multiclass or multilabel models, you may use the "
                        "OneVsRestClassifier or OneVsOneClassifier "
                        "meta-estimators in scikit-learn.")
    X, y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                     multi_output=False)

    self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
    y = self.label_binarizer_.fit_transform(y).ravel().astype(np.double)
    return X, y
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha
    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha
    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha
    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._update_complement_features(X, y_one_hot)
    self._update_features(X, y_one_hot)
    self.is_fitted = True
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha
    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    # self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha
    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_counts_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
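The lb.fit_transform(y) followed by np.sum(y_one_hot, axis=0) idiom shared by all of these _partial_fit variants computes per-class sample counts. A toy sketch (note it assumes three or more classes, since LabelBinarizer collapses binary targets to a single column):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y_one_hot = lb.fit_transform(['a', 'b', 'a', 'c', 'a'])
print(lb.classes_)                # ['a' 'b' 'c']
print(np.sum(y_one_hot, axis=0))  # [3 1 1] -> samples per class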
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# The decision tree here uses the ID3 algorithm, which selects
# features by information gain.
def to_one_hot(y):
    """Transform multi-class labels to binary labels.

    The output of to_one_hot is sometimes referred to as the
    1-of-K coding scheme.

    Parameters
    ----------
    y : numpy array or sparse matrix of shape (n_samples,) or
        (n_samples, n_classes). Target values. A 2-d matrix should only
        contain 0 and 1, and represents multilabel classification. Sparse
        matrices can be CSR, CSC, COO, DOK, or LIL.

    Returns
    -------
    Y : numpy array or CSR matrix of shape [n_samples, n_classes].
        Shape will be [n_samples, 1] for binary problems.
    classes_ : class vector extracted from y.
    """
    lb = LabelBinarizer()
    lb.fit(y)
    Y = lb.transform(y)
    return (Y.base, lb.classes_)
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
def load_csv(training_fn, features_fn):
    '''
    Transform a tabular data set into NumPy arrays.
    '''
    df = pd.read_csv(training_fn, sep='\t')
    features = json.load(open(features_fn))['features']
    data = df[features].to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
    print('Data:', data.shape)
    labels = df[['label']].to_numpy().reshape(-1)

    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    print('Labels:', labels.shape)
    return features, data, labels
def classify(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
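The chain.from_iterable trick above flattens per-sentence tag sequences into one token stream before binarizing; a small sketch:

from itertools import chain
from sklearn.preprocessing import LabelBinarizer

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]  # one list per sentence

lb = LabelBinarizer()
y_flat = lb.fit_transform(list(chain.from_iterable(y_true)))
print(lb.classes_)   # ['B-LOC' 'B-PER' 'I-PER' 'O']
print(y_flat.shape)  # (5, 4): five tokens, four tags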
def bio_classification_report(y_gold, y_pred):
    # y_gold: list of tag sequences, e.g. [[...], [...], [...]]
    # y_pred: same structure as y_gold
    lb = LabelBinarizer()
    y_gold_combined = lb.fit_transform(list(chain.from_iterable(y_gold)))
    # Use transform (not fit_transform) so predictions share the
    # class mapping learned from the gold labels.
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_gold_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             algorithm='l-bfgs', alpha=0.00001,
             batch_size=200, learning_rate="constant",
             learning_rate_init=0.5, power_t=0.5, max_iter=200,
             shuffle=False, random_state=None, tol=1e-5,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronClassifier, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation=activation, algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol,
                 beta=0, sparsity_param=0,
                 verbose=verbose, warm_start=warm_start)
    self.label_binarizer_ = LabelBinarizer()
def fit(self, y):
    """Fit simplex coding.

    Parameters
    ----------
    y : array, shape = [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d array represents the simplex coding for
        multilabel classification.

    Returns
    -------
    self : returns an instance of self.
    """
    if self.binarizer is None:
        self.binarizer_ = LabelBinarizer(neg_label=0, pos_label=1,
                                         sparse_output=True)
    self.binarizer_.fit(y)
    dimension = self.binarizer_.classes_.size
    if dimension > 2:
        self.simplex_operator_ = SimplexCoding.code(dimension)
    else:
        self.simplex_operator_ = ones((1, 1))
    return self
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)
    assert np.all(auc >= 0.96)
def __init__(self, n_hidden=20, alpha=0.5, rbf_width=1.0,
             activation_func='tanh', activation_args=None,
             user_components=None, regressor=None,
             binarizer=LabelBinarizer(-1, 1),
             random_state=None):
    super(ELMClassifier, self).__init__(n_hidden=n_hidden,
                                        alpha=alpha,
                                        random_state=random_state,
                                        activation_func=activation_func,
                                        activation_args=activation_args,
                                        user_components=user_components,
                                        rbf_width=rbf_width,
                                        regressor=regressor)
    self.classes_ = None
    self.binarizer = binarizer
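Note the positional arguments in the binarizer default above: LabelBinarizer's signature starts with (neg_label=0, pos_label=1, ...), so LabelBinarizer(-1, 1) maps the negative class to -1 and the positive class to +1. The keyword form is equivalent and easier to read:

from sklearn.preprocessing import LabelBinarizer

binarizer = LabelBinarizer(neg_label=-1, pos_label=1)
print(binarizer.fit_transform([0, 1, 1]).ravel())  # [-1  1  1]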
def fit(self, X, y):
    """
    :param X: shape = [n_samples, n_features]
    :param y: shape = [n_samples]
    :return: self
    """
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes = labelbin.classes_

    self.class_count = np.zeros(Y.shape[1], dtype=np.float64)
    self.feature_count = np.zeros((Y.shape[1], X.shape[1]),
                                  dtype=np.float64)
    # Accumulate per-class feature sums and per-class sample counts.
    self.feature_count += Y.T @ X
    self.class_count += Y.sum(axis=0)

    smoothed_fc = self.feature_count + self.alpha
    smoothed_cc = smoothed_fc.sum(axis=1)
    self.feature_log_prob = (np.log(smoothed_fc) -
                             np.log(smoothed_cc.reshape(-1, 1)))
    return self
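The Y.T @ X line performs the per-class feature accumulation in a single matrix product: row c of the result is the sum of the feature vectors of all samples in class c. A toy sketch (three classes, since binary targets would collapse to a single column):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

X = np.array([[1, 0],
              [2, 1],
              [0, 3],
              [1, 1]])
Y = LabelBinarizer().fit_transform(['a', 'a', 'b', 'c'])

print(Y.T @ X)
# [[3 1]   <- sum of the two 'a' rows
#  [0 3]   <- the single 'b' row
#  [1 1]]  <- the single 'c' row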
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def feature_mapping_to_numerical_values(self, df):
    TwoSigmaFinModTools._is_one_hot_encoder = 0
    mask = ~df.isnull()
    # Assume the training set contains every possible feature_var_name.
    # In practice a feature could appear only in the training set, but such
    # features cannot be used by the trained model anyway, so feature_var_names
    # missing from the test set are simply added as zero columns.
    if not any(tuple(df.columns == 'y')):
        # All one-hot encoded feature var names occurring in the test data
        # are assigned to the public variable df_test_all_feature_var_names.
        self.df_test_all_feature_var_names = df.columns

    _feature_names_num = np.zeros((TwoSigmaFinModTools._non_numerical_feature_names.shape[0],), dtype=object)
    ith = 0
    for feature_name in TwoSigmaFinModTools._non_numerical_feature_names:
        # Create a feature_nameNum list
        feature_name_num = ''.join([feature_name, 'Num'])
        _feature_names_num[ith] = feature_name_num
        ith += 1
        TwoSigmaFinModTools.encode_labels_in_numeric_format(df, feature_name)

        if TwoSigmaFinModTools._is_one_hot_encoder:
            is_with_label_binarizer = 0
            if is_with_label_binarizer:
                mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())], df_out=True)
                feature_var_values = mapper_df.fit_transform(df.copy())
                print(df[feature_name].isnull().sum().sum())
                print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                for ite in feature_var_values.columns:
                    df[ite] = feature_var_values[ite]
            else:
                TwoSigmaFinModTools.one_hot_encoder(df, feature_name)
    TwoSigmaFinModTools._feature_names_num = pd.Series(data=_feature_names_num, dtype=object)
def display_image_predictions(features, labels, predictions):
    n_classes = 10
    label_names = _load_label_names()
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    # Recover integer label ids from the one-hot encoded labels.
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axes = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    for image_i, (feature, label_id, pred_indices, pred_values) in enumerate(zip(features, label_ids, predictions.indices, predictions.values)):
        pred_names = [label_names[pred_i] for pred_i in pred_indices]
        correct_name = label_names[label_id]

        axes[image_i][0].imshow(feature * 255)
        axes[image_i][0].set_title(correct_name)
        axes[image_i][0].set_axis_off()

        axes[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axes[image_i][1].set_yticks(ind + margin)
        axes[image_i][1].set_yticklabels(pred_names[::-1])
        axes[image_i][1].set_xticks([0, 0.5, 1.0])
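The inverse_transform call this function relies on maps one-hot rows back to integer label ids; a minimal sketch under the same fit-on-range setup:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer()
label_binarizer.fit(range(10))

one_hot = np.zeros((2, 10))
one_hot[0, 3] = 1.0
one_hot[1, 7] = 1.0
print(label_binarizer.inverse_transform(one_hot))  # [3 7]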