def represent(documents):
    """Build TF-IDF features and binarized labels for Reuters train/test splits.

    Args:
        documents: Iterable of Reuters document ids, where each id is
            prefixed with either "train" or "test".

    Returns:
        Tuple of (train TF-IDF matrix, train label matrix,
        test TF-IDF matrix, test label matrix).
    """
    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    # Tokenization: fit the vocabulary on training text only, then reuse
    # it to transform the held-out test text.
    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)
    # Transform multilabel category lists into binary indicator matrices.
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])
    return (vectorised_train_documents, train_labels,
            vectorised_test_documents, test_labels)
Python usage examples (source snippets) for the MultiLabelBinarizer() class
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulate all pieces of data needed to run an experiment.

    This is essentially a bag of items that makes it easy to serialize
    and deserialize everything as a single unit.

    Args:
        inputs: The raw model inputs. May be None if you do not want this
            value serialized when the dataset is saved.
        labels: The raw output labels.
        test_indices: Optional test indices. Ideally these are generated
            once and reused across experiments so results stay
            comparable; `generate_test_indices` can create them the
            first time.
        **kwargs: Additional key/value items stored as attributes.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)
    for attr_name, attr_value in kwargs.items():
        setattr(self, attr_name, attr_value)
    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices
    # Multi-label iff each label entry is itself a collection of labels.
    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    if self.is_multi_label:
        self.label_encoder = MultiLabelBinarizer()
    else:
        self.label_encoder = LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """Prepare IRMAS test-set evaluation state for one model.

    Test metadata format
    --------------------
    filename : string
    class_ids: string of ints with space as a delimiter
    """
    metadata = pd.read_csv(IRMAS_TESTING_META_PATH,
                           names=["filename", "class_ids"])
    self.X = list(metadata.filename)
    # Each class_ids cell is a space-delimited string of integer labels.
    targets = [[int(token) for token in row.split()]
               for row in metadata.class_ids]
    self.ml_binarizer = MultiLabelBinarizer().fit(targets)
    self.y_true = self.ml_binarizer.transform(targets)
    # Prediction accumulators, one row per test example.
    self.y_pred = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw_average = np.zeros(shape=self.y_true.shape)
    self.model_module = model_module
    self.weights_path = weights_path
    self.feature_filenames = os.listdir(
        os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME))
    self.dataset_mean = np.load(
        os.path.join(MODEL_MEANS_BASEPATH,
                     "{}_mean.npy".format(model_module.BASE_NAME)))
    self.evaluation_strategy = evaluation_strategy
    # Candidate decision thresholds for the two evaluation strategies.
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
def __init__(self, name='chord', sr=22050, hop_length=512, sparse=False):
    '''Initialize a chord task transformer.

    Parameters
    ----------
    name : str
        Name of the transformer.
    sr : number
        Audio sampling rate.
    hop_length : int
        Hop length (in samples) between analysis frames.
    sparse : bool
        If True, encode root/bass as sparse integer indices;
        otherwise as 13-way one-hot vectors.
    '''
    super(ChordTransformer, self).__init__(name=name,
                                           namespace='chord',
                                           sr=sr, hop_length=hop_length)
    # Binarize over the 12 pitch classes (0..11).
    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([list(range(12))])
    self._classes = set(self.encoder.classes_)
    self.sparse = sparse
    # FIX: np.bool / np.int were deprecated aliases for the builtins and
    # were removed in NumPy 1.24; use the builtin types directly.
    self.register('pitch', [None, 12], bool)
    if self.sparse:
        self.register('root', [None, 1], int)
        self.register('bass', [None, 1], int)
    else:
        self.register('root', [None, 13], bool)
        self.register('bass', [None, 13], bool)
def __init__(self, name, namespace, labels=None):
    """Initialize a static (whole-track) label transformer.

    Parameters
    ----------
    name : str
        Name of the transformer.
    namespace : str
        JAMS namespace to pull labels from.
    labels : list or None
        Explicit label vocabulary; defaults to all values defined
        for the namespace in the JAMS schema.
    """
    super(StaticLabelTransformer, self).__init__(name=name,
                                                 namespace=namespace,
                                                 sr=1, hop_length=1)
    if labels is None:
        labels = jams.schema.values(namespace)
    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([labels])
    self._classes = set(self.encoder.classes_)
    # FIX: np.bool was a deprecated alias for the builtin bool and was
    # removed in NumPy 1.24; use the builtin type directly.
    self.register('tags', [len(self._classes)], bool)
def __init__(self, multilabel=False):
    """Select a label encoder for the target type.

    Args:
        multilabel: When True, use a sparse MultiLabelBinarizer for
            multi-label targets; otherwise use a plain LabelEncoder.
    """
    self.multilabel = multilabel
    self.le = (MultiLabelBinarizer(sparse_output=True)
               if self.multilabel
               else LabelEncoder())
    # Tracks whether the encoder was later initialized from a fixed
    # class list rather than fitted on data.
    self.from_classes = False
def __init__(self, name, namespace, labels=None, sr=22050, hop_length=512):
    """Initialize a dynamic (time-varying) label transformer.

    Parameters
    ----------
    name : str
        Name of the transformer.
    namespace : str
        JAMS namespace to pull labels from.
    labels : list or None
        Explicit label vocabulary; defaults to all values defined
        for the namespace in the JAMS schema.
    sr : number
        Audio sampling rate.
    hop_length : int
        Hop length (in samples) between analysis frames.
    """
    super(DynamicLabelTransformer, self).__init__(name=name,
                                                  namespace=namespace,
                                                  sr=sr,
                                                  hop_length=hop_length)
    if labels is None:
        labels = jams.schema.values(namespace)
    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([labels])
    self._classes = set(self.encoder.classes_)
    # FIX: np.bool was a deprecated alias for the builtin bool and was
    # removed in NumPy 1.24; use the builtin type directly.
    self.register('tags', [None, len(self._classes)], bool)
Source file: custom_transformers.py
Project: pandas-pipelines-custom-transformers
Author: jem1031
Project source code
File source code
Views: 32
Bookmarks: 0
Likes: 0
Comments: 0
def fit(self, X, y=None):
    """Fit one MultiLabelBinarizer per column of X.

    Each cell of X is split on ``self.sep`` into a list of labels before
    fitting. ``y`` is ignored (sklearn transformer convention).
    Returns self.
    """
    split_frame = X.applymap(lambda cell: cell.split(self.sep))
    binarizers = []
    for column in X.columns:
        binarizers.append(MultiLabelBinarizer().fit(split_frame[column]))
    self.mlbs = binarizers
    return self
def train(X, y, outpath=None, verbose=True):
    """Train a one-vs-rest LinearSVC text classifier over TF-IDF features.

    Args:
        X: Raw training documents.
        y: Multi-label targets (collections of labels per document).
        outpath: Optional path; when given the fitted model is pickled there.
        verbose: When True, report where the model was written.

    Returns:
        The fitted Pipeline, with the fitted label binarizer attached
        as ``model.labels_``.
    """
    def build(X, y=None):
        """Inner build function that fits and returns a single pipeline."""
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('clf', OneVsRestClassifier(LinearSVC())),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Label encode the multi-label targets into a binary indicator matrix.
    labels = preprocessing.MultiLabelBinarizer()
    y = labels.fit_transform(y)
    model = build(X, y)
    # Keep the binarizer around so predictions can be inverse-transformed.
    model.labels_ = labels
    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        if verbose:
            print("Model written out to {}".format(outpath))
    return model
def test_BRKnna_no_labels_take_closest(self):
    """When thresholding yields no labels, mode 'a' takes the closest neighbour's."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'],
                   ['lid2', 'lid3'], ['lid0', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    print(prediction)
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], prediction)
def test_BRKnna_predict(self):
    """mode 'a' prediction with sparse targets matches the expected label row."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'],
                   ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
def test_BRKnna_predict_dense(self):
    """Same as test_BRKnna_predict but with a dense target matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'],
                   ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
def test_BRKnnb_predict(self):
    """mode 'b' prediction with sparse targets matches the expected label row."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid0', 'lid1'],
                   ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], prediction)
def test_BRKnnb_predict_dense(self):
    """Same as test_BRKnnb_predict but with a dense target matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid0', 'lid1'],
                   ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=False)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], prediction)
def test_BRKnnb_auto_optimize_k(self):
    """auto_optimize_k should select the best k from n_neighbor_candidates."""
    data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'],
                 ['lid2', 'lid3'], ['lid0', 'lid1']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3],
                                 auto_optimize_k=True)

    # Patch the train/validation split so the k search is deterministic.
    # noinspection PyUnusedLocal
    def fun(s, X, y_):
        return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]
    BRKNeighborsClassifier._get_split = fun
    knn.fit(data, y)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(3, knn.n_neighbors)
    pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)
# def test_time_brknnb(self):
#     times = []
#     X = sp.rand(10000, 5000, density=0.005, format='csr')
#     y = sp.rand(10000, 3000, density=0.005, format='csr')
#     knn = BRKNeighborsClassifier(n_neighbors=100)
#     knn.fit(X,y)
#     X_test = sp.rand(1000, 5000, density=0.005, format ='csr')
#     for _ in range(5):
#         start = default_timer()
#         knn.predict(X_test)
#         times.append(default_timer() - start)
#     print(np.mean(times))
def _fit(self, X, Y_labels, **kwargs):
    """Fit the label binarizer on filtered labels, then delegate to the parent fit.

    Raises:
        ValueError: If filtering leaves no labels to fit on.
    """
    filtered_labels = filter_labels(Y_labels,
                                    include=self.include,
                                    exclude=self.exclude)
    self.label_binarizer_ = MultiLabelBinarizer(sparse_output=False).fit(filtered_labels)
    logger.info('{} labels found in training instances.'.format(len(self.classes_)))
    if not len(self.classes_):
        raise ValueError('There are no labels available for fitting model.')
    return super(MultiLabelsClassifier, self)._fit(X, filtered_labels, **kwargs)
#end def