import pandas as pd
from sklearn.preprocessing import LabelEncoder

def lbl_encode(df_tr, df_te=None, cols=None, objonly=True):
    """Label-encode columns of df_tr (and optionally df_te) in place."""
    print("label encode ...")
    lbl = LabelEncoder()
    if df_te is not None:
        # Fit on train and test together so both splits share one label space.
        df = pd.concat([df_tr, df_te])  # DataFrame.append was removed in pandas 2.0
        if cols is None:
            cols = set(df_tr.columns.values).intersection(set(df_te.columns.values))
    else:
        df = df_tr
        if cols is None:
            cols = df_tr.columns.values
    encoded = []
    for col in cols:
        if objonly and df[col].dtype != 'object':
            continue
        encoded.append(col)
        lbl.fit(df[col].map(str))
        df_tr[col] = lbl.transform(df_tr[col].map(str))
        if df_te is not None:
            df_te[col] = lbl.transform(df_te[col].map(str))
    print('lbl encode:', encoded)
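# A quick usage sketch of lbl_encode on assumed toy frames (not part of the
# original source): shared object columns get one consistent integer mapping.
train = pd.DataFrame({'city': ['tokyo', 'paris'], 'n': [1, 2]})
test = pd.DataFrame({'city': ['paris', 'oslo'], 'n': [3, 4]})
lbl_encode(train, test)
print(train['city'].tolist(), test['city'].tolist())  # [2, 1] [1, 0] (classes sorted: oslo, paris, tokyo)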
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    training_data = data[:500].copy()
    testing_data = data[500:].copy()
    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)
    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
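# The hand-cleaning above reuses one fitted encoder across splits, which is what
# keeps the integer codes consistent. A compact illustration on assumed toy data:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
train_codes = enc.fit_transform(['apples', 'bananas', 'oranges'])
test_codes = enc.transform(['oranges', 'apples'])
print(train_codes)  # [0 1 2] -- classes_ are sorted alphabetically
print(test_codes)   # [2 0] -- same mapping as the training split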
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan
    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)
    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan
    hand_cleaned_adult_data = adult_data.copy()
    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)
    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)
    cleaned_adult_data = autoclean(adult_data)
    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
def initialize_labels(self, Y):
    y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
    y_links_flat = [y_val for y in Y for y_val in y.links]
    self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
    self.link_encoder_ = LabelEncoder().fit(y_links_flat)
    self.n_prop_states = len(self.prop_encoder_.classes_)
    self.n_link_states = len(self.link_encoder_.classes_)
    self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                 dtype=np.double)
    # compute_class_weight takes keyword-only arguments in recent scikit-learn.
    self.link_cw_ = compute_class_weight(self.class_weight,
                                         classes=self.link_encoder_.classes_,
                                         y=y_links_flat)
    self.link_cw_ /= self.link_cw_.min()
    logging.info('Setting node class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.prop_encoder_.classes_, self.prop_cw_))))
    logging.info('Setting link class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.link_encoder_.classes_, self.link_cw_))))
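# A minimal sketch of the class-weight computation used above, on assumed toy
# labels: 'balanced' weights are inversely proportional to class frequency,
# which is why initialize_labels then normalises by the minimum.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_links_toy = np.array(['link', 'no-link', 'no-link', 'no-link'])
cw = compute_class_weight(class_weight='balanced',
                          classes=np.unique(y_links_toy), y=y_links_toy)
print(cw)       # [2.         0.66666667]
cw /= cw.min()  # scale so the smallest weight becomes 1.0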
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(
            sources[0].stream_id, time_interval))
        return
    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)
    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_
    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp
                                  for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
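# The __init__ above pairs a DictVectorizer (dict features -> numeric matrix)
# with a LabelEncoder (string labels -> integers). A minimal, self-contained
# sketch of that pattern (hypothetical wrapper, not the original class):
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

class DictClassifier:
    def __init__(self, estimator, dtype=float, sparse=True):
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def fit(self, featuresets, labels):
        X = self._vectorizer.fit_transform(featuresets)  # dicts -> feature matrix
        y = self._encoder.fit_transform(labels)          # strings -> integer codes
        self._clf.fit(X, y)
        return self

    def predict(self, featuresets):
        X = self._vectorizer.transform(featuresets)
        return self._encoder.inverse_transform(self._clf.predict(X))

feats = [{'word': 'good'}, {'word': 'bad'}, {'word': 'great'}]
print(DictClassifier(LogisticRegression()).fit(feats, ['pos', 'neg', 'pos'])
      .predict([{'word': 'good'}]))  # ['pos']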
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
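# The assertion above works because inverse_transform maps predictions back to
# the original string labels. A standalone round-trip on assumed toy labels:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(['POS', 'NEU', 'NEG', 'POS'])
print(y)                        # [2 1 0 2] -- classes_ are sorted: NEG, NEU, POS
print(le.inverse_transform(y))  # ['POS' 'NEU' 'NEG' 'POS']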
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='r2', classifier=RegressorWrapper, random_state=None):
    # check both bounds, as the message promises
    assert 0 < ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s + s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='macrof1', classifier=ClassifierWrapper, random_state=None):
    # check both bounds, as the message promises
    assert 0 < ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s + s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def score_model(model, data_test, labeler):
    '''
    Print an evaluation report for a trained model on the test data:
    feature importances, a classification report and a cross-class report.
    Arguments:
        model - trained model
        data_test - test data set
        labeler - LabelEncoder fitted on the target column
    Returns:
        nothing; the report is printed
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)
    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)
    print(feature_importances_report(model, X_test.columns))
    print("\n", classification_report(true_labels, predicted_labels))
    print(cross_class_report(true_labels, predicted_labels))
def doDescartes(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = pd.concat([X_train, X_test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    del X_train, X_test
    gc.collect()
    # Cartesian-product features: jointly label-encode each pair of columns.
    for feat_1 in ['maybe_0', 'maybe_2']:
        for feat_2 in ['connectionType', 'creativeID', 'positionID']:
            le = LabelEncoder()
            data[feat_1 + '_' + feat_2] = le.fit_transform(
                data[feat_1].astype('str') + data[feat_2].astype('str'))
    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :].copy()  # copy to avoid SettingWithCopyWarning below
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
def preprocessData(dataset):
    le = preprocessing.LabelEncoder()
    # guard against divide-by-zero
    dataset.loc[dataset.Open == 0, 'Open'] = 1
    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset['UpDown'] = np.where(dataset['UpDown'] >= threshold, 'Up', 'Down')
    dataset['UpDown'] = le.fit(dataset['UpDown']).transform(dataset['UpDown'])
    dataset['UpDown'] = dataset['UpDown'].shift(-1)  # shift by 1, so y is the next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop the last row: it has no next-day value
    return dataset
def generate_test_data():
    with open('./test.csv', 'r') as test_file:
        test_csv = csv.reader(test_file, delimiter=',')
        next(test_csv)
        test_data = list(test_csv)
    test_data = numpy.array(test_data)
    # delete id column
    # test_data = numpy.delete(test_data, 0, 1)
    # integer-encode categorical columns (LabelEncoder gives ordinal codes, not one-of-K)
    encoder = preprocessing.LabelEncoder()
    for j in (1, 2, 3, 4, 5, 6, 7, 8, 9, 14):
        test_data[:, j + 1] = encoder.fit_transform(test_data[:, j + 1])
    # convert numpy strings to floats (numpy.float was removed in NumPy 1.24)
    test_data = test_data.astype(float)
    missValueIndex = 7
    Xy_test = test_data[test_data[:, 3 + 1] == missValueIndex]
    Xy_train = test_data[test_data[:, 3 + 1] != missValueIndex]
    X_train = numpy.delete(Xy_train, 3 + 1, 1)
    y_train = Xy_train[:, 3 + 1]
    X_test = numpy.delete(Xy_test, 3 + 1, 1)
    market_test_data = MarketingData(X_train, y_train, X_test)
    return market_test_data, test_data

# use knn to impute missing values
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)
    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))
    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled to between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)
    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))
    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']  # 'is not' compared identity, not equality
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(),
                    filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
def _fit_targets(self, y, classes=None):
    self.multilabel_ = self._is_multilabel(y)
    # If provided, use classes to fit the encoder and set classes_.
    # Otherwise, find the unique classes in y.
    if classes is not None:
        y = classes
    if self.multilabel_:
        self._enc = None
        self.classes_ = np.arange(y.shape[1])
        self.n_classes_ = y.shape[1]
    else:
        self._enc = LabelEncoder().fit(y)
        self.classes_ = self._enc.classes_
        self.n_classes_ = len(self.classes_)
def label_encoding(self, dataset):
    """
    Integer-encode the categorical columns of dataset in place.
    :param dataset: numpy array whose columns 1, 2 and 3 hold categorical values
    :return: dataset with columns 1-3 label-encoded
    """
    le_1 = preprocessing.LabelEncoder()
    le_2 = preprocessing.LabelEncoder()
    le_3 = preprocessing.LabelEncoder()
    le_1.fit(np.unique(dataset[:, 1]))
    le_2.fit(np.unique(dataset[:, 2]))
    le_3.fit(np.unique(dataset[:, 3]))
    dataset[:, 1] = le_1.transform(dataset[:, 1])
    dataset[:, 2] = le_2.transform(dataset[:, 2])
    dataset[:, 3] = le_3.transform(dataset[:, 3])
    return dataset
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # fit on train + test together so both splits share one label space
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])
    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo
    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)
    # preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)
    return train, y, test
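# Why gen_features fits the encoder on train + test combined: a LabelEncoder
# fitted only on train raises on labels it has never seen. Toy illustration
# with assumed values:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(['no', 'yes'])
print(le.transform(['yes', 'no']))  # [1 0]
try:
    le.transform(['maybe'])         # unseen label
except ValueError as e:
    print(e)                        # mentions the previously unseen label 'maybe'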