def __do_one_hot_encodings(self):
    """One-hot encode the label-encoded cross features across all splits.

    Fits a single OneHotEncoder on the stacked train/test1/test2 frames so
    every split shares the same category space, then transforms each split
    separately and stores the results back into ``self.res_data_dict``.
    """
    df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    enc = OneHotEncoder(sparse=False)
    # The dict values are the engineered cross-feature column names.
    # (`list(d.values())` replaces the Python-2-only `iteritems()` loop.)
    to_be_encoded = list(self.__get_label_encode_dict().values())
    # Fit on all data sources stacked together so categories are consistent.
    to_be_stacked_df = pd.concat(
        [df_train[to_be_encoded], df_testset1[to_be_encoded], df_testset2[to_be_encoded]],
        axis=0)
    enc.fit(to_be_stacked_df)
    # Drop features whose one-hot expansion would be too wide.
    enc, to_be_encoded = self.__filter_too_big_onehot_encoding(
        enc, to_be_encoded, df_train, df_testset1, df_testset2)
    # Transform on separate data sources.
    self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = (
        self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv)
    self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = \
        self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
    self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = \
        self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
    return
Example source snippets for the Python OneHotEncoder() class
def vectors_from_txtfile(fname, codec, limit=-1, mutagen=None):
    """Encode each line of *fname* with *codec* and one-hot the result.

    Lines that cannot be encoded are skipped and tallied by reason.
    Gathering stops early once *limit* vectors are collected
    (``limit == -1`` means no limit, since ``len(vecs)`` never hits -1).

    :param fname: path to the text file to read.
    :param codec: encoder object exposing ``encode(line, mutagen=...)`` and
        an ``alphabet`` attribute (project type).
    :param limit: maximum number of vectors to gather, or -1 for all.
    :param mutagen: optional mutation hook forwarded to ``codec.encode``.
    :return: sparse one-hot matrix from sklearn's OneHotEncoder.
    """
    skipped = Counter()
    vecs = []
    # `with` guarantees the file is closed; the original left it open.
    with open(fname) as f:
        for line in f:
            line = line.strip()
            try:
                vecs.append(codec.encode(line, mutagen=mutagen))
                if len(vecs) == limit:
                    break
            except NonEncodableTextException as e:
                # Too long, or illegal characters
                skipped[e.reason] += 1
    logging.debug("Gathered {} vectors. Skipped {} ({})".format(
        len(vecs), sum(skipped.values()), dict(skipped)))
    vecs = np.asarray(vecs)
    # TODO: Why default to dtype=float? Seems wasteful? Maybe it doesn't really matter. Actually, docs here seem inconsistent? Constructor docs say default float. transform docs say int. Should file a bug on sklearn.
    return OneHotEncoder(len(codec.alphabet)).fit_transform(vecs)
# Adapted from sklearn.utils.extmath.softmax
def __init__(self, n_values, feature_indices):
    """Wrap sklearn's (legacy) OneHotEncoder for fixed categorical features.

    :param n_values: array-like of int, number of labels per categorical
        feature; every entry must be >= 3.
    :param feature_indices: array-like of int, column indices of the
        categorical features; must be non-empty and match ``n_values`` in shape.
    :raises ValueError: on empty/mismatched inputs or a feature with fewer
        than 3 labels. (ValueError is an Exception subclass, so callers
        catching the previous generic ``Exception`` still work.)
    """
    import warnings
    from sklearn.preprocessing import OneHotEncoder
    if not isinstance(n_values, np.ndarray):
        n_values = np.array(n_values)
    if not isinstance(feature_indices, np.ndarray):
        feature_indices = np.array(feature_indices)
    # Explicit raises instead of `assert`: asserts are stripped under -O.
    if feature_indices.size == 0:
        raise ValueError("feature_indices must be non-empty")
    if feature_indices.shape != n_values.shape:
        raise ValueError("feature_indices and n_values must have the same shape")
    for nv in n_values:
        if nv <= 2:
            raise ValueError("Categorical features must have 3+ labels")
    self.feature_indices = feature_indices
    self.n_values = n_values
    # Suppress whatever the legacy constructor emits here — presumably the
    # `n_values` deprecation warning in newer sklearn; confirm if upgrading.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.encoder = OneHotEncoder(n_values=n_values, sparse=False)
    self.columnlabels = None
    self.xform_start_indices = None
def test_boston_OHE_plus_trees(self):
    """Round-trip a OneHotEncoder + gradient-boosting pipeline through
    conversion and check predictions agree with scikit-learn."""
    dataset = load_boston()
    pipeline = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1)),
    ])
    pipeline.fit(dataset.data, dataset.target)

    # Convert the fitted pipeline.
    spec = convert(pipeline, dataset.feature_names, 'target')

    # Collect scikit-learn's own predictions for comparison.
    frame = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    frame['prediction'] = pipeline.predict(dataset.data)

    # The converted model must match scikit-learn almost exactly.
    result = evaluate_regressor(spec, frame, 'target', verbose=False)
    assert result["max_error"] < 0.0001
def test_boston_OHE(self):
    """Convert standalone OneHotEncoders over several categorical-column
    choices and verify the converted transformer is error-free."""
    dataset = load_boston()
    for cat_cols in [[3], [8], [3, 8], [8, 3]]:
        encoder = OneHotEncoder(categorical_features=cat_cols, sparse=False)
        encoder.fit(dataset.data, dataset.target)

        # Convert the fitted encoder.
        spec = sklearn.convert(encoder, dataset.feature_names, 'out').get_spec()

        inputs = [dict(zip(dataset.feature_names, row)) for row in dataset.data]
        expected = [{"out": row} for row in encoder.transform(dataset.data)]
        result = evaluate_transformer(spec, inputs, expected)
        assert result["num_errors"] == 0
# This test still isn't working
def test_boston_OHE_pipeline(self):
    """Check output-dimension handling by converting a OneHotEncoder
    wrapped in a pipeline with a Normalizer."""
    dataset = load_boston()
    for cat_cols in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        pipeline = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=cat_cols)),
            ("Normalizer", Normalizer()),
        ])
        pipeline.fit(dataset.data.copy(), dataset.target)

        # Convert the fitted pipeline.
        spec = sklearn.convert(pipeline, dataset.feature_names, 'out').get_spec()

        inputs = [dict(zip(dataset.feature_names, row)) for row in dataset.data]
        expected = [{"out": row} for row in pipeline.transform(dataset.data.copy())]
        result = evaluate_transformer(spec, inputs, expected)
        assert result["num_errors"] == 0
def __init__(self, X, y, multinomial, rounding=None):
    """Fit a logistic-regression target model, one-hot encoding every
    column whose minimum value is 0 (treated as categorical)."""
    self.input_features = X.columns.values
    values = X.values
    # Columns with a 0 minimum are assumed categorical — TODO confirm this
    # heuristic against the calling code.
    categorical_idx = [col for col in range(values.shape[1])
                       if min(values[:, col]) == 0]
    self.encoder = OneHotEncoder(categorical_features=categorical_idx,
                                 sparse=False)
    values = self.encoder.fit_transform(values)
    self.features = range(values.shape[1])
    self.rounding = rounding

    # Train a model on the whole dataset.
    self.model = LogisticRegression()
    self.model.fit(values, y)
    self.w = self.model.coef_
    self.intercept = self.model.intercept_
    self.multinomial = multinomial
    assert not (multinomial and len(self.get_classes()) == 2)
    RegressionExtractor.__init__(self)
def fit_transform(self, X, y=None, sample_weight=None):
    """Fit the embedding forest on a dataset derived from X, then return
    the leaf encoding of X.

    NOTE(review): when ``self.sparse_output`` is False the one-hot encoder
    is created but never fitted, and the raw leaf indices from
    ``self.apply(X)`` are returned un-encoded — confirm this asymmetry
    between the two return paths is intentional.

    :param X: array-like or sparse matrix of input samples.
    :param y: ignored; present for scikit-learn API compatibility.
    :param sample_weight: optional per-sample weights forwarded to ``fit``.
    :return: sparse one-hot leaf encoding if ``sparse_output`` is set,
        otherwise the leaf-index matrix from ``apply``.
    """
    X = check_array(X, accept_sparse=['csc'], ensure_2d=False)
    if sp.issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()
    # Presumably builds a labeled (real vs. synthetic) dataset so the
    # supervised forest can be fit without labels — see the helper.
    X_, y_ = generate_discriminative_dataset(X)
    super(RandomForestEmbedding, self).fit(X_, y_,
                                           sample_weight=sample_weight)
    self.one_hot_encoder_ = OneHotEncoder(sparse=True)
    if self.sparse_output:
        return self.one_hot_encoder_.fit_transform(self.apply(X))
    return self.apply(X)
def test_OneHotEncoder():
    '''
    Demonstrate OneHotEncoder: fit on a small matrix, inspect the fitted
    attributes, then transform a sample row.
    :return: None
    '''
    samples = [
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],
        [3, 3, 3, 3, 3, ],
        [1, 1, 1, 1, 1],
    ]
    print("before transform:", samples)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(samples)
    # Legacy (pre-0.22) fitted attributes of OneHotEncoder.
    print("active_features_:", encoder.active_features_)
    print("feature_indices_:", encoder.feature_indices_)
    print("n_values_:", encoder.n_values_)
    print("after transform:", encoder.transform([[1, 2, 3, 4, 5]]))
def get_one_hot_key():
    """Build a closure that one-hot encodes an (intersection letter,
    tollgate number) pair into a flat 6-element 0/1 list."""
    # Two categorical features with 3 labels each; fitting on [[0, 0]] is
    # enough because n_values fixes the category counts explicitly.
    encoder = OneHotEncoder(n_values=[3, 3], sparse=False)
    encoder.fit([[0, 0]])
    letter_to_code = dict(A=0, B=1, C=2)

    def one_hot_key(ix, **kargs):
        # ix[-2] is the intersection letter, ix[-1] the 1-based tollgate id.
        pair = [[letter_to_code[ix[-2]], ix[-1] - 1]]
        return encoder.transform(pair)[0].tolist()

    return one_hot_key
def one_hot(hypno, n_categories):
    """One-hot encode *hypno* into a dense int32 array with
    *n_categories* classes."""
    encoder = OneHotEncoder(n_values=n_categories)
    dense = encoder.fit_transform(hypno).toarray()
    return np.array(dense, 'int32')
Source file: two_sigma_financial_modelling.py
Project: PortfolioTimeSeriesAnalysis
Author: MizioAnd
(project source · file source · 30 views · 0 favorites · 0 likes · 0 comments)
def one_hot_encoder(df, estimated_var):
    """In place, add one indicator column per class of *estimated_var* to
    *df*, built from the pre-computed '<estimated_var>Num' label codes.

    Rows where *estimated_var* is null keep the original feature value in
    every new column; all other rows receive the 0/1 indicators.
    """
    ohe = OneHotEncoder()
    classes = df.copy()[estimated_var].factorize()[1]
    hot_columns = [''.join([estimated_var, '_', label]) for label in classes]
    valid = ~df[estimated_var].isnull()
    codes = np.array(df[''.join([estimated_var, 'Num'])][valid].values)
    hot_values = ohe.fit_transform(
        np.reshape(codes, (df[valid].shape[0], 1))).toarray().astype(int)
    # Seed every new column with the original feature values...
    for column in hot_columns:
        df[column] = df[estimated_var]
    # ...then overwrite the non-null rows with the one-hot indicators.
    df.loc[valid, tuple(hot_columns)] = hot_values
def gen_pclass(self, data):
    """One-hot encode the 'Pclass' column, mapping missing values to a
    dedicated "unknown" class (4).

    :param data: dict holding a 'data_df' DataFrame with a 'Pclass' column.
    :return: dict with key 'pclass' mapped to a dense one-hot ndarray.
    """
    from sklearn.preprocessing import OneHotEncoder
    data_pclass = data['data_df'][['Pclass']]
    # Set unknown as its own class. Reassign rather than `inplace=True` on a
    # sliced frame: the slice may be a copy, so an in-place fill can be
    # silently lost (pandas SettingWithCopyWarning).
    data_pclass = data_pclass.fillna(4)
    return {'pclass': OneHotEncoder(sparse=False)
            .fit_transform(data_pclass.values)}
def generate_train_random_batch(data, label, batch_size, is_train=True):
    """Sample a random batch (with replacement) from *data*.

    :param data: pandas DataFrame of samples.
    :param label: pandas DataFrame/Series of labels aligned with *data*
        (only used when ``is_train``).
    :param batch_size: number of indices drawn; duplicates are possible.
    :param is_train: when True, also return the matching label batch.
    :return: ``(data_batch, label_batch, indicator)`` when training, else
        ``(data_batch, indicator)``; *indicator* is a 0/1 vector of length
        ``len(data)`` marking which rows were sampled.
    """
    indices = np.random.randint(0, len(data), size=batch_size)
    indicator = np.zeros([len(data)])
    indicator[indices] = 1
    data_batch = data.iloc[indices]
    # `.values` replaces `as_matrix()`, which was deprecated and then
    # removed in pandas 1.0.
    if is_train:
        label_batch = label.iloc[indices]
        return data_batch.values, label_batch.values, indicator
    return data_batch.values, indicator
def one_hot_encoder(df, estimated_var):
    """Expand the '<estimated_var>Num' label codes into one one-hot column
    per class of *estimated_var*, mutating *df* in place. Null rows of
    *estimated_var* retain the original feature value."""
    encoder = OneHotEncoder()
    levels = df.copy()[estimated_var].factorize()[1]
    new_columns = [''.join([estimated_var, '_', level]) for level in levels]
    not_null = ~df[estimated_var].isnull()
    numeric = np.array(df[''.join([estimated_var, 'Num'])][not_null].values)
    indicators = encoder.fit_transform(
        np.reshape(numeric,
                   (df[not_null].shape[0], 1))).toarray().astype(int)
    # Create the new feature columns, then fill in the one-hot values for
    # every row where the source feature is present.
    for name in new_columns:
        df[name] = df[estimated_var]
    df.loc[not_null, tuple(new_columns)] = indicators
def test_conversion_bad_inputs(self):
    """Conversion must reject untrained models and unsupported classes."""
    # An untrained model cannot be converted.
    untrained = RandomForestClassifier()
    with self.assertRaises(Exception):
        spec = skl_converter.convert(untrained, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    wrong_class = OneHotEncoder()
    with self.assertRaises(Exception):
        spec = skl_converter.convert(wrong_class, 'data', 'out')
def test_conversion_bad_inputs(self):
    """Converting untrained or wrong-class models must raise."""
    from sklearn.preprocessing import OneHotEncoder
    # Untrained model -> conversion error.
    with self.assertRaises(Exception):
        spec = skl_converter.convert(RandomForestClassifier(), 'data', 'out')
    # Wrong model class -> conversion error.
    with self.assertRaises(Exception):
        spec = skl_converter.convert(OneHotEncoder(), 'data', 'out')
def test_conversion_bad_inputs(self):
    """Conversion raises TypeError for untrained or unsupported models."""
    # An untrained NuSVR cannot be converted.
    untrained = NuSVR()
    with self.assertRaises(TypeError):
        spec = scikit_converter.convert(untrained, 'data', 'out')
    # OneHotEncoder is not a supported model class here.
    unsupported = OneHotEncoder()
    with self.assertRaises(TypeError):
        spec = scikit_converter.convert(unsupported, 'data', 'out')
def test_conversion_bad_inputs(self):
    """libsvm conversion must reject a non-libsvm model with TypeError."""
    bad_model = OneHotEncoder()
    with self.assertRaises(TypeError):
        spec = libsvm.convert(bad_model, 'data', 'out')
def test_conversion_bad_inputs(self):
    """Converting untrained or unsupported models must fail."""
    # Untrained tree regressor -> conversion error.
    with self.assertRaises(Exception):
        spec = skl_converter.convert(DecisionTreeRegressor(), 'data', 'out')
    # Unsupported model class -> conversion error.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        spec = skl_converter.convert(OneHotEncoder(), 'data', 'out')
def test_conversion_bad_inputs(self):
    """Untrained and wrong-class models are rejected during conversion."""
    untrained = GradientBoostingRegressor()
    with self.assertRaises(Exception):
        spec = skl_converter.convert(untrained, 'data', 'out')
    not_a_regressor = OneHotEncoder()
    with self.assertRaises(Exception):
        spec = skl_converter.convert(not_a_regressor, 'data', 'out')
def test_conversion_bad_inputs(self):
    """The xgboost converter raises TypeError for bad model inputs."""
    # Untrained model -> TypeError.
    untrained = GradientBoostingRegressor()
    with self.assertRaises(TypeError):
        spec = xgb_converter.convert(untrained, 'data', 'out')
    # Wrong model class -> TypeError.
    wrong_class = OneHotEncoder()
    with self.assertRaises(TypeError):
        spec = xgb_converter.convert(wrong_class, 'data', 'out')
def test_conversion_bad_inputs(self):
    """Conversion raises TypeError for untrained or unsupported models."""
    # Untrained SVR -> TypeError.
    with self.assertRaises(TypeError):
        spec = sklearn_converter.convert(SVR(), 'data', 'out')
    # Unsupported model class -> TypeError.
    with self.assertRaises(TypeError):
        spec = sklearn_converter.convert(OneHotEncoder(), 'data', 'out')
def test_conversion_bad_inputs(self):
    """A model of the wrong class is rejected with TypeError."""
    unsupported = OneHotEncoder()
    with self.assertRaises(TypeError):
        spec = libsvm.convert(unsupported, 'data', 'out')
def test_conversion_one_column(self):
    """Convert a OneHotEncoder fitted on a single column and verify the
    converted transformer reproduces scikit-learn's output exactly."""
    # Fit a single OHE
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data)
    spec = sklearn.convert(scikit_model, 'single_feature', 'out').get_spec()

    test_data = [{'single_feature': row} for row in self.scikit_data]
    scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(metrics['num_errors'], 0)
def test_conversion_many_columns(self):
    """Convert a OneHotEncoder fitted on multiple columns and verify the
    converted transformer reproduces scikit-learn's output exactly."""
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)
    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

    test_data = [{'feature_1': row[0], 'feature_2': row[1]} for row in self.scikit_data_multiple_cols]
    scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(metrics['num_errors'], 0)
def test_conversion_one_column_of_several(self):
    """Convert a OneHotEncoder that only encodes one of several columns
    and verify the converted transformer matches scikit-learn."""
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

    test_data = [{'feature_1': row[0], 'feature_2': row[1]} for row in self.scikit_data_multiple_cols]
    scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(metrics['num_errors'], 0)
def test_conversion_bad_inputs(self):
    """
    Failure testing for bad conversion.
    """
    # An unsupported model class must be rejected with TypeError.
    unsupported = OneHotEncoder()
    with self.assertRaises(TypeError):
        spec = converter.convert(unsupported, 'data', 'out', 'regressor')
def test_conversion_bad_inputs(self):
    """Untrained SVC and wrong-class models raise TypeError on conversion."""
    from sklearn.preprocessing import OneHotEncoder
    # Untrained model -> TypeError.
    with self.assertRaises(TypeError):
        spec = scikit_converter.convert(SVC(), 'data', 'out')
    # Wrong model class -> TypeError.
    with self.assertRaises(TypeError):
        spec = scikit_converter.convert(OneHotEncoder(), 'data', 'out')
def test_conversion_bad_inputs(self):
    """Conversion raises TypeError for untrained or unsupported models."""
    # Untrained linear model -> TypeError.
    with self.assertRaises(TypeError):
        spec = convert(LinearRegression(), 'data', 'out')
    # Unsupported model class -> TypeError.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(TypeError):
        spec = convert(OneHotEncoder(), 'data', 'out')