@classmethod
def setUpClass(cls):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    scikit_data = load_boston()
    scikit_model = Imputer(strategy='most_frequent', axis=0)
    scikit_data['data'][1, 8] = np.NaN
    input_data = scikit_data['data'][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data['target'])
    # Save the data and the model for the test methods
    cls.scikit_data = scikit_data
    cls.scikit_model = scikit_model
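A minimal sketch of how a test method might consume this fixture, assuming the same converter.convert helper used in test_conversion_boston further down; the test name and assertion are illustrative, not part of the original suite.

def test_conversion(self):
    # Hypothetical check: converting the fitted Imputer should yield a spec.
    spec = converter.convert(self.scikit_model, 'data', 'out')
    self.assertIsNotNone(spec)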
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])
    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo
    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)
    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)
    return train, y, test
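A hedged usage sketch for gen_features, assuming `from sklearn import preprocessing` at module level; the toy cardio-style frames below use the column names the function reads, but the values are made up.

import pandas as pd

train = pd.DataFrame({
    'active': [1, 0], 'alco': [0, 1], 'smoke': [0, 0],
    'ap_hi': [120, 140], 'ap_lo': [80, 90],
    'height': [170, 165], 'weight': [70.0, 80.0],
})
test = train.copy()
y = [0, 1]

# Returns imputed numpy arrays plus the untouched target.
X_train, y, X_test = gen_features(train, y, test)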
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values
    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))
    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, axis=1, inplace=True)
    return Df
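A hedged example of calling FeatureCombination, assuming PCA (sklearn.decomposition) and preprocessing are imported at module level; the frame and prefix are invented for illustration.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 4), columns=['f_a', 'f_b', 'f_c', 'other'])
df = FeatureCombination(df, s='f_', num_feature=2)
# The three 'f_*' columns are replaced by two PCA components 'f__1' and 'f__2'.
print(df.columns.tolist())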
def fit(self, scenario: ASlibScenario, config: Configuration):
    '''
    Fit the imputer to ASlib scenario data.

    Arguments
    ---------
    scenario: data.aslib_scenario.ASlibScenario
        ASlib scenario with all data in pandas
    config: ConfigSpace.Configuration
        configuration
    '''
    self.imputer = Imputer(strategy=config.get("imputer_strategy"))
    self.imputer.fit(scenario.feature_data.values)
    self.active = True
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
    self.max_iter = max_iter
    self.initial_strategy = initial_strategy
    self.initial_imputer = Imputer(strategy=initial_strategy)
    self.tol = tol
    self.f_model = f_model
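The constructor above only stores the settings; below is a minimal sketch (not the project's code) of the iterative, model-based imputation it implies, assuming a hypothetical helper name and a RandomForestRegressor standing in for f_model="RandomForest".

import numpy as np
from sklearn.ensemble import RandomForestRegressor

def _iterative_impute_sketch(X, max_iter=10, tol=1e-3):
    # Start from mean imputation, then repeatedly re-predict each
    # originally-missing column from the other, currently filled columns.
    X = np.asarray(X, dtype=float)
    missing = np.isnan(X)
    filled = np.where(missing, np.nanmean(X, axis=0), X)
    for _ in range(max_iter):
        prev = filled.copy()
        for j in np.where(missing.any(axis=0))[0]:
            rows = missing[:, j]
            model = RandomForestRegressor(n_estimators=10)
            model.fit(np.delete(filled[~rows], j, axis=1), filled[~rows, j])
            filled[rows, j] = model.predict(np.delete(filled[rows], j, axis=1))
        if np.linalg.norm(filled - prev) < tol:  # converged
            break
    return filled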
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]
    # one-hot encoding left out because it doesn't work in the pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)
    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    print("Random Forest score: %f" % score)
    print("confusion_matrix :\n%s" % cm)
    return clf
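A hedged call example; `df`, `feature_cols`, and 'label' are hypothetical names for a frame holding the feature columns plus a target column, with NaNs allowed since the pipeline imputes them.

clf = build_model_random_forest(df, features=feature_cols,
                                categorical_features=[], target='label')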
def make_predictions_random_forest(df, features, target, split=0.70):
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    # print("unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features]))
    # print("columns: ", '\n\t\t'.join(df.columns))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]
    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60, max_depth=None, criterion='gini'))])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print(classification_report(test[target], predicted))
    return score, cm
# Utility function to report best scores
def preprocess_data(X_train, X_test):
    """ Impute missing values. """
    # Impute using the mean of every column for now. However,
    # I would've liked to impute 'F5' using mode instead.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_xform = imp.fit_transform(X_train)
    X_train = pd.DataFrame(train_xform, columns=X_train.columns)
    test_xform = imp.transform(X_test)
    X_test = pd.DataFrame(test_xform, columns=X_test.columns)
    return X_train, X_test
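A small usage sketch with made-up frames; the Imputer is fitted on X_train only, so the same training-set means are reused for the test split.

import numpy as np
import pandas as pd

X_train = pd.DataFrame({'F1': [1.0, np.nan, 3.0], 'F5': [np.nan, 2.0, 2.0]})
X_test = pd.DataFrame({'F1': [np.nan], 'F5': [5.0]})
X_train, X_test = preprocess_data(X_train, X_test)  # NaNs filled with train means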
Source file: xgb_classification.py (project: jingjuSingingPhraseMatching, author: ronggong)
def imputerLabelEncoder_train(X, y):
    imputer = preprocessing.Imputer()
    X = imputer.fit_transform(X)
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    return X, y, imputer, le
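A hypothetical test-time counterpart (not in the original file) that applies the transformers returned above without refitting them.

def imputerLabelEncoder_test(X, y, imputer, le):
    X = imputer.transform(X)   # reuse the statistics fitted on training data
    y = le.transform(y)        # map labels with the already-fitted encoder
    return X, y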
def build_classifier(base_clf=svm.SVC()):
    # The imputer is for "use_taxonomy" and shouldn't have an effect if it's False.
    # TODO: should also try other imputer strategies
    return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'),
                                  preprocessing.StandardScaler(),
                                  base_clf)
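A hedged usage sketch; X_train, y_train, and X_test are hypothetical arrays, and any NaNs in them are handled by the most_frequent imputer at the front of the pipeline.

clf = build_classifier()      # default: impute -> scale -> svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)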
# noinspection PyPep8Naming
Source file: custom_transformers.py (project: pandas-pipelines-custom-transformers, author: jem1031)
def fit(self, X, y=None):
    self.imp = Imputer(strategy=self.strategy)
    self.imp.fit(X)
    self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
    return self
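A plausible companion transform method, sketched here as an assumption about the rest of this custom transformer; the real project's implementation may differ.

def transform(self, X):
    # Hypothetical: reuse self.imp fitted above and keep the pandas labels.
    Ximp = self.imp.transform(X)
    return pd.DataFrame(Ximp, index=X.index, columns=X.columns)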
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = Imputer()
        spec = converter.convert(model, 'data', 'out')
    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        spec = converter.convert(model, 'data', 'out')
def test_conversion_boston(self):
    from sklearn.datasets import load_boston
    scikit_data = load_boston()
    sh = scikit_data.data.shape
    rn.seed(0)
    missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                             for k in range(sh[0])]
    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, 'NaN', -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value
            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)
            tr_X = model.transform(X.copy())
            spec = converter.convert(model, scikit_data.feature_names, 'out')
            input_data = [dict(zip(scikit_data.feature_names, row))
                          for row in X]
            output_data = [{"out": row} for row in tr_X]
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def __init__(self, strategy_categorical="most_frequent", strategy_numerical="median", categorical=None):
    """
    An Imputer that can apply a different strategy for both categorical data and numerical data.

    :param strategy_categorical: "mean", "median" or "most_frequent"
    :param strategy_numerical: "mean", "median" or "most_frequent"
    :param categorical: A boolean mask for the categorical columns of a dataset
    """
    if categorical is None:
        categorical = []
    self.strategy_categorical = strategy_categorical
    self.strategy_numerical = strategy_numerical
    self.cat_imputer = Imputer(strategy=strategy_categorical)
    self.num_imputer = Imputer(strategy=strategy_numerical)
    self.categorical = categorical
    self._update_indices()
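_update_indices is defined elsewhere in the project; here is a minimal sketch of what it plausibly does, assuming the boolean-mask convention described in the docstring (hypothetical implementation, not the original code).

def _update_indices(self):
    # Translate the boolean mask into positional column indices for the
    # categorical and numerical sub-imputers.
    self.cat_index = [i for i, is_cat in enumerate(self.categorical) if is_cat]
    self.num_index = [i for i, is_cat in enumerate(self.categorical) if not is_cat]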
def remove_nan(x):
    """remove NaN values from data vectors"""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x_clean = imp.fit_transform(x)
    return x_clean
def mean_shift(location, location_callback, bandwidth=None):
    """Returns one or more clusters of a set of points, using a mean shift
    algorithm.

    The result is sorted with the first value being the largest cluster.

    Kwargs:
        bandwidth (float): If bandwidth is None, a value is detected
            automatically from the input using estimate_bandwidth.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple).
    """
    pts = location._tuple_points()
    if not pts:
        return None
    X = np.array(pts).reshape((len(pts), len(pts[0])))
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)
    if not bandwidth:
        bandwidth = estimate_bandwidth(X, quantile=0.3)
    ms = MeanShift(bandwidth=bandwidth or None, bin_seeding=False).fit(X)
    clusters = []
    for cluster_id, cluster_centre in enumerate(ms.cluster_centers_):
        locations = []
        for j, label in enumerate(ms.labels_):
            if not label == cluster_id:
                continue
            locations.append(location.locations[j])
        if not locations:
            continue
        clusters.append(cluster_named_tuple()(label=cluster_id,
                                              centroid=Point(cluster_centre),
                                              location=location_callback(locations)))
    return clusters
def GetFeatures(frame):
    # convert data to float
    arr = np.array(frame, dtype=np.float)
    # fill missing values
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    # normalize the entire data
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr
#=================================================
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')
    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    # print(mat.shape)
    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)
    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()
    mat = scaler.fit_transform(mat)
    # print(mat.shape)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
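A short usage sketch with a made-up frame, assuming Imputer and the scalers are imported from sklearn.preprocessing at module level.

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [0.5, 0.25, np.nan]})
scaled = impute_and_scale(df, scaling='minmax')  # NaN -> column mean, then [0, 1]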
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')
    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)
    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns=dataframe.columns)
        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
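A hedged usage example; the pipeline dict mirrors the keys the function reads, and raw_df is a hypothetical DataFrame with missing values.

pipeline = {'options': {'type': 'fillmissing', 'strategy': 'mean'}}
clean_df = data_handlemissing(raw_df, pipeline)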
def imputer_transform(data):
    imputer = Imputer()
    imputer.fit(data)
    return imputer.transform(data)
def imputator(features):
    """Fill in missing values with the mean of the remaining samples.

    Keyword arguments:
    features -- feature matrix
    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(features)
    return imp.transform(features)
def impute_data(self, x):
    """Imputes a data set containing NaN values."""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    return imp.fit_transform(x)