def summarize(self, x, summary_func, missing_data_cond, in_place=False):
    """ Substitutes missing values with a statistical summary of each
    feature column.

    Parameters
    ----------
    x : numpy.array
        Two-dimensional array, indexed as ``x[:, col]``; each column is
        treated as one feature. No type conversion is performed here.
    summary_func : function
        Summarization function used for imputation (mean, median, mode,
        max, min...). It is called with the non-missing entries of one
        column and its result is written into the missing entries.
    missing_data_cond : function
        Function that takes one whole column (1-D array) and returns a
        boolean mask of the same length that is True where the entry
        represents missing data (e.g. ``numpy.isnan``).
    in_place : bool, optional
        If True, ``x`` itself is modified and returned; otherwise a copy
        is filled and ``x`` is left untouched. Defaults to False.

    Returns
    -------
    numpy.array
        The array with missing entries replaced: ``x`` itself when
        ``in_place`` is True, a copy otherwise.

    Notes
    -----
    If every entry of a column is flagged as missing, ``summary_func``
    is called with an empty array; the outcome then depends on that
    function.
    """
    data = x if in_place else np.copy(x)

    # ``range`` instead of ``xrange`` so the loop runs on Python 2 and 3.
    for col in range(x.shape[1]):
        column = x[:, col]
        # Force a boolean mask: ``~`` and mask indexing below would silently
        # turn into bitwise-not / fancy indexing on an integer mask.
        missing = np.asarray(missing_data_cond(column), dtype=bool)
        if not missing.any():
            continue
        # Summarize only the observed entries, then fill the gaps with it.
        data[missing, col] = summary_func(column[~missing])
    return data
# Comment list
# Article contents