def describe_data(data, info=False, describe=False, value_counts=None, unique=None,
                  univariate_feature_selection=None, description=None):
    """Print selected diagnostics about a pandas DataFrame to stdout.

    Every section is optional and is emitted only when its argument is
    enabled. Nothing is returned; all output goes through ``print`` (and
    ``DataFrame.info``, which writes its own report).

    Args:
        data: The DataFrame to inspect.
        info: When true, print the ``data.info()`` summary.
        describe: When true, print ``data.describe()``.
        value_counts: Optional iterable of column labels; the value counts
            of each listed column are printed.
        unique: Optional iterable of column labels; the unique values of
            each listed column are printed.
        univariate_feature_selection: Optional indexable whose item ``0`` is
            the collection of predictor column labels and whose item ``1``
            is the target column label. Each predictor is scored as
            ``-log10(p-value)`` and printed in ascending score order.
        description: Optional heading printed before any other output.
    """
    # Data diagnostics
    if description is not None:
        print("\n" + description)

    # Info
    if info:
        print("\nInfo:")
        # DataFrame.info() writes the report itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        data.info()

    # Description
    if describe:
        print("\nDescribe:")
        print(data.describe())

    # Value counts
    if value_counts is not None:
        for feature in value_counts:
            # f-string so that non-string column labels (e.g. ints) work too.
            print(f"\nValue Counts [{feature}]")
            # Series.value_counts() replaces the deprecated pd.value_counts().
            print(data[feature].value_counts())

    # Unique values
    if unique is not None:
        for feature in unique:
            print(f"\nUnique [{feature}]")
            print(data[feature].unique())

    # Univariate feature selection
    if univariate_feature_selection is not None:
        # Extract predictors and target
        predictors = univariate_feature_selection[0]
        target = univariate_feature_selection[1]

        # Perform feature selection.
        # NOTE(review): SelectKBest / f_classif are resolved from module
        # scope (presumably sklearn.feature_selection) -- confirm the import.
        selector = SelectKBest(f_classif, k="all")
        selector.fit(data[predictors], data[target])

        # Transform the raw p-values into scores. A p-value of exactly 0
        # maps to +inf; silence only the divide-by-zero warning for that case.
        with np.errstate(divide="ignore"):
            scores = -np.log10(selector.pvalues_)

        print("\nUnivariate Feature Selection:")
        # Null scores are ranked as 0 so the sort key stays well ordered.
        ranked = sorted(
            zip(predictors, scores),
            key=lambda pair: pair[1] if pd.notnull(pair[1]) else 0,
        )
        for feature, imp in ranked:
            print(feature, imp)
评论列表
文章目录