def describe_1d(data, **kwargs):
    """Compute summary statistics for a single pandas Series.

    Statistics common to every variable type (counts, missing / infinite /
    unique proportions, mode, memory size) are computed here.  The
    type-specific statistics are then appended from whichever
    ``describe_*_1d`` helper matches the value of ``get_vartype(data)``.

    Note:
        Infinite values in ``data`` are replaced with NaN *in place*, so the
        caller's Series is modified by this call.

    Args:
        data: The Series to describe.
        **kwargs: Forwarded to the boolean, numeric, date and unique
            describers.  The constant and categorical describers are called
            without them.

    Returns:
        A Series holding the common statistics followed by the
        type-specific ones.
    """
    leng = len(data)  # number of observations in the Series
    count = data.count()  # number of non-NaN observations (infinities included)

    # Replace infinite values with NaNs to avoid issues with histograms
    # later.  ``-np.inf`` is used instead of the ``np.NINF`` / ``np.PINF``
    # aliases, which no longer exist in NumPy 2.0.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
    n_infinite = count - data.count()  # number of infinite observations

    # NaN counts as its own distinct value here (dropna=False).
    distinct_count = data.nunique(dropna=False)

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    elif leng:
        # Positional access: the index is not guaranteed to contain label 0.
        mode = data.iloc[0]
    else:
        mode = np.nan

    # Proportions are undefined for an empty Series; report NaN rather than
    # dividing by zero.
    denom = leng if leng else np.nan

    results_data = {
        'count': count,
        'distinct_count': distinct_count,
        'p_missing': 1 - count / denom,
        'n_missing': leng - count,
        'p_infinite': n_infinite / denom,
        'n_infinite': n_infinite,
        'is_unique': distinct_count == leng,
        'mode': mode,
        'p_unique': distinct_count / denom,
    }

    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except AttributeError:
        # Older pandas has no Series.memory_usage; fall back to 0.
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    vartype = get_vartype(data)
    if vartype == 'CONST':
        extra = describe_constant_1d(data)
    elif vartype == 'BOOL':
        extra = describe_boolean_1d(data, **kwargs)
    elif vartype == 'NUM':
        extra = describe_numeric_1d(data, **kwargs)
    elif vartype == 'DATE':
        extra = describe_date_1d(data, **kwargs)
    elif vartype == 'UNIQUE':
        extra = describe_unique_1d(data, **kwargs)
    else:
        extra = describe_categorical_1d(data)

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat([result, extra])
评论列表
文章目录