base.py 文件源码-python代码片段

def describe_1d(data, **kwargs):
    leng = len(data)  # number of observations in the Series
    count = data.count()  # number of non-NaN observations in the Series

    # Replace infinite values with NaNs to avoid issues with
    # histograms later.
    data.replace(to_replace=[np.inf, np.NINF, np.PINF], value=np.nan, inplace=True)

    n_infinite = count - data.count()  # number of infinte observations in the Series

    distinct_count = data.nunique(dropna=False)  # number of unique elements in the Series
    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        mode = data[0]

    results_data = {'count': count,
                    'distinct_count': distinct_count,
                    'p_missing': 1 - count / leng,
                    'n_missing': leng - count,
                    'p_infinite': n_infinite / leng,
                    'n_infinite': n_infinite,
                    'is_unique': distinct_count == leng,
                    'mode': mode,
                    'p_unique': distinct_count / leng}
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except:
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    vartype = get_vartype(data)
    if vartype == 'CONST':
        result = result.append(describe_constant_1d(data))
    elif vartype == 'BOOL':
        result = result.append(describe_boolean_1d(data, **kwargs))
    elif vartype == 'NUM':
        result = result.append(describe_numeric_1d(data, **kwargs))
    elif vartype == 'DATE':
        result = result.append(describe_date_1d(data, **kwargs))
    elif vartype == 'UNIQUE':
        result = result.append(describe_unique_1d(data, **kwargs))
    else:
        result = result.append(describe_categorical_1d(data))
    return result