# make_dataset.py -- Python source code snippet

def create_categorical_features(df, label_list, random_state=None):
    """
    Replace randomly chosen numeric columns with binned categorical columns.

    For each entry of ``label_list`` one numeric column (never the column
    named ``'y'``) is drawn at random and overwritten with the result of
    ``pd.cut``, using as many bins as that entry has labels. A column is
    converted at most once; if the numeric columns run out before
    ``label_list`` does, the remaining label lists are ignored.

    :param df: data frame we're operating on; it is copied, not modified
    :param label_list: A list of lists, each list is the labels for one categorical variable
    :param random_state: the numpy RandomState; passed through ``get_random_state`` before use
    :return: A modified copy of the dataframe

    Example:

    create_categorical_features(df, [['a','b'], ['red','blue']])

    """
    random_state = get_random_state(random_state)

    df = df.copy()

    # get numeric columns ONCE so we don't have to do it every time we loop:
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']

    for labels in label_list:
        # we might be out of numerical columns!
        if not numer_cols:
            break

        # Draw a position rather than a label: handing the labels themselves
        # to choice() makes NumPy build an array from them, which turns mixed
        # int/str labels into strings (so they no longer match the originals)
        # and rejects tuple labels outright.
        position = int(random_state.choice(len(numer_cols)))
        # pop the chosen column so it cannot be picked a second time
        chosen_col = numer_cols.pop(position)

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(labels), labels=labels)

    return df