preprocess.py 文件源码-python代码片段

def preprocess_villani(in_file, out_file, long_fixed_out_file):
    """
    Preprocess the raw Villani dataset and extend the long fixed dataset
    """
    df = pd.read_csv(in_file, index_col=[0, 1])

    # Make age a binary target, <30 and >=30
    df['age'] = df['agegroup'].map({
        'under20': '<30',
        '20-29': '<30',
        '30-39': '>=30',
        '40-49': '>=30',
        '50-59': '>=30',
        'over60': '>=30'}
    )

    # Ignore missing data
    df = df.dropna()
    df = remove_repeated_keys(df)

    # combine the villani fixed text with citefa dataset fixed text
    long_fixed = load_data('long_fixed')
    slf = long_fixed.groupby(level=[0, 1]).size()

    villani_fixed = df[df['inputtype'] == 'fixed']
    villani_fixed = villani_fixed.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_fixed = villani_fixed.reset_index(level=[0, 1], drop=True)
    villani_fixed = reduce_dataset(villani_fixed, min_samples=10, max_samples=10)

    long_fixed = pd.concat([long_fixed, villani_fixed])
    long_fixed = long_fixed[COLS]
    long_fixed.to_csv(long_fixed_out_file)

    # Free-text input only
    villani_free = df[df['inputtype'] == 'free']
    villani_free = villani_free.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_free = villani_free.reset_index(level=[0, 1], drop=True)

    villani_free = reduce_dataset(villani_free, min_samples=10, max_samples=10)
    villani_free = villani_free[COLS]
    villani_free.to_csv(out_file)
    return