def _villani_sessions(df, inputtype, session_mean, session_std):
    """
    Run the rows of one input type through the session pipeline.

    Rows whose 'inputtype' column equals `inputtype` are grouped on index
    levels 0 and 1 and each group is handed to make_sessions together with
    `session_mean` and `session_std`. Index levels 0 and 1 of the result are
    then dropped (presumably the group keys prepended by groupby.apply --
    confirm against make_sessions' return index), and the frame is passed to
    reduce_dataset with min_samples=10 and max_samples=10.

    Returns whatever reduce_dataset returns.
    """
    subset = df[df['inputtype'] == inputtype]
    subset = subset.groupby(level=[0, 1]).apply(
        lambda group: make_sessions(group, session_mean, session_std))
    subset = subset.reset_index(level=[0, 1], drop=True)
    return reduce_dataset(subset, min_samples=10, max_samples=10)


def preprocess_villani(in_file, out_file, long_fixed_out_file):
    """
    Preprocess the raw Villani dataset and extend the long fixed dataset.

    in_file: CSV path of the raw data; its first two columns are read as the
        index.
    out_file: CSV path that receives the processed free-text rows.
    long_fixed_out_file: CSV path that receives the existing 'long_fixed'
        data concatenated with the processed Villani fixed-text rows.

    Both outputs are restricted to the columns listed in COLS before being
    written. Returns None.
    """
    df = pd.read_csv(in_file, index_col=[0, 1])

    # Make age a binary target, <30 and >=30. Any 'agegroup' value missing
    # from this mapping becomes NaN and is removed by the dropna() below.
    df['age'] = df['agegroup'].map({
        'under20': '<30',
        '20-29': '<30',
        '30-39': '>=30',
        '40-49': '>=30',
        '50-59': '>=30',
        'over60': '>=30',
    })

    # Ignore missing data
    df = df.dropna()
    df = remove_repeated_keys(df)

    # Size statistics of the existing long fixed groups (index levels 0, 1).
    # They are invariant across groups, so compute them once rather than on
    # every make_sessions call.
    long_fixed = load_data('long_fixed')
    slf = long_fixed.groupby(level=[0, 1]).size()
    session_mean = slf.mean()
    session_std = slf.std()

    # Combine the villani fixed text with citefa dataset fixed text
    villani_fixed = _villani_sessions(df, 'fixed', session_mean, session_std)
    long_fixed = pd.concat([long_fixed, villani_fixed])
    long_fixed = long_fixed[COLS]
    long_fixed.to_csv(long_fixed_out_file)

    # Free-text input only
    villani_free = _villani_sessions(df, 'free', session_mean, session_std)
    villani_free = villani_free[COLS]
    villani_free.to_csv(out_file)
# (Web-page residue, not part of the module: "comment list" / "table of contents".)