def test_autoclean_real_data():
    """Test autoclean() with the adult data set.

    Missing values are injected into one numeric column ('age') and one
    string column ('education') of the adult data set. A reference frame is
    then cleaned by hand -- median for 'age', most frequent value for
    'education', and label encoding of the listed string columns -- and
    autoclean() is expected to produce an identical DataFrame.
    """
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')

    # Inject NaNs so there is something to impute. `.loc` slices by label and
    # includes both endpoints.
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    # Build the expected result on an independent copy so the frame handed to
    # autoclean() below is not affected by the manual cleaning.
    hand_cleaned_adult_data = adult_data.copy()

    # Assign the filled columns back explicitly. Calling
    # `df[col].fillna(..., inplace=True)` is chained assignment: with pandas
    # Copy-on-Write it only modifies a temporary Series and leaves the frame
    # untouched, which would silently corrupt the reference data.
    age_median = hand_cleaned_adult_data['age'].median()
    hand_cleaned_adult_data['age'] = (
        hand_cleaned_adult_data['age'].fillna(age_median)
    )

    # mode() returns a Series of the most frequent values; take the first.
    education_mode = hand_cleaned_adult_data['education'].mode()[0]
    hand_cleaned_adult_data['education'] = (
        hand_cleaned_adult_data['education'].fillna(education_mode)
    )

    # Replace each listed string column with integer codes; a fresh encoder
    # is fitted per column.
    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(
            hand_cleaned_adult_data[column].values
        )

    cleaned_adult_data = autoclean(adult_data)

    # DataFrame.equals compares values and dtypes, treating NaNs in the same
    # location as equal.
    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
评论列表
文章目录