def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs.

    Builds a 1000-row frame with two float columns ('A', 'B') and one
    string-valued column ('C'), blanks out a slice of rows in 'A' and in 'C',
    and checks that autoclean() returns a frame equal to one cleaned by hand:
    median fill for the numeric column, mode fill followed by label encoding
    for the string column.
    """
    # Local seeded generator: keeps the fixture reproducible on failure
    # without touching the global NumPy random state used by other tests.
    rng = np.random.RandomState(42)
    data = pd.DataFrame({
        'A': rng.rand(1000),
        'B': rng.rand(1000),
        'C': rng.randint(0, 3, 1000),
    })
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].map(string_map)

    # .loc slicing is label-based and includes both endpoints.
    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    # Snapshot the dirty data before autoclean() sees it, so the reference
    # is built independently of anything autoclean() may do to its input.
    hand_cleaned_data = data.copy()

    # Assign the filled column back instead of calling
    # `frame[col].fillna(..., inplace=True)`: the in-place form operates on a
    # selected column and does not update the frame under pandas
    # Copy-on-Write, which would leave the reference with its NaNs.
    hand_cleaned_data['A'] = hand_cleaned_data['A'].fillna(
        hand_cleaned_data['A'].median())
    hand_cleaned_data['C'] = hand_cleaned_data['C'].fillna(
        hand_cleaned_data['C'].mode()[0])
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(
        hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
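# NOTE(review): the two labels that followed the test here ("Comment list" /
# "Table of contents") appear to be web-page navigation residue, not code.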