def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs

    Columns 'A' and 'B' hold random floats and column 'C' holds one of three
    fruit-name strings. The first 500 rows form the training set and the
    remaining 500 rows the testing set. The expected result is built by
    fitting a LabelEncoder on the training 'C' column and applying that same
    encoder to the testing 'C' column; 'A' and 'B' are expected unchanged.
    """
    # Local, seeded generator: the data is reproducible from run to run and
    # the global NumPy random state used by other tests is left untouched.
    rng = np.random.RandomState(42)
    data = pd.DataFrame({'A': rng.rand(1000),
                         'B': rng.rand(1000),
                         'C': rng.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].map(string_map)

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Snapshot the raw inputs BEFORE calling autoclean_cv(). If autoclean_cv()
    # modifies its arguments in place, copies taken afterwards would already be
    # cleaned and the comparison below would be vacuous.
    # NOTE(review): in-place behavior is assumed from autoclean_cv's `copy`
    # option -- confirm against its implementation.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # Fit on the training split only, then reuse the fitted encoder for the
    # testing split so both splits share one string-to-integer mapping.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
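# Comment list          (web-page navigation residue; not part of this module)
# Article contents      (web-page navigation residue; not part of this module)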