def fit(self, X_df, y=None):
    """Inspect the training DataFrame and fit any per-column text vectorizers.

    For every column of ``X_df`` this method:

    * samples up to 10 values from ``object``-dtype columns that are not
      described as ``'categorical'``, ``'ignore'`` or ``'nlp'`` in
      ``self.column_descriptions``; if any sampled value cannot be passed
      to ``float()``, it prints guidance and emits a warning suggesting the
      column be marked as categorical;
    * records the column as ``'continuous'`` in
      ``self.transformed_column_descriptions`` when it has no entry yet;
    * if the column is a key of ``self.text_columns``, fits that entry on
      the stringified column and stores the ``'nlp_<column>_<word>'``
      feature names on it as ``cleaned_feature_names``.

    Args:
        X_df: pandas DataFrame holding the training data.
        y: Unused.

    Returns:
        ``self``.
    """
    print('Running basic data cleaning')

    self.vals_to_drop = {'ignore', 'output', 'regressor', 'classifier'}

    # Series.sample raises ValueError when asked for more rows than exist, so
    # every sample size below is capped at the number of rows available.
    n_rows = len(X_df)

    # See if we should fit TfidfVectorizer or not
    for key in X_df.columns:

        if X_df[key].dtype == 'object' and self.column_descriptions.get(key, False) not in ['categorical', 'ignore', 'nlp']:
            # First, make sure that the values in this column are not just ints, or float('nan')
            vals = X_df[key].sample(n=min(10, n_rows))
            is_categorical = False
            for val in vals:
                try:
                    if val is not None:
                        float(val)
                except Exception as e:
                    # Best-effort probe: any value that float() rejects marks
                    # the column as probably categorical.
                    print(e)
                    is_categorical = True

            if is_categorical:
                print('\n')
                print('Encountered a column that is not marked as categorical, but is an "object" pandas type, which typically indicates a categorical column.')
                print('The name of this columns is: "{}"'.format(key))
                print('Some example features in this column are: {}'.format(list(X_df[key].sample(n=min(5, n_rows)))))
                print('If this is a categorical column, please mark it as `{}: "categorical"` as part of your column_descriptions'.format(key))
                print('If this is not a categorical column, please consider converting its dtype before passing data into auto_ml')
                print('\n')
                warnings.warn('Consider marking the "{}" column as categorical'.format(key))

        if self.transformed_column_descriptions.get(key) is None:
            self.transformed_column_descriptions[key] = 'continuous'

        if key in self.text_columns:
            # NOTE(review): chained in-place fillna; whether this reaches the
            # caller's DataFrame depends on the pandas version - confirm.
            X_df[key].fillna('nan', inplace=True)
            # NOTE(review): plain comparison against a string literal; if
            # pandas_version is a version string the ordering is
            # lexicographic - confirm that is adequate for supported versions.
            if pandas_version < '0.20.0':
                text_col = X_df[key].astype(str, raise_on_error=False)
            else:
                text_col = X_df[key].astype(str, errors='ignore')

            vectorizer = self.text_columns[key]
            vectorizer.fit(text_col)
            if hasattr(vectorizer, 'get_feature_names'):
                col_names = vectorizer.get_feature_names()
            else:
                # Newer scikit-learn vectorizers only expose
                # get_feature_names_out(), which returns an array.
                col_names = list(vectorizer.get_feature_names_out())

            # Make weird characters play nice, or just ignore them :)
            cleaned_words = []
            for idx, word in enumerate(col_names):
                try:
                    cleaned_words.append(str(word))
                except Exception:
                    # str() can fail on non-ASCII text under Python 2; fall
                    # back to a positional placeholder name.
                    cleaned_words.append('non_ascii_word_' + str(idx))

            vectorizer.cleaned_feature_names = ['nlp_' + key + '_' + word for word in cleaned_words]

    return self
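# Comment list (web-page navigation residue; not part of the code)
# Table of contents (web-page navigation residue; not part of the code)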