# utils_data_cleaning.py -- Python source snippet

def fit(self, X_df, y=None):
        """Inspect the columns of ``X_df`` and fit the per-column text vectorizers.

        For every column in ``X_df`` this method:

        * warns when the column has dtype ``object``, is not described as
          ``'categorical'``, ``'ignore'`` or ``'nlp'`` in
          ``self.column_descriptions``, and a small random sample of its values
          contains something ``float()`` cannot parse;
        * records the column as ``'continuous'`` in
          ``self.transformed_column_descriptions`` when it has no entry yet;
        * if the column is a key of ``self.text_columns``, replaces missing
          values in ``X_df[key]`` with the string ``'nan'``, casts the column to
          ``str``, fits ``self.text_columns[key]`` on it, and stores the
          resulting feature names (prefixed with ``'nlp_<column>_'``) on that
          vectorizer as ``cleaned_feature_names``.

        It also sets ``self.vals_to_drop``.

        Args:
            X_df: pandas DataFrame to learn from. Text columns have their
                missing values filled in place (see above).
            y: Unused; accepted so the signature matches the usual
                ``fit(X, y)`` convention.

        Returns:
            ``self``.
        """
        print('Running basic data cleaning')

        self.vals_to_drop = {'ignore', 'output', 'regressor', 'classifier'}

        # Series.sample raises ValueError when asked for more rows than exist,
        # so every sample size below is capped at the frame length.
        n_rows = len(X_df)

        # Resolved on the first text column only, so frames without text
        # columns never need the module-level pandas_version.
        use_legacy_astype = None

        for key in X_df.columns:

            if X_df[key].dtype == 'object' and self.column_descriptions.get(key, False) not in ['categorical', 'ignore', 'nlp']:

                # First, make sure that the values in this column are not just ints, or float('nan')
                vals = X_df[key].sample(n=min(10, n_rows))
                is_categorical = False
                for val in vals:
                    if val is None:
                        continue
                    try:
                        float(val)
                    except (TypeError, ValueError):
                        # One non-numeric value is enough to decide.
                        is_categorical = True
                        break

                if is_categorical:
                    print('\n')
                    print('Encountered a column that is not marked as categorical, but is an "object" pandas type, which typically indicates a categorical column.')
                    print('The name of this columns is: "{}"'.format(key))
                    print('Some example features in this column are: {}'.format(list(X_df[key].sample(n=min(5, n_rows)))))
                    print('If this is a categorical column, please mark it as `{}: "categorical"` as part of your column_descriptions'.format(key))
                    print('If this is not a categorical column, please consider converting its dtype before passing data into auto_ml')
                    print('\n')
                    warnings.warn('Consider marking the "{}" column as categorical'.format(key))

            if self.transformed_column_descriptions.get(key) is None:
                self.transformed_column_descriptions[key] = 'continuous'

            if key in self.text_columns:
                # Assign the filled column back instead of calling
                # fillna(inplace=True) on X_df[key]: the chained in-place form
                # may act on a temporary copy and leave X_df untouched.
                X_df[key] = X_df[key].fillna('nan')

                if use_legacy_astype is None:
                    # Compare versions numerically; as plain strings
                    # '0.9.0' < '0.20.0' is False.
                    try:
                        major_minor = tuple(int(part) for part in str(pandas_version).split('.')[:2])
                    except ValueError:
                        # Unparseable version string: assume a modern pandas.
                        major_minor = (0, 20)
                    use_legacy_astype = major_minor < (0, 20)

                if use_legacy_astype:
                    text_col = X_df[key].astype(str, raise_on_error=False)
                else:
                    text_col = X_df[key].astype(str, errors='ignore')

                vectorizer = self.text_columns[key]
                vectorizer.fit(text_col)

                # Newer scikit-learn vectorizers expose get_feature_names_out()
                # and have dropped get_feature_names(); support both.
                if hasattr(vectorizer, 'get_feature_names_out'):
                    raw_names = list(vectorizer.get_feature_names_out())
                else:
                    raw_names = vectorizer.get_feature_names()

                # Make weird characters play nice, or just ignore them :)
                col_names = []
                for idx, word in enumerate(raw_names):
                    try:
                        word = str(word)
                    except UnicodeError:
                        word = 'non_ascii_word_' + str(idx)
                    col_names.append('nlp_' + key + '_' + word)

                vectorizer.cleaned_feature_names = col_names

        return self