# histogram_filling.py -- source file, Python code snippet

def categorize_columns(self, df):
        """Categorize the requested columns of a dataframe by data type.

        Every column name found in ``self.columns`` (iterated as a collection
        of column-name groups) is checked for presence in ``df``, its data
        type is looked up with ``self.get_data_type``, and the name is added
        to exactly one of ``self.num_cols``, ``self.dt_cols`` or
        ``self.str_cols`` (if not already listed there).  The scalar type of
        the column is recorded in ``self.var_dtype`` unless an entry for that
        column already exists; bytes and object types are recorded as ``str``.

        :param df: input (pandas) data frame
        :raises KeyError: if a requested column is not in ``df.columns``
        :raises TypeError: if the column's data type is found in none of
            ``STRING_SUBSTR``, ``NUMERIC_SUBSTR`` and ``TIME_SUBSTR``
        """

        # check presence and data type of requested columns
        # sort columns into numerical, timestamp and category based
        for c in self.columns:
            for col in c:
                if col not in df.columns:
                    # "!s" instead of ":s": column labels and keys are not
                    # necessarily strings (e.g. integer labels), and ":s"
                    # would raise ValueError instead of this KeyError
                    raise KeyError('column "{0!s}" not in dataframe "{1!s}"'.format(col, self.read_key))
                dt = self.get_data_type(df, col)

                # validate before touching any bookkeeping, so a rejected
                # column does not leave an entry behind in self.var_dtype
                if not any(dt in types for types in (STRING_SUBSTR, NUMERIC_SUBSTR, TIME_SUBSTR)):
                    raise TypeError('cannot process column "{0!s}" of data type "{1!s}"'.format(col, dt))

                # an existing var_dtype entry takes precedence over the
                # type found in the data
                if col not in self.var_dtype:
                    self.var_dtype[col] = dt.type
                    # np.bytes_ is the portable name of the former np.string_
                    # alias, which no longer exists in NumPy 2.0
                    if (self.var_dtype[col] is np.bytes_) or (self.var_dtype[col] is np.object_):
                        self.var_dtype[col] = str

                # classify by inspecting a default-constructed scalar of the
                # column's type; anything that is neither a number nor a
                # timestamp is treated as a string/category column
                is_number = isinstance(dt.type(), np.number)
                is_timestamp = isinstance(dt.type(), np.datetime64)
                colset = self.num_cols if is_number else self.dt_cols if is_timestamp else self.str_cols
                if col not in colset:
                    colset.append(col)
                self.log().debug('Data type of column "%s" is "%s"', col, self.var_dtype[col])