def categorize_columns(self, df):
    """Categorize columns of dataframe by data type

    Every column named in ``self.columns`` is checked for presence in
    ``df``; a value type is recorded for it in ``self.var_dtype`` unless
    one is already stored there; and the column is added (if not already
    present) to one of ``self.num_cols``, ``self.dt_cols`` or
    ``self.str_cols``.

    :param df: input (pandas) data frame
    :raises KeyError: if a requested column is not in ``df.columns``
    :raises TypeError: if the column's data type is found in none of
        STRING_SUBSTR, NUMERIC_SUBSTR and TIME_SUBSTR
    """
    # collections of supported data types; loop-invariant, so built once
    supported_types = (STRING_SUBSTR, NUMERIC_SUBSTR, TIME_SUBSTR)

    # check presence and data type of requested columns
    # sort columns into numerical, timestamp and category based
    for c in self.columns:
        for col in c:
            if col not in df.columns:
                raise KeyError('column "{0:s}" not in dataframe "{1:s}"'.format(col, self.read_key))

            dt = self.get_data_type(df, col)

            # Record an inferred value type only when none is stored yet.
            # NOTE(review): the str normalisation below is applied only to
            # freshly inferred types, never to pre-set entries - confirm.
            if col not in self.var_dtype:
                inferred = dt.type
                # np.bytes_ is the NumPy-2-safe spelling of the removed
                # np.string_ alias (same object on NumPy 1.x / Python 3).
                if inferred is np.bytes_ or inferred is np.object_:
                    inferred = str
                self.var_dtype[col] = inferred

            if not any(dt in types for types in supported_types):
                raise TypeError('cannot process column "{0:s}" of data type "{1:s}"'.format(col, str(dt)))

            # Test the scalar class itself rather than instantiating it:
            # same answer, without relying on a zero-argument constructor.
            if issubclass(dt.type, np.number):
                colset = self.num_cols
            elif issubclass(dt.type, np.datetime64):
                colset = self.dt_cols
            else:
                colset = self.str_cols

            if col not in colset:
                colset.append(col)

            self.log().debug('Data type of column "%s" is "%s"', col, self.var_dtype[col])
评论列表
文章目录