def process_columns(self, df):
    """Process columns before histogram filling.

    Two conversions are applied, in this order:

    1. every column in ``self.dt_cols`` is converted with ``hf.to_ns``
       (timestamps to nanoseconds);
    2. every column in ``self.num_cols`` and ``self.dt_cols`` is converted
       to a bin index with ``hf.value_to_bin_index``, using the column's
       entry in ``self.bin_specs`` or, if absent, the unit bin specs for
       numbers / timestamps.

    Results are written to a shallow copy of the string columns of ``df``
    rather than to ``df`` itself.

    :param df: input (pandas) data frame
    :returns: output (pandas) data frame holding the string columns plus
        the converted timestamp and numeric columns
    :rtype: pandas DataFrame
    """
    # Start from the string columns only; converted columns are added below.
    idf = df[self.str_cols].copy(deep=False)

    # timestamp variables are converted to ns here
    for col in self.dt_cols:
        self.log().debug('Converting column "%s" of type "%s" to nanosec', col, self.var_dtype[col])
        idf[col] = df[col].apply(hf.to_ns)
    converted = set(self.dt_cols)

    # numerical variables are converted to indices here
    for col in self.num_cols + self.dt_cols:
        self.log().debug('Converting column "%s" of type "%s" to index', col, self.var_dtype[col])

        # Check the dtype's scalar class itself instead of instantiating it:
        # some scalar classes cannot be constructed without arguments.
        is_number = issubclass(df[col].dtype.type, np.number)

        # Columns converted in the first loop must be binned from their
        # nanosecond values in idf, whatever dtype they had in df; all other
        # columns are read straight from df.
        source = idf if col in converted else df

        # find column specific bin_specs. if not found, use dict of default
        # values.
        default_specs = self._unit_bin_specs if is_number else self._unit_timestamp_specs
        bin_specs = self.bin_specs.get(col, default_specs)
        idf[col] = source[col].apply(hf.value_to_bin_index, **bin_specs)
    return idf
评论列表
文章目录