def _validate_input_col_descriptions(self):
found_output_column = False
self.cols_to_ignore = []
expected_vals = set(['categorical', 'text', 'nlp'])
for key, value in self.column_descriptions.items():
value = value.lower()
self.column_descriptions[key] = value
if value == 'output':
self.output_column = key
found_output_column = True
elif value == 'date':
self.date_cols.append(key)
elif value == 'ignore':
self.cols_to_ignore.append(key)
elif value in expected_vals:
pass
else:
raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')
if found_output_column is False:
print('Here is the column_descriptions that was passed in:')
print(self.column_descriptions)
raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')
# We will be adding one new categorical variable for each date col
# Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
for date_col in self.date_cols:
self.column_descriptions[date_col + '_day_part'] = 'categorical'
self.cols_to_ignore = set(self.cols_to_ignore)
# We use _construct_pipeline at both the start and end of our training.
# At the start, it constructs the pipeline from scratch
# At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
# NOTE(review): removed stray non-code text ("评论列表" / "文章目录", i.e.
# "comment list" / "article table of contents" — webpage copy/paste residue)
# that was a SyntaxError in this Python file.