def _split_and_dump(self, X, y, valid_X, valid_y):
    """Write the training/validation data to ``data.pkl`` and return its path.

    The file is created inside ``self._dm.dir`` and its path is also stored
    on ``self._datafile``.

    Behaviour depends on ``self.resampling``:

    * ``'cv'``: the four inputs are dumped unchanged.
    * ``'holdout'``: if ``self._has_valid_data`` is falsy, the leading rows
      of ``X`` and ``y`` are carved off as the validation set and replace
      the ``valid_X`` / ``valid_y`` arguments; the remaining rows become the
      training set. The validation share is 0.15 below 15000 rows, 0.2
      below 100000 rows and 0.3 otherwise. If validation data is already
      present, the inputs are dumped unchanged.

    Raises:
        ValueError: if the ``_dm`` attribute has not been set yet.
        NotImplementedError: for any other value of ``self.resampling``.
    """
    if not hasattr(self, '_dm'):
        raise ValueError("It should be called after the dumpmanager _dm is set")

    # Decide up front whether a holdout split has to be made here.
    mode = self.resampling
    if mode == 'holdout':
        needs_split = not self._has_valid_data
    elif mode == 'cv':
        needs_split = False
    else:
        raise NotImplementedError()

    if needs_split:
        n_rows = y.shape[0]
        # Thresholds checked in ascending order; larger data sets get a
        # larger validation share.
        if n_rows < 15000:
            fraction = 0.15
        elif n_rows < 100000:
            fraction = 0.2
        else:
            fraction = 0.3
        cut = int(n_rows * fraction)
        # The first ``cut`` rows become validation data, the rest training.
        valid_X, valid_y = X[:cut], y[:cut]
        X, y = X[cut:], y[cut:]

    payload = {
        "resampling": mode,
        "X": X,
        "y": y,
        "valid_X": valid_X,
        "valid_y": valid_y,
    }
    target = os.path.join(self._dm.dir, "data.pkl")
    # protocol=-1 is forwarded to joblib as in the original call
    # (for pickle, a negative protocol selects the highest available one).
    joblib.dump(payload, target, protocol=-1)
    self._datafile = target
    return target
# Web-page navigation residue ("comment list", "article table of contents");
# not part of the code.