def __init__(self, basename, input_dir, verbose=False, replace_missing=True, filter_features=False):
    '''Load one dataset (info, feature types, train/valid/test splits) from disk.

    Parameters
    ----------
    basename : str
        Dataset name; used as the prefix of every file that is read
        (``<basename>_public.info``, ``<basename>_feat.type``,
        ``<basename>_train.data``, ``<basename>_train.solution``,
        ``<basename>_valid.data``, ``<basename>_test.data``).
    input_dir : str
        Directory to read from. Used as-is when ``basename`` occurs anywhere
        in it (plain substring test); otherwise ``"/" + basename + "/"`` is
        appended.
    verbose : bool
        Forwarded to ``loadType``, ``loadData``, ``loadLabel`` and to
        ``data_converter.tp_filter``.
    replace_missing : bool
        Forwarded to every ``loadData`` call.
    filter_features : bool
        When true, keep only the columns returned by
        ``data_converter.tp_filter`` (at most 1000 requested) in all three
        data matrices.

    Attributes set
    --------------
    use_pickle, basename, input_dir, info, feat_type, data, feat_idx, and
    ``tmp_dir`` (only when ``use_pickle`` is true).
    '''
    # Turn this to True to save data as pickle (inefficient).
    self.use_pickle = False
    self.basename = basename

    # NOTE: this is a substring check on the raw string, not a comparison of
    # path components.
    self.input_dir = input_dir if basename in input_dir else input_dir + "/" + basename + "/"

    if self.use_pickle:
        # Reuse an existing scratch directory if there is one, looking in the
        # current directory first and then in the parent; otherwise create
        # "tmp" in the current directory.
        for candidate in ("tmp", "../tmp"):
            if os.path.exists(candidate):
                self.tmp_dir = candidate
                break
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_path(suffix):
        # Every dataset file is named <basename><suffix> inside input_dir.
        return os.path.join(self.input_dir, basename + suffix)

    info_path = dataset_path('_public.info')
    self.info = {}
    # Presumably fills self.info from the file -- getInfo is defined elsewhere.
    self.getInfo(info_path)
    self.feat_type = self.loadType(dataset_path('_feat.type'), verbose=verbose)

    self.data = {}
    train_x = self.loadData(dataset_path('_train.data'), verbose=verbose, replace_missing=replace_missing)
    train_y = self.loadLabel(dataset_path('_train.solution'), verbose=verbose)
    valid_x = self.loadData(dataset_path('_valid.data'), verbose=verbose, replace_missing=replace_missing)
    test_x = self.loadData(dataset_path('_test.data'), verbose=verbose, replace_missing=replace_missing)

    # Normally, feature selection should be done as part of a pipeline.
    # Here it is done once as preprocessing, for efficiency.
    selected = []
    if filter_features:
        # Ad hoc feature selection, for the example: request at most 1000
        # features, chosen from the training split only, then apply the same
        # column selection to every split.
        n_keep = min(train_x.shape[1], 1000)
        selected = data_converter.tp_filter(train_x, train_y, feat_num=n_keep, verbose=verbose)
        train_x, valid_x, test_x = [matrix[:, selected] for matrix in (train_x, valid_x, test_x)]

    # Flattened array of the selected column indices (empty when no
    # filtering was requested).
    self.feat_idx = np.array(selected).ravel()

    self.data.update([
        ('X_train', train_x),
        ('Y_train', train_y),
        ('X_valid', valid_x),
        ('X_test', test_x),
    ])
评论列表
文章目录