# data_manager.py source file - Python code snippet

def __init__(self, basename, input_dir, verbose=False, replace_missing=True, filter_features=False):
    '''Load the dataset called ``basename`` and store it on the instance.

    Files are read from ``self.input_dir`` and are all named
    ``<basename><suffix>``, with the suffixes ``_public.info``,
    ``_feat.type``, ``_train.data``, ``_train.solution``, ``_valid.data``
    and ``_test.data``.

    Args:
        basename: Dataset name; used as the file-name prefix and, when
            needed, as the sub-directory appended to ``input_dir``.
        input_dir: Directory holding the dataset. If ``basename`` does not
            occur anywhere in this string, ``"/" + basename + "/"`` is
            appended to it.
        verbose: Forwarded to ``loadType``, ``loadData``, ``loadLabel`` and
            ``data_converter.tp_filter``.
        replace_missing: Forwarded to ``loadData``.
        filter_features: When true, keep only the columns chosen by
            ``data_converter.tp_filter`` on the training set (at most 1000)
            and apply the same column selection to validation and test data.

    Attributes set:
        use_pickle, basename, input_dir, info, feat_type, data, feat_idx,
        and tmp_dir (only when ``use_pickle`` is true).
    '''
    self.use_pickle = False # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    # Substring test (not a path-component test): if the dataset name already
    # appears in input_dir, the directory is used as given.
    if basename in input_dir:
        self.input_dir = input_dir
    else:
        self.input_dir = input_dir + "/" + basename + "/"
    if self.use_pickle:
        # Reuse an existing scratch directory when there is one, otherwise
        # create "tmp" in the current working directory.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Every dataset file shares the same directory and name prefix.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    # NOTE(review): getInfo presumably fills self.info from the file - confirm.
    self.getInfo(dataset_file('_public.info'))
    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}
    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose, replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose, replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose, replace_missing=replace_missing)
    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features: # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 1000)
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        Xtr = Xtr[:, idx]
        Xva = Xva[:, idx]
        Xte = Xte[:, idx]
    # Empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()
    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    self.data['X_valid'] = Xva
    self.data['X_test'] = Xte