dataset.py source code

python

Project: sanergy-public    Author: dssg
import pandas as pd


def grab_from_features_and_labels(db, fold, config):
    """
    A function that subsets the features df and labels df stored in the Postgres, into train and test features and labels, based on the fold info (train start, train end, test start, test end )

    Args
    DICT FOLD start and end date for both train and test set, in the fomat{"train":(start, end),"test":(start, end)}
    Returns
    df features train
    df labels train
    df features test
    df labels test
    """
    # Map either raw response column onto the common name 'response'.
    RESPONSE_RENAMER = {'response_f': 'response', 'response_u': 'response'}
    # Pull every row between the start of the training window and the end of the test window.
    query = ('select * from modeling.dataset '
             'where (("Collection_Date" >= \'{start}\') and ("Collection_Date" <= \'{end}\'))').format(
                 start=fold['train_start'].strftime('%Y-%m-%d'),
                 end=fold['test_end'].strftime('%Y-%m-%d'))
    dataset = pd.read_sql(query, db['connection'], coerce_float=True)
    toilet_routes = pd.read_sql('select * from modeling.toilet_route', db['connection'], coerce_float=True)

    #TODO: Fix this...
    dataset = dataset.fillna(0) #A hack to make it run for now...
    #Drop the toilets that do not have contiguous data.
    # Note that missing collections are filled with NaN'd rows, so if a toilet is not contiguous, it must mean that it appeared or disappeared during the fold period -> ignore it.
    toilet_groups = dataset.groupby(config['cols']['toiletname'])
    toilets = dataset[config['cols']['toiletname']].unique()
    number_of_days = max(toilet_groups.size())
    contiguous_toilets = [t for t in toilets if (toilet_groups.size()[t] == number_of_days)]
    dataset = dataset.loc[dataset[config['cols']['toiletname']].isin(contiguous_toilets)]
    #Sort for the purposes of later functions...
    dataset = dataset.sort_values(by=['Collection_Date', 'ToiletID'])

    # Date masks for the fold, and the response columns to exclude from the feature matrices.
    response_cols = ['response_f', 'response_u',
                     config['Xy']['response_f']['variable'],
                     config['Xy']['response_u']['variable']]
    in_train = ((dataset['Collection_Date'] >= fold["train_start"]) &
                (dataset['Collection_Date'] <= fold["train_end"]))
    in_test = ((dataset['Collection_Date'] >= fold["test_start"]) &
               (dataset['Collection_Date'] <= fold["test_end"]))

    features_train = dataset.loc[in_train].drop(response_cols, axis=1)
    features_test = dataset.loc[in_test].drop(response_cols, axis=1)

    labels_train_u = dataset.loc[in_train, ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_train_f = dataset.loc[in_train, ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_f = dataset.loc[in_test, ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_u = dataset.loc[in_test, ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    return(features_train, labels_train_f, labels_train_u, features_test, labels_test_f, labels_test_u, toilet_routes)
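
A minimal usage sketch follows, showing how this function might be called. The connection URL, the fold dates, and the response column names in config are illustrative assumptions, not values taken from the project.

# Usage sketch (assumed names: the database URL, fold dates, and config column
# names below are illustrative assumptions, not taken from the project).
import datetime

from sqlalchemy import create_engine

db = {'connection': create_engine('postgresql://localhost/sanergy')}  # assumed database URL
fold = {
    'train_start': datetime.date(2014, 1, 1),
    'train_end': datetime.date(2014, 6, 30),
    'test_start': datetime.date(2014, 7, 1),
    'test_end': datetime.date(2014, 7, 31),
}
config = {
    'cols': {'toiletname': 'ToiletID'},
    'Xy': {'response_f': {'variable': 'FecesContainer_percent'},   # assumed response columns
           'response_u': {'variable': 'UrineContainer_percent'}},
}

(features_train, labels_train_f, labels_train_u,
 features_test, labels_test_f, labels_test_u,
 toilet_routes) = grab_from_features_and_labels(db, fold, config)

print(features_train.shape, features_test.shape)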