dataset.py source code

python

Project: sanergy-public    Author: dssg
import pandas as pd


def grab_from_features_and_labels(db, fold, config):
    """
    A function that subsets the features df and labels df stored in the Postgres, into train and test features and labels, based on the fold info (train start, train end, test start, test end )

    Args
    DICT FOLD start and end date for both train and test set, in the fomat{"train":(start, end),"test":(start, end)}
    Returns
    df features train
    df labels train
    df features test
    df labels test
    """
    # Map either raw response column onto the common name 'response'.
    RESPONSE_RENAMER = {'response_f': 'response', 'response_u': 'response'}
    # Pull every row between the start of the training window and the end of the test window.
    query = ('select * from modeling.dataset '
             'where (("Collection_Date" >= \'{start}\') and ("Collection_Date" <= \'{end}\'))').format(
                 start=fold['train_start'].strftime('%Y-%m-%d'),
                 end=fold['test_end'].strftime('%Y-%m-%d'))
    dataset = pd.read_sql(query, db['connection'], coerce_float=True)
    toilet_routes = pd.read_sql('select * from modeling.toilet_route', db['connection'], coerce_float=True)

    #TODO: Fix this...
    dataset = dataset.fillna(0) #A hack to make it run for now...
    #Drop the toilets that do not have contiguous data.
    # Note that missing collections are filled with NaN'd rows, so if a toilet is not contiguous, it must mean that it appeared or disappeared during the fold period -> ignore it.
    toilet_groups = dataset.groupby(config['cols']['toiletname'])
    toilets = dataset[config['cols']['toiletname']].unique()
    number_of_days = max(toilet_groups.size())
    contiguous_toilets = [t for t in toilets if (toilet_groups.size()[t] == number_of_days)]
    dataset = dataset.loc[dataset[config['cols']['toiletname']].isin(contiguous_toilets)]
    #Sort for the purposes of later functions...
    dataset = dataset.sort_values(by=['Collection_Date', 'ToiletID'])

    # Date masks for the fold, and the response columns to exclude from the feature matrices.
    response_cols = ['response_f', 'response_u',
                     config['Xy']['response_f']['variable'],
                     config['Xy']['response_u']['variable']]
    in_train = ((dataset['Collection_Date'] >= fold["train_start"]) &
                (dataset['Collection_Date'] <= fold["train_end"]))
    in_test = ((dataset['Collection_Date'] >= fold["test_start"]) &
               (dataset['Collection_Date'] <= fold["test_end"]))

    features_train = dataset.loc[in_train].drop(response_cols, axis=1)
    features_test = dataset.loc[in_test].drop(response_cols, axis=1)

    labels_train_u = dataset.loc[in_train, ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_train_f = dataset.loc[in_train, ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_f = dataset.loc[in_test, ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_u = dataset.loc[in_test, ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    return(features_train, labels_train_f, labels_train_u, features_test, labels_test_f, labels_test_u, toilet_routes)
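
A minimal usage sketch follows, showing how this function might be called. The connection URL, the fold dates, and the response column names in config are illustrative assumptions, not values taken from the project.

# Usage sketch (assumed names: the database URL, fold dates, and config column
# names below are illustrative assumptions, not taken from the project).
import datetime

from sqlalchemy import create_engine

db = {'connection': create_engine('postgresql://localhost/sanergy')}  # assumed database URL
fold = {
    'train_start': datetime.date(2014, 1, 1),
    'train_end': datetime.date(2014, 6, 30),
    'test_start': datetime.date(2014, 7, 1),
    'test_end': datetime.date(2014, 7, 31),
}
config = {
    'cols': {'toiletname': 'ToiletID'},
    'Xy': {'response_f': {'variable': 'FecesContainer_percent'},   # assumed response columns
           'response_u': {'variable': 'UrineContainer_percent'}},
}

(features_train, labels_train_f, labels_train_u,
 features_test, labels_test_f, labels_test_u,
 toilet_routes) = grab_from_features_and_labels(db, fold, config)

print(features_train.shape, features_test.shape)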