utils.py 文件源码

python
阅读 72 收藏 0 点赞 0 评论 0

项目:Prudential-Life-Insurance-Assessment 作者: AntonUBC 项目源码 文件源码
def Load_data(train_path=None, test_path=None):
    """Load the train/test CSV files and build feature matrices and labels.

    The two files are stacked so that feature engineering and categorical
    encoding are applied consistently to both, then split apart again using
    the ``Response`` column (missing in the test file, hence ``-1`` after the
    missing-value fill).

    Args:
        train_path: Path of the training CSV. Defaults to the module-level
            ``path_train``.
        test_path: Path of the test CSV. Defaults to the module-level
            ``path_test``.

    Returns:
        A tuple ``(train, test, target, labels)`` where

        * ``train`` / ``test`` are 2-D NumPy arrays of features,
        * ``target`` is the 1-D array of ``Response`` values of the train rows,
        * ``labels`` is a float array of shape
          ``(n_train, num_class - 1)`` whose column ``i`` is ``1`` where
          ``target < i + 2`` and ``0`` elsewhere.
    """
    if train_path is None:
        train_path = path_train
    if test_path is None:
        test_path = path_test

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Combine train and test.  ``DataFrame.append`` was removed in pandas 2.0,
    # so use ``pd.concat``; a fresh index avoids duplicated row labels.
    data_comb = pd.concat([train, test], ignore_index=True)

    # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
    # Split the two-character product code into its first and second character.
    data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0]
    data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1]

    # Factorize categorical variables into integer codes.
    for column in ('Product_Info_2', 'Product_Info_2_char', 'Product_Info_2_num'):
        data_comb[column] = pd.factorize(data_comb[column])[0]

    data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age']

    med_keyword_columns = data_comb.columns[
        data_comb.columns.str.startswith('Medical_Keyword_')
    ]
    data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1)

    print('Encode missing values')
    data_comb = data_comb.fillna(-1)

    # Fix the dtype on the label column (it became float when the test rows,
    # which have no Response, were stacked under the train rows).
    data_comb['Response'] = data_comb['Response'].astype(int)

    # Split train and test: rows without a Response were filled with -1 above.
    train = data_comb[data_comb['Response'] > 0].copy()
    test = data_comb[data_comb['Response'] < 1].copy()

    target = train['Response'].values

    dropped_columns = ['Id', 'Response', 'Medical_History_10', 'Medical_History_24']
    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` is equivalent.
    train = train.drop(dropped_columns, axis=1).values
    test = test.drop(dropped_columns, axis=1).values

    print('Construct labels for bumping')
    num_class = len(np.unique(target))
    # Column i flags rows with target < i + 2.  Because every train target is
    # >= 1, the first column is the same as ``target == 1``.
    # NOTE(review): assumes the classes are the consecutive integers
    # 1..num_class -- confirm against the data.
    thresholds = np.arange(2, num_class + 1)
    labels = (target[:, None] < thresholds).astype(float)
    return train, test, target, labels
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号