def Load_data():
    """Load the train/test CSVs and build feature matrices plus bumping labels.

    Reads the module-level ``path_train`` and ``path_test`` CSV files, stacks
    them so that the engineered features and the categorical encodings are
    computed over both sets together, fills missing values with ``-1`` and
    splits the rows back apart using the ``Response`` column.

    Returns:
        A 4-tuple ``(train, test, target, labels)``:

        * ``train`` -- 2-D array of features for rows with ``Response > 0``.
        * ``test`` -- 2-D array of features for rows with ``Response < 1``
          (rows whose ``Response`` was missing end up here because the
          missing value is filled with ``-1``).
        * ``target`` -- 1-D integer array with the ``Response`` of the
          training rows.
        * ``labels`` -- float array of shape ``(len(target), num_class - 1)``
          where column ``i`` is ``1`` when ``target < i + 2`` and ``0``
          otherwise; ``num_class`` is the number of distinct values in
          ``target``.
    """
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # combine train and test
    # (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # ignore_index avoids duplicated row labels in the stacked frame.)
    data_comb = pd.concat([train, test], ignore_index=True, sort=False)

    # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
    # create any new variables
    # (must happen before Product_Info_2 itself is replaced by integer codes)
    data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0]
    data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1]

    # factorize categorical variables
    for column in ('Product_Info_2', 'Product_Info_2_char', 'Product_Info_2_num'):
        data_comb[column] = pd.factorize(data_comb[column])[0]

    data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age']

    med_keyword_columns = data_comb.columns[
        data_comb.columns.str.startswith('Medical_Keyword_')
    ]
    data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1)

    print('Encode missing values')
    data_comb = data_comb.fillna(-1)

    # fix the dtype on the label column
    data_comb['Response'] = data_comb['Response'].astype(int)

    # split train and test: rows without a Response were filled with -1 above
    train = data_comb[data_comb['Response'] > 0]
    test = data_comb[data_comb['Response'] < 1]

    target = train['Response'].to_numpy()

    dropped_columns = ['Id', 'Response', 'Medical_History_10', 'Medical_History_24']
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    train = train.drop(columns=dropped_columns).to_numpy()
    test = test.drop(columns=dropped_columns).to_numpy()

    print('Construct labels for bumping')
    num_class = len(np.unique(target))
    # Column i flags the rows with target < i + 2. The thresholds follow from
    # num_class instead of being fixed at 8 classes, so data holding fewer
    # distinct classes no longer indexes past the end of the label matrix.
    thresholds = np.arange(2, num_class + 1)
    labels = (target[:, None] < thresholds).astype(float)

    return train, test, target, labels
utils.py 文件源码
python
阅读 72
收藏 0
点赞 0
评论 0
评论列表
文章目录