def get_concat_data(label_csv, label_col, other_csvs, is_rate, important_feats):
    """Assemble the feature frame and label vector for one labelled CSV.

    Loads the selected 'click' columns plus ``label_col`` from ``label_csv``,
    appends column-wise whatever ``get_need_feats`` returns for
    ``other_csvs``, optionally dumps a cleaned and scaled copy to
    ``<label_csv without .csv>_concat.csv``, and finally splits the result
    into features and labels.

    Args:
        label_csv: Path of the CSV that contains ``label_col``.
        label_col: Name of the label column.
        other_csvs: Forwarded unchanged to ``get_need_feats``.
        is_rate: Forwarded unchanged to ``get_need_feats``.
        important_feats: Optional whitelist of feature names. When falsy,
            every 'click' column is kept. Also forwarded to
            ``get_need_feats``.

    Returns:
        A tuple ``(X, y)``: ``X`` holds every column except ``label_col``,
        ``y`` is ``X[label_col].values`` taken before that column is dropped.

    Note:
        Depends on names that are not defined in this function and must be
        provided at module level: ``is_tiny``, ``is_to_csv``,
        ``dataset1_csv``, ``cate_feats``, ``get_csv_header``,
        ``small_dtype``, ``get_need_feats``, ``scale`` and
        ``change_to_category``.
    """
    # important_feats is optional (the whitelist step below accepts a falsy
    # value), so len() must not be called on it unconditionally.
    n_important = len(important_feats) if important_feats else 0
    print('important_feats :  %d' % n_important)

    # Candidate columns: every header name containing 'click'.
    # NOTE(review): the header comes from the module-level dataset1_csv while
    # the columns are then read from label_csv; this presumably relies on
    # both files sharing the same header -- confirm.
    rank_feats = [f for f in get_csv_header(dataset1_csv) if 'click' in f]
    if important_feats:
        # A set gives O(1) membership instead of rescanning the list per column.
        wanted = set(important_feats)
        rank_feats = [f for f in rank_feats if f in wanted]

    X = pd.read_csv(label_csv, usecols=rank_feats + [label_col]).apply(small_dtype)
    if is_tiny:
        # Debug-sized run: keep only the first million rows.
        X = X[:1000000]

    print('concat csvs ......')
    extra = get_need_feats(other_csvs, is_rate, is_tiny, important_feats)
    X = pd.concat([X, extra], axis=1)
    #if label_csv.split('/')[-1] == 'dataset2.csv':
    #    for c in X.columns:
    #        if c.endswith('_fset_total_cnt'):
    #            X = X.drop(X[X[c]==0].index, axis=0)
    feat_cols = [f for f in X.columns if f != label_col]

    if is_to_csv:
        # Strip only a trailing '.csv'; splitting on the first '.csv' would
        # truncate paths that contain '.csv' earlier (e.g. in a directory).
        if label_csv.endswith('.csv'):
            stem = label_csv[:-len('.csv')]
        else:
            stem = label_csv
        save_file = stem + '_concat.csv'
        if os.path.exists(save_file):
            print(save_file + " has exists")
        else:
            print('to csv ........')
            # Sentinel values for missing / infinite entries, then scale the
            # feature columns and store them compactly as float16.
            # NOTE(review): X is modified here, so the returned features are
            # the cleaned/scaled ones only on the run that writes the file.
            X = X.replace(np.nan, -1)
            X = X.replace(np.inf, -2)
            X[feat_cols] = scale(X[feat_cols]).astype('float16')
            X.to_csv(save_file, index=False, chunksize=50000)

    print(X.shape)
    # TODO cate_feats = [f for f in X.columns if 'click' in f]
    # NOTE(review): the original comment on this line was mis-encoded and is
    # unreadable; it appears to mention a test on 3 datasets involving
    # rank_feats -- confirm with the author. cate_feats is not assigned in
    # this function, so it has to exist at module level.
    X, = change_to_category([X], cate_feats)
    y = X[label_col].values
    X = X[feat_cols]
    if label_col == 'label':
        print('positive percent  %s' % y.mean())
    return X, y
model_single.py 文件源码
python
阅读 18
收藏 0
点赞 0
评论 0
评论列表
文章目录