def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a LightGBM classifier and report the importances.

    Fits ``LGBMClassifier(n_estimators=400)`` on ``matrix_x_temp`` /
    ``label_y``, reduces the matrix with ``SelectFromModel`` using ``th`` as
    the threshold, prints the importance ranking and the chosen features, and
    writes both reports to ``../eda/lgb_feature_importance.txt`` and
    ``../eda/lgb_feature_chose.txt``.

    Args:
        fe_name: Feature names, paired positionally with
            ``clf.feature_importances_`` (presumably one name per column of
            ``matrix_x_temp`` -- confirm against the caller). Names must be
            hashable; duplicate names collapse into a single entry.
        matrix_x_temp: Training matrix passed to ``fit`` and ``transform``.
        label_y: Training labels passed to ``fit``.
        th: Threshold forwarded unchanged to ``SelectFromModel``.

    Returns:
        A tuple ``(matrix_x, feature_not_used_name, n_used)``: the reduced
        matrix, the names from ``fe_name`` that were not kept (in ``fe_name``
        order), and the number of kept feature names.
    """
    # Fit the model, then let SelectFromModel drop the low-importance columns.
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Map every feature name to its importance score.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(
        1 for score in feature_score_dict.values() if score == 0.0)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('number of not-zero features:'
          + str(len(feature_score_dict) - zero_count))

    # Rank the features from most to least important.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for name, score in feature_score_dict_sorted:
        print('%s %s' % (name, score))
    print('\n')

    # ``with`` guarantees the report is closed even if a write fails.
    with open('../eda/lgb_feature_importance.txt', 'w') as f:
        # ``th`` may be numeric; file.write() only accepts strings.
        f.write(str(th))
        f.write('\nRank\tFeature Name\tFeature Importance\n')
        # Ranks are written 0-based, as in the existing report format.
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # The reduced matrix has one column per kept feature, so the kept names
    # are taken as the top ``how_long`` entries of the ranking.
    how_long = matrix_x.shape[1]
    feature_used_name = [
        name for name, _ in feature_score_dict_sorted[:how_long]]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/lgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Collect the discarded names; a set keeps each membership test O(1).
    used_lookup = set(feature_used_name)
    feature_not_used_name = [
        name for name in fe_name if name not in used_lookup]
    return matrix_x, feature_not_used_name, len(feature_used_name)
# (Web-page residue, not part of the code: "Comment list" / "Table of contents")