def bin_count(hdf_data_dir, file_prefix, num_of_parts):
    """
    Count positive/negative samples across the HDF5 label part files.

    Part ``i`` (for ``i`` in ``range(num_of_parts)``) is read from
    ``<hdf_data_dir>/<file_prefix>_output_part_<i>.h5``. A row counts as
    positive when the value in its first column equals 1; every other row
    counts as negative.

    :param hdf_data_dir: directory that contains the part files
    :param file_prefix: file name prefix; see this param in feature_to_hdf()
    :param num_of_parts: number of part files to read
    :return: size of a dataset, positive samples, negative samples,
        positive ratio (0.0 when the dataset contains no rows)
    """
    size = 0
    num_of_pos = 0
    for part in range(num_of_parts):
        part_path = os.path.join(
            hdf_data_dir,
            file_prefix + '_output_part_' + str(part) + '.h5',
        )
        labels = pd.read_hdf(part_path, mode='r')
        # Summing the boolean mask counts the matching rows without building
        # a filtered copy of the frame; int() keeps the totals plain ints.
        num_of_pos += int((labels.iloc[:, 0] == 1).sum())
        size += labels.shape[0]

    # Every row is either positive or negative, so negatives follow from the
    # totals.
    num_of_neg = size - num_of_pos

    # Guard the empty case (no parts, or only empty parts) so the ratio does
    # not raise ZeroDivisionError.
    pos_ratio = 1.0 * num_of_pos / size if size else 0.0
    return size, num_of_pos, num_of_neg, pos_ratio
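# Comment list   (page-navigation residue from the scraped source; not code)
# Article table of contents   (page-navigation residue; not code)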