def load_data(filenames):
    """Load a labelled text dataset from one or more UTF-8 files.

    Every line must have the form ``LABEL,text`` (for example
    ``HUM,Who wrote it?``): a run of word characters, a comma, then the
    rest of the line, which becomes the sample text.

    :param filenames: file name(s), passed straight to ``fileinput.input``.
    :return: Bunch with three numpy arrays: ``data`` (the sample texts),
        ``target`` (the label string of each sample, parallel to ``data``)
        and ``target_names`` (the distinct labels, in order of first
        appearance). See:
        http://scikit-learn.org/stable/datasets/index.html#datasets
    :raises ValueError: if a line does not match the expected format; the
        message names the offending file and line number.
    """
    # Sample texts, one entry per input line.
    data = []
    # Label of each sample, parallel to ``data``.
    target = []
    # Maps each distinct label to the 0-based index of its first appearance.
    target_names = {}
    # Line format: "<label>,<text>", e.g. "HUM,Who wrote it?".
    data_re = re.compile(r'(\w+),(.+)')

    # Decode at the I/O boundary: the hook opens every named file as UTF-8
    # regardless of the platform's default encoding.
    lines = fileinput.input(
        filenames, openhook=fileinput.hook_encoded('utf-8'))
    try:
        for line in lines:
            # The open hook is bypassed for stdin, which may still hand
            # back raw bytes on some interpreters; decode only then.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            match = data_re.match(line)
            if not match:
                raise ValueError("Invalid format in dataset {} at line {}"
                                 .format(fileinput.filename(),
                                         fileinput.filelineno()))
            label, text = match.group(1), match.group(2)
            if label not in target_names:
                target_names[label] = len(target_names)
            # Targets are stored as label strings (`HUM`, `LOC`, etc.).
            # NOTE: to store integer class indices instead, append
            # target_names[label] here.
            target.append(label)
            data.append(text)
    finally:
        # Always release the module-level fileinput state; otherwise an
        # error above would make the next fileinput.input() call fail with
        # "input() already active" and leave the file handle open.
        fileinput.close()

    # Order labels by their assigned index so position i holds the label
    # whose index is i, independent of dict iteration order.
    ordered_names = sorted(target_names, key=target_names.get)
    return Bunch(
        data=numpy.array(data),
        target=numpy.array(target),
        target_names=numpy.array(ordered_names),
    )
评论列表
文章目录