def parse_training_data(dataset, output):
    """Extract the text of every file under *dataset* and write it to *output*.

    The directory tree rooted at ``dataset`` is walked iteratively with an
    explicit stack. Each regular file is read as UTF-8, parsed with
    BeautifulSoup, and the document's text (markup removed) is written as
    UTF-8 to ``output + '/' + <name>``, where ``<name>`` is the entry popped
    from the stack.

    Args:
        dataset: Path of the directory containing the raw input files.
        output: Path of the directory that receives the extracted text;
            it is created through ``utils.mkdir`` before processing starts.
    """
    utils.mkdir(output)
    stack = os.listdir(dataset)
    # Parenthesised single-argument form is valid on both Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('loading data in ' + dataset)
    while stack:
        file_name = stack.pop()
        file_path = dataset + '/' + file_name
        if os.path.isdir(file_path):
            # Directory: hand it to the helper so its contents get queued.
            # NOTE(review): presumably the helper pushes entries relative to
            # `dataset` (e.g. "sub/child") -- confirm against utils.
            utils.push_data_to_stack(stack, file_path, file_name)
            continue

        # Regular file: read it and strip the markup.
        with open(file_path, 'r', encoding='utf-8') as ff:
            content = ff.read()
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; consider pinning one for reproducible output.
        bs = BeautifulSoup(content)

        out_path = output + '/' + file_name
        # If file_name carries a sub-directory component, the matching
        # directory may not exist under `output` yet. For top-level files
        # the parent is `output` itself and this is a no-op.
        out_dir = os.path.dirname(out_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(bs.text)
评论列表
文章目录