def parse_testing_data(dataset, output):
    """Write a cleaned copy of every file found under ``dataset`` into ``output``.

    Each file is read as UTF-8, every ``/<token>`` suffix matched by the
    regex below is removed (presumably part-of-speech tags, going by the
    pattern's name -- confirm against the corpus format), a few leftover
    tag fragments around typographic quotes and ellipses are dropped, and
    the Unicode ellipsis character is normalised to three ASCII dots.

    Args:
        dataset: Path of the directory to read from.
        output: Path of the directory to write to; it is passed to
            ``utils.mkdir`` before any file is written.
    """
    # Matches a slash followed by one or more word characters or the listed
    # punctuation characters.
    rm_postag = re.compile(r'/[\w\-\.\,\?\"\':\!\@\#\$%%\^\&\*\(\)\[\]_\+\=\\\`\~]+')
    utils.mkdir(output)

    # Explicit work stack instead of recursion.
    stack = os.listdir(dataset)
    # Function-call form prints the same text on Python 2 and Python 3; the
    # former print statement was a SyntaxError on Python 3.
    print('loading data in ' + dataset)

    while stack:
        file_name = stack.pop()
        file_path = dataset + '/' + file_name

        if os.path.isdir(file_path):
            # Directory: hand it to the helper, which is expected to push its
            # entries onto the stack (helper not visible here -- confirm).
            utils.push_data_to_stack(stack, file_path, file_name)
            continue

        # Regular file: read it, strip the tags, write the cleaned copy.
        with open(file_path, 'r', encoding='utf-8') as ff:
            content = ff.read()

        content = rm_postag.sub(u'', content)
        # The regex's character class does not include typographic quotes or
        # the ellipsis character, so tags made of those are removed explicitly.
        content = content.replace(u'/“', u'').replace(u'/”', u'')\
            .replace(u'/…', u'').replace(u'…', u'...')

        # NOTE(review): if file_name can contain a sub-directory component,
        # that directory must already exist under `output` -- verify against
        # utils.push_data_to_stack.
        with open(output + '/' + file_name, 'w', encoding='utf-8') as f:
            f.write(content)
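# [web-page residue, not part of the program] 评论列表 (comment list)
# [web-page residue, not part of the program] 文章目录 (article table of contents)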