import csv
import os
import xml.etree.ElementTree as ET


def clean_dataset(path):
    """Remove duplicate paraphrase pairs from the dataset and write the clean data to a .tsv file.

    Args:
        path: path to the XML dataset file
    """
    with open(path, 'r') as labels_file:
        context = ET.iterparse(labels_file, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        # get the root element so it can be cleared as we go
        event, root = next(context)
        with open(os.path.splitext(path)[0] + '.tsv', 'w', newline='') as tsv_file:
            writer = csv.writer(tsv_file, delimiter='\t')
            # question pairs that have already been written, used for deduplication
            same_set = set()
            for event, elem in context:
                if event == "end" and elem.tag == "paraphrase":
                    question = []
                    y = None
                    # collect the two texts and the class label for this pair
                    for child in elem.iter():
                        if child.get('name') == 'text_1':
                            question.append(child.text)
                        elif child.get('name') == 'text_2':
                            question.append(child.text)
                        elif child.get('name') == 'class':
                            # binarize the label: non-negative classes count as paraphrases
                            y = 1 if int(child.text) >= 0 else 0
                    check_string = "\n".join(question)
                    # write each unique pair only once
                    if check_string not in same_set:
                        writer.writerow([y, question[0], question[1]])
                        same_set.add(check_string)
                    # clear processed elements from the root to keep memory usage low
                    root.clear()
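

# A minimal usage sketch, assuming a paraphrase XML dataset; the file name
# 'paraphrases.xml' below is a hypothetical placeholder for the real path.
# Running it writes a deduplicated 'paraphrases.tsv' with [label, text_1, text_2] rows
# next to the source file.
if __name__ == '__main__':
    clean_dataset('paraphrases.xml')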