import csv
import os
import xml.etree.ElementTree as ET


def clean_dataset(path):
    """Remove duplicate paraphrase pairs from the dataset and write the clean data to a .tsv file.

    Args:
        path: path to the XML dataset file
    """
    with open(path, 'r') as labels_file:
        context = ET.iterparse(labels_file, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        # get the root element so it can be cleared as we go
        event, root = next(context)
        with open(os.path.splitext(path)[0] + '.tsv', 'w', newline='') as tsv_file:
            writer = csv.writer(tsv_file, delimiter='\t')
            # question pairs that have already been written, used for deduplication
            same_set = set()
            for event, elem in context:
                if event == "end" and elem.tag == "paraphrase":
                    question = []
                    y = None
                    # collect the two texts and the class label for this pair
                    for child in elem.iter():
                        if child.get('name') == 'text_1':
                            question.append(child.text)
                        elif child.get('name') == 'text_2':
                            question.append(child.text)
                        elif child.get('name') == 'class':
                            # binarize the label: non-negative classes count as paraphrases
                            y = 1 if int(child.text) >= 0 else 0
                    check_string = "\n".join(question)
                    # write each unique pair only once
                    if check_string not in same_set:
                        writer.writerow([y, question[0], question[1]])
                        same_set.add(check_string)
                    # clear processed elements from the root to keep memory usage low
                    root.clear()
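

# A minimal usage sketch, assuming a paraphrase XML dataset; the file name
# 'paraphrases.xml' below is a hypothetical placeholder for the real path.
# Running it writes a deduplicated 'paraphrases.tsv' with [label, text_1, text_2] rows
# next to the source file.
if __name__ == '__main__':
    clean_dataset('paraphrases.xml')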