# twitter.py source file - Python code snippet

def write_unitn(cls, out_path, unitn_path, download_path, is_train):
    """Merge a unitn file with its downloaded counterpart into JSON lines.

    Both input files are read in lockstep, one line at a time.  Every line
    must look like ``<digits>\\t<doc id>\\t<label>\\t<text>`` where the label
    is ``negative``, ``neutral`` or ``positive``.  For each pair of lines one
    JSON object with the keys ``id``, ``text`` and ``label`` is appended to
    ``out_path``.  The downloaded text is used unless it is the placeholder
    ``Not Available``, in which case the cleaned-up unitn text is used.

    Args:
        cls: Object providing ``class_map``, a mapping from the label string
            to the value stored under ``label`` in the output.
        out_path: Path of the output file; opened in append mode.
        unitn_path: Path of the unitn file.
        download_path: Path of the downloaded file.
        is_train: When true, the unitn text is additionally HTML-unescaped
            and doubled double quotes (``""``) are collapsed to ``"``.

    Raises:
        AssertionError: If the document ids or labels of a pair of lines
            disagree.
        AttributeError: If a line does not match the expected format
            (``re.match`` returns ``None``).

    Note:
        Lines are paired with ``zip``, so processing stops at the end of the
        shorter file without raising.
    """
    # Both files share the same line layout; compile the pattern once.
    line_pattern = re.compile(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)')

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(unitn_path, encoding='utf-8') as unitn_sr, \
            open(download_path, encoding='utf-8') as download_sr, \
            open(out_path, 'a+', encoding='utf-8') as out_sr:
        for unitn_line, download_line in zip(unitn_sr, download_sr):
            doc_id_unitn, label_unitn, text_unitn = \
                line_pattern.match(unitn_line).groups()
            doc_id_download, label_download, text_download = \
                line_pattern.match(download_line).groups()

            # Resolve backslash escapes (e.g. a literal "\u2019") in the
            # unitn text.  Encoding with latin-1 + backslashreplace keeps
            # every character intact: 'unicode-escape' decodes plain bytes
            # as latin-1, and anything outside latin-1 is turned into an
            # escape that is decoded right back.  A plain UTF-8 encode()
            # here would turn literal non-ASCII characters into mojibake.
            text_unitn = text_unitn \
                .encode('latin-1', 'backslashreplace') \
                .decode('unicode-escape')
            text_unitn = text_unitn.replace('’', '\'')
            if is_train:
                text_unitn = html.unescape(text_unitn)
                text_unitn = text_unitn.replace('""', '"')

            # The downloaded text is unescaped twice.
            text_download = html.unescape(html.unescape(text_download))

            # Raised explicitly (instead of via ``assert``) so the checks
            # still run under ``python -O``; the exception type is unchanged.
            if doc_id_unitn != doc_id_download:
                raise AssertionError('document id mismatch: {!r} != {!r}'.format(
                    doc_id_unitn, doc_id_download))
            if label_unitn != label_download:
                raise AssertionError('label mismatch for document {}: {!r} != {!r}'.format(
                    doc_id_unitn, label_unitn, label_download))

            # Prefer the downloaded text when it is available.  The two texts
            # are deliberately not required to be equal: some differences are
            # impossible to reconcile, and some unitn data have the wrong
            # order.
            text = text_unitn
            if text_download != 'Not Available':
                text = text_download

            record = {'id': doc_id_unitn, 'text': text, 'label': cls.class_map[label_unitn]}
            out_sr.write(json.dumps(record) + '\n')