def write_unitn(cls, out_path, unitn_path, download_path, is_train):
    """Merge a unitn file with its downloaded counterpart into JSON lines.

    The two input files are read in lockstep, one line from each per record.
    Every line must look like ``<digits>\\t<doc id>\\t<label>\\t<text>`` where
    the label is ``negative``, ``neutral`` or ``positive``; the first column
    is not used. For each pair one JSON object with the keys ``id``, ``text``
    and ``label`` is appended to ``out_path``, followed by a newline.

    The text written is the downloaded text, unless that text is exactly
    ``'Not Available'``, in which case the cleaned-up unitn text is used.

    Args:
        cls: Object exposing ``class_map``, a mapping from the label string
            to the value stored under ``label``.
        out_path: File the JSON lines are appended to (opened with ``'a+'``).
        unitn_path: Path of the unitn file.
        download_path: Path of the downloaded file.
        is_train: When true, the unitn text is additionally HTML-unescaped
            and doubled double-quotes are collapsed to a single one.

    Raises:
        ValueError: If a line of either input does not match the expected
            tab-separated layout.
        AssertionError: If the document id or the label of a line pair
            differ between the two files.
    """
    line_re = re.compile(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)')

    def parse(line, path, line_no):
        # Split one input line into (doc id, label, text); fail loudly with
        # the location instead of an AttributeError on None.
        match = line_re.match(line)
        if match is None:
            raise ValueError('{}:{}: malformed line: {!r}'.format(path, line_no, line))
        return match.groups()

    with open(unitn_path) as unitn_sr, open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
        # zip() stops at the shorter file; extra trailing lines are ignored.
        for line_no, (unitn_line, download_line) in enumerate(zip(unitn_sr, download_sr), start=1):
            doc_id_unitn, label_unitn, text_unitn = parse(unitn_line, unitn_path, line_no)
            doc_id_download, label_download, text_download = parse(download_line, download_path, line_no)

            # Raised explicitly (not via `assert`) so the checks survive
            # `python -O`; the exception type is kept as AssertionError.
            if doc_id_unitn != doc_id_download:
                raise AssertionError('line {}: doc id mismatch: {!r} != {!r}'.format(
                    line_no, doc_id_unitn, doc_id_download))
            if label_unitn != label_download:
                raise AssertionError('line {}: label mismatch: {!r} != {!r}'.format(
                    line_no, label_unitn, label_download))

            # Expand backslash escapes contained in the unitn text.
            # NOTE(review): encode() yields UTF-8 bytes while 'unicode-escape'
            # decodes bytes as Latin-1, so non-ASCII characters are altered by
            # this round-trip; presumably the unitn file is ASCII-only - confirm.
            text_unitn = text_unitn.encode().decode('unicode-escape')
            # Presumably a mis-decoded right single quotation mark - confirm.
            text_unitn = text_unitn.replace(r'’', '\'')
            if is_train:
                text_unitn = html.unescape(text_unitn)
                # NOTE(review): the original indentation was lost; this line is
                # assumed to belong to the `is_train` branch - confirm.
                text_unitn = text_unitn.replace('""', '"')

            # The downloaded text is unescaped twice (handles e.g. '&amp;amp;').
            text_download = html.unescape(html.unescape(text_download))

            # Prefer the downloaded text when it exists. The two texts are not
            # compared: some differences are impossible to reconcile and some
            # unitn data have the wrong order.
            text = text_unitn if text_download == 'Not Available' else text_download

            record = {'id': doc_id_unitn, 'text': text, 'label': cls.class_map[label_unitn]}
            out_sr.write(json.dumps(record) + '\n')
评论列表
文章目录