def corpus_ter(hypotheses, references, case_sensitive=True, tercom_path=None, **kwargs):
    """Score hypotheses against references by running the tercom jar.

    Each hypothesis/reference pair is written to its own temporary file as
    ``"<sentence> (<index>)"`` (one pair per line, index starting at 0), and
    ``java -jar <tercom_path> -h <hyp_file> -r <ref_file>`` is executed.
    The value following ``Total TER:`` in the tool's output is parsed.

    Args:
        hypotheses: iterable of hypothesis sentences (already formatted
            as strings, or anything ``str.format`` can render).
        references: iterable of reference sentences, paired positionally
            with ``hypotheses``; pairing stops at the shorter iterable.
        case_sensitive: when truthy, ``-s`` is appended to the command line
            (presumably tercom's case-sensitivity switch -- confirm against
            the tercom usage text).
        tercom_path: path to the tercom jar; falls back to
            ``'scripts/tercom.jar'`` when falsy.
        **kwargs: accepted and ignored.

    Returns:
        A 2-tuple ``(score, '')`` where ``score`` is the parsed
        ``Total TER`` value multiplied by 100 and the second item is always
        the empty string.

    Raises:
        subprocess.CalledProcessError: if the java process exits non-zero.
        IndexError: if the output contains no ``Total TER: <value> `` match.
    """
    jar = tercom_path or 'scripts/tercom.jar'

    with tempfile.NamedTemporaryFile('w') as hyp_fh, \
            tempfile.NamedTemporaryFile('w') as ref_fh:
        handles = (hyp_fh, ref_fh)

        # One line per segment in each file, tagged with the shared index so
        # the two files can be aligned segment by segment.
        for idx, pair in enumerate(zip(hypotheses, references)):
            for fh, sentence in zip(handles, pair):
                fh.write('{} ({})\n'.format(sentence, idx))

        # The child process opens the files by name, so buffered data must
        # reach disk before it starts.
        for fh in handles:
            fh.flush()

        command = ['java', '-jar', jar, '-h', hyp_fh.name, '-r', ref_fh.name]
        command += ['-s'] if case_sensitive else []
        raw_output = subprocess.check_output(command)

    report = raw_output.decode()
    matches = re.findall(r'Total TER: (.*?) ', report, re.MULTILINE)
    return float(matches[0]) * 100, ''
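# (web-page residue, not part of the code) "Comment list"
# (web-page residue, not part of the code) "Article table of contents"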