def check_split(source, target, edits):
    """Return True if the edits look like one string split into several tokens.

    The check is symmetric: it also succeeds when several source tokens
    correspond to a single target string (i.e. a merge).

    Args:
        source: Sequence sliced as ``source[e[1]:e[2]]``; each slice must
            expose its text as the string attribute ``orth_`` (presumably a
            spaCy Doc -- confirm against the caller).
        target: Sequence sliced as ``target[e[3]:e[4]]`` with the same
            ``orth_`` requirement.
        edits: Iterable of indexable edits; only positions 1-4 are read
            (source start/end, target start/end).

    Returns:
        True when exactly one side yields a single non-empty string, the
        other side yields two or more, and the single string starts with the
        first token, ends with the last token (without overlapping the
        first), and contains every middle token in order in between.
        False in every other case, including empty ``edits``.
    """
    src_toks = []
    tgt_toks = []
    # Collect the non-empty token strings on each side. Apostrophes are
    # dropped so that e.g. "n't" is compared as "nt".
    for e in edits:
        s_tok = source[e[1]:e[2]].orth_.replace("'", "")
        t_tok = target[e[3]:e[4]].orth_.replace("'", "")
        if s_tok:
            src_toks.append(s_tok)
        if t_tok:
            tgt_toks.append(t_tok)

    # Exactly one side must be a single string and the other side several
    # tokens; equal counts (including 0/0 and 1/1) can never be a split.
    if len(src_toks) == 1 and len(tgt_toks) > 1:
        string, tokens = src_toks[0], tgt_toks
    elif len(tgt_toks) == 1 and len(src_toks) > 1:
        string, tokens = tgt_toks[0], src_toks
    else:
        return False

    first, last = tokens[0], tokens[-1]
    # The string must begin with the first token ...
    if not string.startswith(first):
        return False
    string = string[len(first):]
    # ... and what remains must end with the last token. Testing the
    # remainder (not the full string) stops the two from overlapping.
    if not string.endswith(last):
        return False
    string = string[:-len(last)]

    # Every middle token must occur, in order, in what is left. Characters
    # between tokens are tolerated; only a missing token fails the check.
    for tok in tokens[1:-1]:
        pos = string.find(tok)
        if pos == -1:
            return False
        string = string[pos + len(tok):]
    return True
# Input 1: Spacy source sentence
# Input 2: Spacy target sentence
# Input 3: The alignment between the 2; [e.g. M, M, S, S, M]
# Function that decides whether to merge, or keep separate, adjacent edits of various types
# Processes 1 alignment at a time
评论列表
文章目录