# align_text.py source file - Python code snippet

def check_split(source, target, edits):
    """Return True if the edits look like one token split into several.

    The check is symmetric: it also returns True when several tokens on
    one side appear to have been merged into a single token on the other.

    Args:
        source: Sliceable sequence whose slices expose an ``orth_`` string
            (presumably a spaCy Doc -- confirm against the caller).
        target: Same kind of object as ``source``, for the target side.
        edits: Iterable of indexable edits; ``e[1]:e[2]`` is used to slice
            ``source`` and ``e[3]:e[4]`` to slice ``target``.

    Returns:
        True when exactly one side contributes a single non-empty string and
        the other side contributes two or more tokens such that the string
        starts with the first token, ends with the last token, and contains
        the remaining tokens in order in between. False otherwise.
    """
    src_toks = []
    tgt_toks = []
    # Collect the non-empty token strings, ignoring apostrophes.
    for e in edits:
        s_tok = source[e[1]:e[2]].orth_.replace("'", "")
        t_tok = target[e[3]:e[4]].orth_.replace("'", "")
        if s_tok:
            src_toks.append(s_tok)
        if t_tok:
            tgt_toks.append(t_tok)

    # A split/merge needs exactly one string on one side and several tokens
    # on the other; every other combination (including equal counts) fails.
    if len(src_toks) == 1 and len(tgt_toks) > 1:
        string, tokens = src_toks[0], tgt_toks
    elif len(tgt_toks) == 1 and len(src_toks) > 1:
        string, tokens = tgt_toks[0], src_toks
    else:
        return False

    first, last = tokens[0], tokens[-1]
    # The single string must begin with the first token ...
    if not string.startswith(first):
        return False
    string = string[len(first):]
    # ... and what remains must end with the last token. Collected tokens are
    # never empty, so the negative slice below is always well-defined.
    if not string.endswith(last):
        return False
    string = string[:-len(last)]

    # Every middle token must occur, in order, in what is left. Characters
    # between matched tokens are tolerated.
    for tok in tokens[1:-1]:
        pos = string.find(tok)
        if pos == -1:
            # Token not found
            return False
        string = string[pos + len(tok):]
    # All tokens found
    return True

# Input 1: Spacy source sentence
# Input 2: Spacy target sentence
# Input 3: The alignment between the 2; [e.g. M, M, S, S, M]
# Function that decides whether to merge, or keep separate, adjacent edits of various types
# Processes 1 alignment at a time