# create_synonyms.py - source file (Python code snippet)

def parse_line(frequency_dict, word_index_dict, nynorsk_line, bokmaal_line):
    """Count position-aligned word pairs from one Nynorsk/Bokmaal sentence pair.

    Both lines are split into word tokens. If the token counts differ, the
    sentence pair is dropped and nothing is recorded. Otherwise the tokens
    are walked in lockstep and, for each accepted pair, the counter stored
    under ``(nn_index, nb_index)`` in ``frequency_dict`` is incremented.

    Args:
        frequency_dict: Mutable mapping from ``(nn_index, nb_index)`` tuples
            to occurrence counts; updated in place.
        word_index_dict: Passed through to ``get_index_key`` together with
            each token (presumably a word-to-index table; verify against
            ``get_index_key``).
        nynorsk_line: The Nynorsk sentence.
        bokmaal_line: The corresponding Bokmaal sentence.

    Returns:
        None. The result is the mutation of ``frequency_dict``.
    """
    # Keep an optional leading '*' attached to the word. A bare r'\w+' can
    # never match '*', which made the untranslated-word check below
    # unreachable and let failed translations be counted as valid pairs.
    token_pattern = r'\*?\w+'
    flags = re.MULTILINE | re.UNICODE
    nn_tokenized = re.findall(token_pattern, nynorsk_line, flags)
    nb_tokenized = re.findall(token_pattern, bokmaal_line, flags)

    if len(nn_tokenized) != len(nb_tokenized):
        # Drop the whole sentence if it doesn't have the same number of tokens.
        return

    consecutive_skips = 0
    for nn_token, nb_token in zip(nn_tokenized, nb_tokenized):

        # If translation fails, the word is prefixed with '*'.
        if '*' in nb_token or '*' in nn_token:
            continue

        # If the edit distance ratio is lower than 40 % for three consecutive
        # words, we conclude that we have gone astray, and drop the rest of
        # the sentence. Note that the first two low-ratio pairs of such a run
        # are still counted; only the third one triggers the break.
        if fuzz.ratio(nn_token, nb_token) < 40:
            consecutive_skips += 1
            if consecutive_skips == 3:
                break
        else:
            consecutive_skips = 0

        # Look up the Nynorsk token first, then the Bokmaal one, matching the
        # original call order in case get_index_key updates word_index_dict.
        nn_token_idx = get_index_key(word_index_dict, nn_token)
        nb_token_idx = get_index_key(word_index_dict, nb_token)
        pair = (nn_token_idx, nb_token_idx)
        frequency_dict[pair] = frequency_dict.get(pair, 0) + 1