def parse_line(frequency_dict, word_index_dict, nynorsk_line, bokmaal_line):
    """Count position-aligned Nynorsk/Bokmaal word pairs from one sentence pair.

    Both lines are split into word tokens and paired by position. For every
    accepted pair the two tokens are mapped to keys with ``get_index_key``
    and the count stored under ``(nynorsk_key, bokmaal_key)`` in
    ``frequency_dict`` is incremented in place.

    A sentence pair contributes nothing when the two lines do not have the
    same number of tokens. Within a sentence, a pair is skipped when either
    token carries the ``*`` failed-translation marker, and processing of the
    sentence stops at the third consecutive pair whose ``fuzz.ratio`` is
    below 40.

    Args:
        frequency_dict: Mapping from ``(nynorsk_key, bokmaal_key)`` tuples to
            counts; updated in place.
        word_index_dict: Passed through to ``get_index_key`` for each token.
        nynorsk_line: The Nynorsk sentence.
        bokmaal_line: The corresponding Bokmaal sentence.

    Returns:
        None. The result is the mutation of ``frequency_dict``.
    """
    # Keep an optional leading '*' attached to its word. A plain r'\w+' strips
    # the marker, so the failed-translation check below could never fire.
    token_pattern = r'\*?\w+'
    nn_tokens = re.findall(token_pattern, nynorsk_line, re.UNICODE)
    nb_tokens = re.findall(token_pattern, bokmaal_line, re.UNICODE)

    # Drop the whole sentence if it doesn't have the same number of tokens.
    if len(nn_tokens) != len(nb_tokens):
        return

    consecutive_skips = 0
    for nn_token, nb_token in zip(nn_tokens, nb_tokens):
        # If translation fails, the word is prefixed with '*'.
        if '*' in nb_token or '*' in nn_token:
            continue

        # If the edit distance ratio is lower than 40 % for three consecutive
        # words, we conclude that we have gone astray, and drop the rest of
        # the sentence.
        if fuzz.ratio(nn_token, nb_token) < 40:
            consecutive_skips += 1
            if consecutive_skips == 3:
                break
        else:
            consecutive_skips = 0

        # NOTE(review): the indentation of this function was lost in the
        # source it was recovered from. The counting below is placed at loop
        # level, so a low-ratio pair is still counted until the third
        # consecutive one ends the sentence -- confirm against the original.
        nn_token_idx = get_index_key(word_index_dict, nn_token)
        nb_token_idx = get_index_key(word_index_dict, nb_token)
        pair = (nn_token_idx, nb_token_idx)
        frequency_dict[pair] = frequency_dict.get(pair, 0) + 1
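# 评论列表 (comment list) -- web page navigation residue, not part of the code
# 文章目录 (article table of contents) -- web page navigation residue, not part of the code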