def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, fix the near-misses, and compute a score.

    Every word of at least _MIN_WORD_LEN characters returned by the
    tokenizer is looked up in the dictionary, and contributes to the score
    as follows:

    - word accepted by the dictionary: +100
    - word for which the dictionary has no suggestion at all: -10
    - word whose first suggestion is at a Levenshtein distance of at most
      _MAX_LEVENSHTEIN_DISTANCE: the word is replaced by that suggestion in
      the returned text, +5
    - word whose first suggestion is further away: left untouched, neutral
      (the score is not modified)

    Arguments:
        spelling_lang --- value given to enchant.request_dict() and
            enchant.tokenize.get_tokenizer() to select the dictionary and
            the tokenizer
        txt --- the text to check

    Returns:
        A tuple : (fixed text, score)

    Raises:
        AssertionError --- when called on Windows (os.name == "nt")
    """
    if os.name == "nt":
        # Raised explicitly instead of using an `assert` statement: asserts
        # are stripped by `python -O`, which would make this function
        # silently return None instead of the documented tuple.
        raise AssertionError("check_spelling() not available on Windows")

    # The whole enchant session (dictionary request, tokenization, lookups)
    # is done while holding the module-wide lock.
    with _ENCHANT_LOCK:
        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        # Word positions given by the tokenizer refer to the original text.
        # 'offset' accumulates the length differences introduced by the
        # replacements already made, so positions can be mapped to the
        # current (partially fixed) text.
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue

            suggestions = words_dict.suggest(word)
            if not suggestions:
                # this word is useless. It may even indicate a bad
                # orientation
                score -= 10
                continue

            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # hm, this word looks like it's in a bad shape: neither
                # fixed nor counted
                continue

            # Lazy %-arguments: the message is only built if debug logging
            # is actually enabled.
            logger.debug("Spell checking: Replacing: %s -> %s",
                         word, main_suggestion)

            # let's replace the word by its suggestion
            start = word_pos + offset
            end = start + len(word)
            txt = txt[:start] + main_suggestion + txt[end:]
            offset += len(main_suggestion) - len(word)

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
评论列表
文章目录