def __init__(self, lang, suggest, word_list_filename, filters=None):
    """Set up the dictionary and tokenizer used for spell checking.

    Args:
        lang: Language tag handed to both ``enchant.DictWithPWL`` and
            ``get_tokenizer``.
        suggest: Stored unchanged on ``self.suggest``.
        word_list_filename: Passed as the second argument to
            ``enchant.DictWithPWL`` (the personal word list).
        filters: Optional list forwarded to ``get_tokenizer``. A fresh empty
            list is used when omitted.
    """
    # A literal ``[]`` default is a single list object shared by every call
    # that omits ``filters``; create a new list per call instead.
    if filters is None:
        filters = []
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    self.tokenizer = get_tokenizer(lang, filters)
    # Keep a second reference to the tokenizer built above.
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
python类DictWithPWL()的实例源码
def test_pwl(self):
    """Run the checker loop against a dictionary built with DictWithPWL."""
    from enchant import DictWithPWL
    pwl_dict = DictWithPWL("en_US", None, None)
    sample = "I am sme text to be cheked with personal list of cheked words"
    checker = SpellChecker(pwl_dict, sample)
    # Misspelling expected at each position of the error stream.
    expected_words = {0: "sme", 1: "cheked"}
    for n, err in enumerate(checker):
        if n in expected_words:
            self.assertEqual(err.word, expected_words[n])
        if n == 1:
            # Add the current word through the checker; the final assertion
            # expects no further errors to be reported after this point.
            checker.add()
    self.assertEqual(n, 1)
def __init__(self, lang, suggest, word_list_filename, filters=None):
    """Create the spell-checking dictionary and tokenizer for ``lang``.

    Args:
        lang: Language tag used for ``enchant.DictWithPWL`` and for
            ``get_tokenizer``.
        suggest: Stored as-is on ``self.suggest``.
        word_list_filename: Second argument to ``enchant.DictWithPWL``
            (the personal word list).
        filters: Optional list handed to ``get_tokenizer``; defaults to a
            new empty list.
    """
    # Avoid a mutable ``[]`` default: that list would be shared between all
    # calls that do not pass ``filters``.
    if filters is None:
        filters = []
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    self.tokenizer = get_tokenizer(lang, filters)
    # Second reference to the tokenizer created here.
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
def even_or_odd(self, message=None, match=None, to=None):
    """Play one even/odd round and build the reply message.

    The parity of the length of the ``evenOrOdd`` match group is compared
    with the parity of a random integer between 1 and 10 (inclusive).

    Args:
        message: Incoming message; its ``getFrom()`` value is the recipient.
        match: Regex match object providing the ``evenOrOdd`` group.
        to: Not used by this function.

    Returns:
        A ``TextMessageProtocolEntity`` containing the drawn number and
        whether the player won or lost.
    """
    guess_parity = len(match.group("evenOrOdd")) % 2
    num = random.randint(1, 10)
    # The player wins exactly when both parities agree.
    if num % 2 == guess_parity:
        reply = "[%d]\nYou win." % num
    else:
        reply = "[%d]\nYou lose!" % num
    return TextMessageProtocolEntity(reply, to=message.getFrom())
# def beban_spell_checker(self, message=None, match=None, to=None):
# print(message.getBody())
# correctionList = ""
# text = message.getBody()
# d = enchant.DictWithPWL("es_MX","wordList.txt")
# d_en = enchant.Dict("en_US")
# wordList = text.split()
# for word in wordList:
# if(word.isalnum() == True):
# print(word)
# if(d.check(word) == False):
# # if(d_en.check(word) == False):
# solutions = d.suggest(word)
# print(solutions)
# sol = str(solutions[0])
# if(sol.isalnum() == False):
# correctionList += sol + "* "
# if (correctionList != ""):
# print(correctionList)
# return TextMessageProtocolEntity(correctionList, to=message.getFrom())
def __init__(self, lang, suggest, word_list_filename, filters=None):
    """Initialise the dictionary, tokenizer and suggestion setting.

    Args:
        lang: Language tag passed to ``enchant.DictWithPWL`` and
            ``get_tokenizer``.
        suggest: Kept unchanged on ``self.suggest``.
        word_list_filename: Personal word list given to
            ``enchant.DictWithPWL`` as its second argument.
        filters: Optional list passed on to ``get_tokenizer``; a fresh empty
            list is substituted when not supplied.
    """
    # ``filters=[]`` in the signature would reuse one list across calls;
    # use ``None`` as the sentinel and allocate per call.
    if filters is None:
        filters = []
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    self.tokenizer = get_tokenizer(lang, filters)
    # Hold on to the tokenizer created at construction time.
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
def suggest(self):
    """Return spelling suggestions for ``self.word``.

    Returns:
        ``None`` when the word contains anything other than the letters
        a-z/A-Z, digits, apostrophes, hyphens, dots or whitespace; otherwise
        whatever the enchant dictionary's ``suggest`` returns for the word.
    """
    # Strip every allowed character; anything left over disqualifies the word.
    if re.sub(r'[a-zA-Z\d\'\-\.\s]', '', self.word):
        return None
    import enchant
    try:
        d = enchant.DictWithPWL(
            'en_US', path + '/data/spell-checker/american-english-large')
    except Exception:
        # Best-effort fallback to the plain dictionary when the one with the
        # extra word list cannot be built. ``Exception`` (not a bare except)
        # so KeyboardInterrupt/SystemExit still propagate.
        d = enchant.Dict('en_US')
    return d.suggest(self.word)
def __init__(self):
    """Create the stemmer, vectorizer-derived helpers and spell checker."""
    self.stemmer = LancasterStemmer()
    # CountVectorizer converts text documents to a matrix of token counts;
    # strip_accents='ascii' removes accents during preprocessing.
    vectorizer = CountVectorizer(strip_accents='ascii')
    self.vectorizer = vectorizer
    # Reuse the vectorizer's own tokenizer and preprocessor callables.
    self.tokenizer = vectorizer.build_tokenizer()
    self.preprocessor = vectorizer.build_preprocessor()
    # en_US dictionary extended with the configured personal word list.
    self.spellchecker = enchant.DictWithPWL(
        "en_US", pwl=path_config.PERSONAL_WORD_DICTIONARY_FILE)
def open(self):
    """Prepare spell-checking state from ``self.config``.

    Leaves ``self.initialized`` False and returns early when ``enchant`` is
    None or no spelling dictionary is configured. Otherwise sets up the
    ignore list, the dictionary (with the private word list when one is
    configured), the unknown-word set and the punctuation regex.
    """
    self.initialized = False
    self.private_dict_file = None

    # Nothing to set up without enchant or without a configured dictionary.
    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    ignored = []
    for raw_word in self.config.spelling_ignore_words.split(","):
        ignored.append(raw_word.strip())
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    ignored += ["param", "pylint"]
    self.ignore_list = ignored

    private_path = self.config.spelling_private_dict_file
    if not private_path:
        self.spelling_dict = enchant.Dict(dict_name)
    else:
        self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
        # Opened in append mode and kept on the instance.
        self.private_dict_file = open(private_path, "a")

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Regex for stripping punctuation signs from text; ' and _ are left out
    # of the character class because they are treated in a special way.
    stripped_signs = "".join(
        sign for sign in string.punctuation if sign not in ("'", "_"))
    self.punctuation_regex = re.compile('[%s]' % re.escape(stripped_signs))
    self.initialized = True
def open(self):
    """Initialise the spelling checker state from the configuration.

    Returns early, with ``self.initialized`` still False, when ``enchant``
    is None or ``self.config.spelling_dict`` is empty. Otherwise builds the
    ignore list, the enchant dictionary, the optional unknown-word set and
    the punctuation-stripping regex, then marks the instance initialized.
    """
    self.initialized = False
    self.private_dict_file = None

    # Bail out when enchant is unavailable or no dictionary is configured.
    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    ignored = []
    for raw_word in self.config.spelling_ignore_words.split(","):
        ignored.append(raw_word.strip())
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    ignored += ["param", "pylint"]
    self.ignore_list = ignored

    private_path = self.config.spelling_private_dict_file
    if not private_path:
        self.spelling_dict = enchant.Dict(dict_name)
    else:
        self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
        # The private dictionary file is opened for appending and stored.
        self.private_dict_file = open(private_path, "a")

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Character class of all punctuation signs except ' and _, which are
    # treated in a special way.
    stripped_signs = "".join(
        sign for sign in string.punctuation if sign not in ("'", "_"))
    self.punctuation_regex = re.compile('[%s]' % re.escape(stripped_signs))
    self.initialized = True
def get_new_dictionary(dictionary_lang="en_GB"):
    """Build an enchant dictionary combined with the personal word list.

    Args:
        dictionary_lang: Language tag for the dictionary.

    Returns:
        An ``enchant.DictWithPWL`` whose personal word list is
        ``personal-words-list.txt`` under ``CONFIG_PATH``.
    """
    return enchant.DictWithPWL(
        dictionary_lang,
        os.path.join(CONFIG_PATH, 'personal-words-list.txt'),
    )
def open(self):
    """Set up everything the spelling check needs, based on ``self.config``.

    When ``enchant`` is None or no dictionary name is configured the method
    returns right after resetting ``initialized`` and ``private_dict_file``.
    Otherwise it creates the ignore list, the dictionary, the optional
    unknown-word set and the punctuation regex.
    """
    self.initialized = False
    self.private_dict_file = None

    # Without enchant or a dictionary name there is nothing to initialise.
    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    ignored = []
    for raw_word in self.config.spelling_ignore_words.split(","):
        ignored.append(raw_word.strip())
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    ignored += ["param", "pylint"]
    self.ignore_list = ignored

    private_path = self.config.spelling_private_dict_file
    if not private_path:
        self.spelling_dict = enchant.Dict(dict_name)
    else:
        self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
        # Append-mode handle to the private dictionary, kept on the instance.
        self.private_dict_file = open(private_path, "a")

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Prepare the regex that strips punctuation signs from text.
    # ' and _ are treated in a special way, so they are excluded here.
    stripped_signs = "".join(
        sign for sign in string.punctuation if sign not in ("'", "_"))
    self.punctuation_regex = re.compile('[%s]' % re.escape(stripped_signs))
    self.initialized = True
def open(self):
    """Configure the spelling checker according to ``self.config``.

    ``self.initialized`` only becomes True when ``enchant`` is not None and
    a spelling dictionary name is configured; in that case the ignore list,
    dictionary, optional unknown-word set and punctuation regex are created.
    """
    self.initialized = False
    self.private_dict_file = None

    # Early exits: enchant missing, or no dictionary selected.
    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    ignored = []
    for raw_word in self.config.spelling_ignore_words.split(","):
        ignored.append(raw_word.strip())
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    ignored += ["param", "pylint"]
    self.ignore_list = ignored

    private_path = self.config.spelling_private_dict_file
    if not private_path:
        self.spelling_dict = enchant.Dict(dict_name)
    else:
        self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
        # Keep an append-mode file object for the private dictionary.
        self.private_dict_file = open(private_path, "a")

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Punctuation-stripping regex; ' and _ are treated in a special way and
    # therefore not part of the character class.
    stripped_signs = "".join(
        sign for sign in string.punctuation if sign not in ("'", "_"))
    self.punctuation_regex = re.compile('[%s]' % re.escape(stripped_signs))
    self.initialized = True
def spellcheck_hints(args, packages):
    """Spell-check package hint texts and print a misspelling summary.

    Words from ``words.txt`` and the components of every package name are
    added to the dictionary first. Then the ``sdesc``, ``ldesc`` and
    ``message`` hints of each package are checked and every reported word is
    counted. Finally the counts are printed, most frequent first.

    Args:
        args: Not used by this function.
        packages: Mapping of package name to an object whose ``hints``
            attribute maps hint keys to text.
    """
    spelldict = DictWithPWL('en-US')
    chkr = SpellChecker(spelldict, filters=[DescFilter])
    misspellings = {}

    # add technical words not in spell-checking dictionary
    # (a set, because it is only used for membership tests below)
    wordlist = set()
    with open('words.txt') as f:
        for line in f:
            # strip any trailing comment, then any whitespace
            w = re.sub(r'#.*$', '', line).strip()
            # blank and comment-only lines carry no word
            if not w:
                continue

            spelldict.add(w)
            wordlist.add(w.lower())

            # XXX: for the moment, to reduce the set of errors, ignore the fact
            # that words.txt gives a canonical capitalization, and accept any
            # capitalization
            spelldict.add(w.lower())
            spelldict.add(w.capitalize())

    # add all package names as valid words
    for p in packages:
        for part in re.split('[_-]', p):
            # remove punctuation characters
            part = re.sub(r'[+]', '', part)
            # strip off any trailing numbers
            part = re.sub(r'[\d.]*$', '', part)

            # both with and without any lib prefix; each candidate is checked
            # and added in its own right so the prefix-less form is included
            for w in (part, re.sub(r'^lib', '', part)):
                # stripping can leave nothing behind (e.g. a purely numeric
                # component, or a bare "lib")
                if not w:
                    continue
                # add the package name unless it exists in the list above, which
                # will give a canonical capitalization
                if w.lower() not in wordlist:
                    spelldict.add(w.lower())
                    spelldict.add(w)
                    spelldict.add(w.capitalize())

    # for each package
    for p in sorted(packages):
        # debuginfo packages have uninteresting, auto-generated text which
        # contains the package name
        if p.endswith('-debuginfo'):
            continue

        hints = packages[p].hints
        # spell-check the spell-checkable keys
        for k in ['sdesc', 'ldesc', 'message']:
            if k in hints:
                chkr.set_text(hints[k])
                # XXX: this is doing all the work to generate suggestions, which
                # we then ignore, so could be written much more efficiently
                for err in chkr:
                    misspellings[err.word] = misspellings.get(err.word, 0) + 1

    # summarize
    for c in sorted(misspellings, key=misspellings.get, reverse=True):
        print('%16s: %4d' % (c, misspellings[c]))
def tesseract_ocr_helper(base_image, config="Default"):
    """Run OCR on an image via pyocr and spell-correct the result.

    The first tool returned by ``pyocr.get_available_tools()`` and the first
    language that tool reports are used. The process exits with status 1 when
    no OCR tool is available.

    Args:
        base_image: Image handed to the tool's ``image_to_string``.
        config: When not ``"Default"``, set as the only entry of the
            builder's ``tesseract_configs``.

    Returns:
        The recognised text, with each space-separated token that fails the
        dictionary check replaced by its first suggestion (tokens without
        suggestions are kept as they are).
    """
    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        sys.exit(1)
    # The tools are returned in the recommended order of usage
    tool = tools[0]
    print("Will use tool '%s'" % (tool.get_name()))

    langs = tool.get_available_languages()
    print("Available languages: %s" % ", ".join(langs))
    lang = langs[0]
    print("Will use lang '%s'" % (lang))

    custom_builder = pyocr.builders.TextBuilder()
    if config != "Default":
        custom_builder.tesseract_configs = [config]

    txt = tool.image_to_string(
        base_image,
        lang=lang,
        builder=custom_builder
    )

    # Spell correct
    dict_path = os.path.join(os.path.dirname(__file__), "dict/urban_dict.txt")
    d = enchant.DictWithPWL("en_US", dict_path)

    # Surround newlines with spaces so they become separate tokens and
    # survive the split/join round trip.
    corrected = []
    for token in txt.replace('\n', ' \n ').split(" "):
        replacement = token
        # Empty tokens and newline markers are never passed to the dictionary.
        if token and token != '\n' and d.check(token) is False:
            # Ask for suggestions once and reuse the list.
            suggestions = d.suggest(token)
            if suggestions:
                replacement = suggestions[0]
        corrected.append(replacement)
    return " ".join(corrected)
def _init_spell_checker(self):
    """
    Initialize the spell checker and its dictionary.

    The jargon file is taken from the ``jargonfile`` test parameter or, when
    that is not set, from the ``JARGONFILE`` environment variable. If no
    dictionary could be built from a jargon file, a plain ``en_US``
    dictionary is created and the words of a ``jargon.txt`` fetched from the
    module's repository (if the request succeeds) are added for the session.
    The dash-separated parts of the module name are always added as well.

    Returns:
        A ``SpellChecker`` built on the prepared dictionary.

    NOTE(review): the source this block came from had lost its indentation;
    the remote jargon fetch is assumed to sit inside the "no dictionary yet"
    branch -- confirm against the upstream file.
    """

    default_dict = "en_US"
    spell_dict = None

    jargonfile = self.params.get('jargonfile')
    if not jargonfile:
        jargonfile = os.environ.get('JARGONFILE')
    if jargonfile is not None:
        try:
            jargonfile = str(jargonfile)
            spell_dict = DictWithPWL(default_dict, jargonfile)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not reported as dictionary failures.
            self.error(
                "Could not initialize dictionary using %s file" % jargonfile)

    if not spell_dict:
        try:
            spell_dict = DictWithPWL(default_dict)
        except Exception:
            self.error(
                "Could not initialize spell checker with dictionary %s" % default_dict)

        # Check if there is jargonfile on module repo
        url = ("https://src.fedoraproject.org/cgit/modules/%s.git/plain/jargon.txt" %
               self.mmd.name)
        resp = requests.get(url)
        if 200 <= resp.status_code < 300:
            # Use the decoded text: ``resp.content`` is bytes and cannot be
            # split with a str separator on Python 3.
            for line in resp.text.splitlines():
                word = line.strip()
                if word:
                    spell_dict.add_to_session(word)

    # add words from module name as jargon
    for w in self.mmd.name.split('-'):
        spell_dict.add_to_session(w)

    try:
        chkr = SpellChecker(spell_dict)
    except Exception:
        self.error("Could not initialize spell checker")

    return chkr