def spellchecker(title):
    """Return *title* with every misspelled word replaced by a suggestion.

    The title is split on single spaces. A word accepted by the ``en_US``
    enchant dictionary is kept as is; any other word is replaced by the
    dictionary's first suggestion, or kept unchanged when the dictionary has
    no suggestion to offer. Each token of the result is followed by one
    space, so a corrected title always ends with a trailing space.

    If the enchant library cannot be imported, a message is printed and
    ``title`` is returned untouched.
    """
    try:
        # Import here so a missing enchant installation really surfaces as
        # the ImportError handled below.
        import enchant
        d = enchant.Dict("en_US")
    except ImportError:
        print("Enchant Library Not Found. Spell Checking Failed.")
        return title
    pieces = []
    for word in title.split(" "):
        # Empty tokens come from leading, trailing or repeated spaces;
        # enchant refuses to check an empty string, so pass them through.
        if word and not d.check(word):
            candidates = d.suggest(word)
            if candidates:
                word = candidates[0]
        pieces.append(word + " ")
    return "".join(pieces)
python类Dict()的实例源码
def misses_to_frame(parsed_lexemes: Iterable,
                    terms: Dict[str, str]=None) -> pd.DataFrame:
    """Tabulate the misses found in *parsed_lexemes* as a DataFrame.

    Each row holds the lower-cased miss (used as the index), the term it
    maps to in *terms* (the miss itself when there is no mapping) and the
    space-joined lexemes collected for that miss by ``collect_misses``.
    """
    lookup = terms if terms else {}
    miss_dict = collect_misses(parsed_lexemes)
    records = []
    for miss, lexemes in miss_dict.items():
        key = miss.lower()
        records.append(OrderedDict([
            ('miss', key),
            ('term', lookup.get(key, key)),
            ('lexemes', ' '.join(lexemes)),
        ]))
    return pd.DataFrame.from_records(
        records, index='miss', columns=['miss', 'term', 'lexemes'])
def getConfNum(msgText):
    """Extract a six-character confirmation number from *msgText*.

    When the text contains ``confNum=``, the six characters following the
    first occurrence of that marker are returned. Otherwise the text after
    its first 200 characters is scanned for stand-alone runs of six
    upper-case letters/digits, and the first run that the ``en_US`` enchant
    dictionary does not accept as a word is returned.

    Returns the confirmation number as a string, or ``None`` when no
    candidate is found.
    """
    marker = 'confNum='
    if marker in msgText:
        start = msgText.find(marker) + len(marker)
        return str(msgText[start:start + 6])

    pattern = re.compile(r'(?<![A-Za-z0-9])[A-Z0-9]{6}(?![A-Za-z0-9])')
    dictionary = None
    for match in pattern.finditer(msgText[200:]):
        candidate = match.group()
        # Load the dictionary only once a candidate actually needs checking.
        if dictionary is None:
            dictionary = enchant.Dict("en_US")
        # A real English word (e.g. "OFFICE") is not a confirmation number.
        if not dictionary.check(candidate):
            return str(candidate)
    # Nothing matched: report that explicitly rather than failing on an
    # unassigned local variable.
    return None
def process_vcode(self, response):
    """Solve the captcha shown on *response* and submit the answer.

    The captcha image URL is read from the page and passed to
    ``recognize_url``. If the recognized text is an ``en_US`` dictionary
    word, the captcha form (solution, captcha id and original URL) is
    posted back to ``response.url``; otherwise ``'wrong vcode'`` is printed.
    """
    vcode_url = response.css('#content > div > div.article > form > img::attr(src)').extract_first()
    vcode = recognize_url(vcode_url)
    import enchant
    import requests
    d = enchant.Dict("en_US")
    # Validate the recognized text itself; enchant rejects empty input, so
    # guard against an empty/None recognition result first.
    valid = bool(vcode) and d.check(vcode)
    if valid:
        id_index = response.url.find('id=')
        try:
            original_url = response.css(
                '#content > div > div.article > form > input[type="hidden"]:nth-child(8)::attr(value)').extract_first()
        except Exception:
            original_url = 'https://movie.douban.com/search/%E6%B0%B8%E4%BD%9C%E5%8D%9A%E7%BE%8E'
        # The captcha id is everything after "id=" in the request URL.
        vcode_id = response.url[id_index + 3:]
        # "{}" (not "") so that the values are actually placed in the form.
        frmdata = {"captcha-solution": "{}".format(vcode),
                   "captcha-id": "{}".format(vcode_id),
                   "original-url": "{}".format(original_url)}
        requests.post(url=response.url, data=frmdata, headers=response.headers)
    else:
        print('wrong vcode')
def breakWithOutWhiteSpace(sentence):
# Split `sentence` at full stops that are directly followed by word
# characters (i.e. a "." with no whitespace after it) and return the
# resulting pieces as a list of strings.
# NOTE(review): the indentation of this block was lost, so the nesting of
# the statements below cannot be confirmed from this view.
import re
# Pattern: a literal dot followed by one or more word characters.
r = "\.\w+"
sentences = []
tmp = re.findall(r, sentence, re.X)
# Candidate split offsets; offset 0 is always present.
places = [0]
if len(tmp)>0:
import enchant
# NOTE(review): "en_UK" is an unusual tag (British English is normally
# "en_GB") - confirm this dictionary exists on the target system.
d = enchant.Dict("en_UK")
for item in tmp:
# Drop the leading "." to get the word that follows the full stop.
word = item[1:]
if len(word)<2:
# A single character only counts as a word when it is "i" or "a".
if word.lower() in ['i','a']:
# NOTE(review): `item` is used as a regex here, so its leading "." matches
# any character, not only a literal dot - confirm this is intended.
places.extend([m.start() for m in re.finditer(item, sentence)])
else:
# Longer candidates must be accepted by the dictionary.
if d.check(item[1:]):
places.extend([m.start() for m in re.finditer(item, sentence)])
# De-duplicate and order the offsets, then add the index of the last
# character as the final boundary.
places = sorted(set(places))
places.append(len(sentence)-1)
i = 0
# Only the start and end boundaries exist: nothing to split.
if len(places)==2:
return [sentence]
start = 0
# Slice between each pair of consecutive boundaries.
while True:
start = places[i]
# For every boundary except the first, skip the character at the boundary.
if start>0:
start +=1
end = places[i+1] + 1
if end>len(sentence):
end = len(sentence)-1
sentences.append(sentence[start:end])
i +=1
# One piece per pair of consecutive boundaries has been produced.
if len(sentences)==len(places)-1:
break
return sentences
def run(self):
# Open the spell check dialog for the language configured under
# general['spell_check'].
# NOTE(review): the body uses `await`, which is only valid inside an
# `async def`; the header shown here is a plain `def` - confirm against
# the original source.
spell_check_lang = self.api.opt.general['spell_check']
# An empty/falsy setting means spell checking is turned off.
if not spell_check_lang:
bubblesub.ui.util.error('Spell check was disabled in config.')
return
try:
dictionary = enchant.Dict(spell_check_lang)
except enchant.errors.DictNotFoundError:
# No dictionary is installed for the configured language.
bubblesub.ui.util.error(
f'Spell check language {spell_check_lang} was not found.')
return
# Callback handed to the GUI layer; it builds the dialog with the
# dictionary loaded above.
async def run(api, main_window):
SpellCheckDialog(api, main_window, dictionary)
await self.api.gui.exec(run)
def __init__(self, api, *args):
    """Set up the dictionary and the text format used for misspellings.

    The dictionary language comes from ``api.opt.general['spell_check']``.
    ``self._dictionary`` is ``None`` when the setting is empty or when the
    dictionary for that language is not installed (a warning is logged in
    the latter case).
    """
    super().__init__(*args)
    spell_check_lang = api.opt.general['spell_check']
    if spell_check_lang:
        try:
            self._dictionary = enchant.Dict(spell_check_lang)
        except enchant.errors.DictNotFoundError:
            self._dictionary = None
            api.log.warn(f'dictionary {spell_check_lang} not installed.')
    else:
        self._dictionary = None

    # Red spell-check underline applied to misspelled words.
    fmt = QtGui.QTextCharFormat()
    fmt.setUnderlineColor(QtCore.Qt.red)
    fmt.setUnderlineStyle(QtGui.QTextCharFormat.SpellCheckUnderline)
    fmt.setFontUnderline(True)
    self._fmt = fmt
def english_test(string):
    """Return how many whitespace-separated tokens of *string* are accepted
    by the ``en_US`` enchant dictionary."""
    dict_en = enchant.Dict("en_US")
    return sum(1 for token in string.split() if dict_en.check(token))
def collect_misses(parsed_lexemes: Iterable) -> Dict:
    """Group the lexemes of all 'miss' segments by lower-cased segment text.

    Walks lexeme -> sublexeme -> segment and inspects every entry of
    ``segment[1]``. Entries whose ``seg_type`` is ``'miss'`` are recorded in
    a ``SortedDict`` mapping the lower-cased ``segment`` text to a
    ``SortedSet`` of the lexemes it occurred in.
    """
    collected = SortedDict()
    for lexeme in parsed_lexemes:
        for sublexeme in lexeme:
            for segment in sublexeme:
                for sm in segment[1]:
                    if sm.seg_type != 'miss':
                        continue
                    bucket = collected.setdefault(
                        sm.segment.lower(), default=SortedSet())
                    bucket.add(sm.lexeme)
    return collected
def split_file_path(s: str) -> Dict[str, Optional[Any]]:
    """Break a path into its directories, file name and extension.

    The path is split with ``split_slash``; the last part is the file name.
    Everything after the final ``.`` of the file name becomes ``ext`` and
    everything before it becomes ``name``. Without a dot, ``name`` is the
    whole file name and ``ext`` is ``None``.
    """
    path_parts = split_slash(s)
    filename = path_parts[-1]
    name, dot, ext = filename.rpartition('.')
    if not dot:
        # No extension separator at all.
        name, ext = filename, None
    return dict(dirs=path_parts[:-1], name=name, ext=ext)
def __init__(self, lang="en_US"):
    """Create the enchant dictionary for *lang* and keep it as ``self.checker``."""
    checker = enchant.Dict(lang)
    self.checker = checker
def __init__(self):
    """Create the lemmatizer, the 'en' dictionary and the inflect engine."""
    # Single-letter keys mapped to one-letter codes; presumably POS-tag
    # initials to WordNet POS codes - confirm against the code using it.
    self.WN_TAGS = dict(J='a', N='n', R='r', V='v')
    self.wnl = WordNetLemmatizer()
    self.dictionary = enchant.Dict('en')
    self.inflengine = inflect.engine()
def __init__(self):
    """Create the lemmatizer, the 'en' dictionary and an empty lookup table."""
    # Single-letter keys mapped to one-letter codes; presumably POS-tag
    # initials to WordNet POS codes - confirm against the code using it.
    self.WN_TAGS = dict(J='a', N='n', R='r', V='v')
    self.wnl = WordNetLemmatizer()
    self.dictionary = enchant.Dict('en')
    self.lookup_table = dict()
def __init__(self, settings, lang="en_US"):
    """Create the spelling dictionary and seed the cache of known words.

    The cache starts from ``self.uimsgs`` and, when ``settings.SPELL_CACHE``
    names an existing file, is extended with the pickled entries stored
    there.
    """
    self.settings = settings
    self.dict_spelling = enchant.Dict(lang)
    self.cache = set(self.uimsgs)

    cache_path = self.settings.SPELL_CACHE
    if not cache_path or not os.path.exists(cache_path):
        return
    # NOTE: unpickling executes arbitrary code from the file; the cache
    # file must come from a trusted location.
    with open(cache_path, 'rb') as stream:
        self.cache.update(pickle.load(stream))
def test_bug2785373(self):
    """Testcases for bug #2785373."""
    text = "So, one dey when I wes 17, I left."
    # Run the same sentence through a fresh checker twice: once as given
    # and once converted with raw_unicode(). Iterating must not raise.
    for as_raw_unicode in (False, True):
        c = SpellChecker(enchant.Dict("en"), "")
        c.set_text(raw_unicode(text) if as_raw_unicode else text)
        for err in c:
            pass
def spell(inp):
    """spell <word/sentence> -- Check spelling of a word or sentence."""
    if not enchant.dict_exists(locale):
        return "Could not find dictionary: {}".format(locale)

    if len(inp.split(" ")) <= 1:
        # Single word: report validity together with up to ten suggestions.
        dictionary = enchant.Dict(locale)
        is_correct = dictionary.check(inp)
        s_string = ', '.join(dictionary.suggest(inp)[:10])
        if is_correct:
            return '"{}" appears to be \x02valid\x02! ' \
                   '(suggestions: {})'.format(inp, s_string)
        return '"{}" appears to be \x02invalid\x02! ' \
               '(suggestions: {})'.format(inp, s_string)

    # Sentence: replace each misspelled word in place with up to three
    # suggestions wrapped in \x02 markers.
    chkr = SpellChecker(locale)
    chkr.set_text(inp)
    # Running difference between positions in the checker's text and
    # positions in the already-edited `inp`.
    offset = 0
    for err in chkr:
        start = err.wordpos + offset
        finish = start + len(err.word)
        s_string = "\x02{}\x02".format('/'.join(err.suggest()[:3]))
        offset += len(s_string) - len(err.word)
        inp = inp[:start] + s_string + inp[finish:]
    return inp
def extract_acronyms(textblob):
    """Return ``(word, count)`` pairs for likely acronyms in *textblob*.

    A word qualifies when it has at least two characters, its first two
    characters are upper case, and the ``en_US`` enchant dictionary does
    not accept it as a regular English word. ``count`` is
    ``textblob.words.count(word)``.

    The pairs are ordered by descending count; words with equal counts keep
    their order of first appearance.
    """
    d = enchant.Dict("en_US")
    words = textblob.words
    examined = set()
    counts = []
    for word in words:
        # Each distinct word only needs to be looked at once.
        if len(word) < 2 or word in examined:
            continue
        examined.add(word)
        if word[0].isupper() and word[1].isupper() and not d.check(word):
            counts.append((word, words.count(word)))
    # The sort is stable, so ties stay in first-appearance order.
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return counts
replacers.py 文件源码
项目:Natural-Language-Processing-Python-and-NLTK
作者: PacktPublishing
项目源码
文件源码
阅读 24
收藏 0
点赞 0
评论 0
def __init__(self, dict_name='en', max_dist=2):
    """Store the enchant dictionary for *dict_name* and the *max_dist* value."""
    self.max_dist = max_dist
    self.spell_dict = enchant.Dict(dict_name)
def updateSpellLanguage(self):
    """Reload the spell checker for the configured 'spellCheckLanguage'.

    When the module-level ``initialized`` flag is false, a message is
    presented through the output manager and nothing else is changed.
    Otherwise ``self.spellChecker`` and ``self.language`` are set from the
    'general' / 'spellCheckLanguage' setting.
    """
    if not initialized:
        self.env['runtime']['outputManager'].presentText('pyenchant is not installed', interrupt=True)
        return
    # Read the setting once so the checker and the recorded language
    # always refer to the same value.
    language = self.env['runtime']['settingsManager'].getSetting('general', 'spellCheckLanguage')
    self.spellChecker = enchant.Dict(language)
    self.language = language
def updateSpellLanguage(self):
    """Set ``self.spellChecker`` and ``self.language`` from the
    'general' / 'spellCheckLanguage' setting."""
    settings_manager = self.env['runtime']['settingsManager']
    self.spellChecker = enchant.Dict(
        settings_manager.getSetting('general', 'spellCheckLanguage'))
    self.language = settings_manager.getSetting('general', 'spellCheckLanguage')
def updateSpellLanguage(self):
    """Set ``self.spellChecker`` and ``self.language`` from the
    'general' / 'spellCheckLanguage' setting."""
    settings_manager = self.env['runtime']['settingsManager']
    self.spellChecker = enchant.Dict(
        settings_manager.getSetting('general', 'spellCheckLanguage'))
    self.language = settings_manager.getSetting('general', 'spellCheckLanguage')
def updateSpellLanguage(self):
    """Reload the spell checker for the configured 'spellCheckLanguage'.

    When the module-level ``initialized`` flag is false, a translated
    message is presented through the output manager and nothing else is
    changed.
    """
    if initialized:
        settings_manager = self.env['runtime']['settingsManager']
        self.spellChecker = enchant.Dict(
            settings_manager.getSetting('general', 'spellCheckLanguage'))
        self.language = settings_manager.getSetting('general', 'spellCheckLanguage')
        return
    self.env['runtime']['outputManager'].presentText(_('pyenchant is not installed'), interrupt=True)
def __init__(self, server_instance, full_name):
    """Prepare the quotes directory and the US/GB English dictionaries.

    The directory ``<local_data_dir>/quotes`` is created when it does not
    exist yet.
    """
    super(Quotes, self).__init__(server_instance, full_name)
    quotes_dir = os.path.join(self.local_data_dir, 'quotes')
    self.quotes_path = quotes_dir
    if not os.path.exists(quotes_dir):
        os.makedirs(quotes_dir)
    self.dictionaries = [enchant.Dict(tag) for tag in ('en_US', 'en_GB')]
def spellcheck(self, message, word):
"""
Says whether the given word is spelled correctly, and gives suggestions if
it's not.
"""
# NOTE(review): the body uses `await`, which is only valid inside an
# `async def`; the header shown here is a plain `def` - confirm against
# the original source.
# No word given: show the help for the 'spell' command instead.
if word == '':
await self.provide_help('spell', message)
return
# Only the first space-separated token is checked.
word = word.split(' ', 1)[0]
dictionary = enchant.Dict("en_US")
dictionary_uk = enchant.Dict("en_GB")
# I don't want to make anyone angry, so I check both American and British English.
if dictionary_uk.check(word):
if dictionary.check(word):
# Accepted by both dictionaries.
await self.client.send_message(message.channel, word + " is spelled correctly")
else:
# Accepted by the British dictionary only.
await self.client.send_message(message.channel, word + " is spelled correctly (British)")
elif dictionary.check(word):
# Accepted by the American dictionary only.
await self.client.send_message(message.channel, word + " is spelled correctly (American)")
else:
# Rejected by both: merge the suggestions of both dictionaries.
msg = word + " is not spelled correctly. Maybe you want one of these spellings:"
sugWords = []
for suggested_word in dictionary.suggest(word):
sugWords.append(suggested_word)
for suggested_word in dictionary_uk.suggest(word):
sugWords.append(suggested_word)
# Every suggestion is followed by a comma, so the message ends with one.
for suggested_word in sorted(set(sugWords)): # removes duplicates
msg = msg + " '" + suggested_word + "',"
await self.client.send_message(message.channel, msg)
def setup(bot):
    """Register the Spellcheck cog on *bot* with an ``en_CA`` dictionary."""
    # Creating the dictionary is expected to fail here when no en_CA
    # dictionary is installed.
    canadian_english = enchant.Dict("en_CA")
    cog = Spellcheck(bot, canadian_english)
    bot.add_cog(cog)
def even_or_odd(self, message=None, match=None, to=None):
    """Play one round of even-or-odd and reply to the sender of *message*.

    The parity of the length of the ``evenOrOdd`` match group is the
    player's guess. A random number from 1 to 10 is drawn; the player wins
    when its parity equals the guess. The reply contains the drawn number
    and the outcome.
    """
    guess_parity = len(match.group("evenOrOdd")) % 2
    num = random.randint(1, 10)
    # Both sides are 0 or 1, so equality means "both odd" or "both even".
    if guess_parity == num % 2:
        template = "[%d]\nYou win."
    else:
        template = "[%d]\nYou lose!"
    return TextMessageProtocolEntity(template % num, to=message.getFrom())
# def beban_spell_checker(self, message=None, match=None, to=None):
# print(message.getBody())
# correctionList = ""
# text = message.getBody()
# d = enchant.DictWithPWL("es_MX","wordList.txt")
# d_en = enchant.Dict("en_US")
# wordList = text.split()
# for word in wordList:
# if(word.isalnum() == True):
# print(word)
# if(d.check(word) == False):
# # if(d_en.check(word) == False):
# solutions = d.suggest(word)
# print(solutions)
# sol = str(solutions[0])
# if(sol.isalnum() == False):
# correctionList += sol + "* "
# if (correctionList != ""):
# print(correctionList)
# return TextMessageProtocolEntity(correctionList, to=message.getFrom())
def load_dictionary(self):
'''Load a hunspell dictionary and instantiate a
enchant.Dict() or a hunspell.Hunspell() object.
'''
# NOTE(review): the indentation of this block was lost, so it cannot be
# confirmed from here which of the statements below are nested inside
# `if self.words:`.
if DEBUG_LEVEL > 0:
sys.stderr.write("load_dictionary() ...\n")
# The helper returns the dictionary path, its encoding and the word list
# for the dictionary named self.name.
(self.dic_path,
self.encoding,
self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
if self.words:
# List of languages where accent insensitive matching makes sense:
accent_languages = (
'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
've', 'vi', 'wa', 'xh',
)
# The language is the part of the dictionary name before the first "_".
if self.name.split('_')[0] in accent_languages:
# Pair every word with the result of itb_util.remove_accents()
# (presumably its accent-free form - confirm in itb_util).
self.word_pairs = [
(x, itb_util.remove_accents(x))
for x in self.words
]
# Raise max_word_len to the length of the longest word in the list.
for x in self.words:
if len(x) > self.max_word_len:
self.max_word_len = len(x)
if DEBUG_LEVEL > 1:
sys.stderr.write(
'load_dictionary() max_word_len = %s\n'
% self.max_word_len)
# enchant is preferred; pyhunspell is only used when enchant could not be
# imported and a dictionary path is known.
if IMPORT_ENCHANT_SUCCESSFUL:
self.enchant_dict = enchant.Dict(self.name)
elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
# The affix file path is derived from the .dic path by swapping the
# extension.
aff_path = self.dic_path.replace('.dic', '.aff')
self.pyhunspell_object = hunspell.HunSpell(self.dic_path, aff_path)
def suggest(self):
    """Return enchant's spelling suggestions for ``self.word``.

    Returns ``None`` when the word contains any character other than ASCII
    letters, digits, apostrophes, hyphens, dots and whitespace. Otherwise
    the suggestions come from an ``en_US`` dictionary extended with the
    bundled personal word list; if that extended dictionary cannot be
    created, the plain ``en_US`` dictionary is used instead.
    """
    # Anything left after stripping the supported characters means the word
    # is not something this checker should handle.
    if re.sub(r'[a-zA-Z\d\'\-\.\s]', '', self.word):
        return None
    import enchant
    try:
        d = enchant.DictWithPWL(
            'en_US', path + '/data/spell-checker/american-english-large')
    except Exception:
        # Best-effort fallback; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        d = enchant.Dict('en_US')
    return d.suggest(self.word)
def is_word(self, word):
    """Return the result of checking *word* against the ``en_US`` dictionary."""
    return enchant.Dict("en_US").check(word)
def open(self):
    """Prepare the spelling dictionary, ignore list and punctuation regex.

    Leaves ``self.initialized`` false (and does nothing else) when enchant
    is unavailable or no spelling dictionary is configured. When a private
    dictionary file is configured it is used as a personal word list and
    also opened in append mode as ``self.private_dict_file``.
    """
    self.initialized = False
    self.private_dict_file = None

    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [
        entry.strip()
        for entry in self.config.spelling_ignore_words.split(",")
    ]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    private_dict_path = self.config.spelling_private_dict_file
    if private_dict_path:
        self.spelling_dict = enchant.DictWithPWL(dict_name, private_dict_path)
        self.private_dict_file = open(private_dict_path, "a")
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Prepare regex for stripping punctuation signs from text.
    # ' and _ are treated in a special way.
    puncts = "".join(ch for ch in string.punctuation if ch not in "'_")
    self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
    self.initialized = True