def set_offset(self, off, whence=0):
    """Set the offset of the tokenization routine.

    For more details on the purpose of the tokenization offset,
    see the documentation of the 'enchant.tokenize' module.

    The optional argument <whence> indicates the method by
    which to change the offset:
        * 0 (the default) treats <off> as an increment
        * 1 treats <off> as a distance from the start
        * 2 treats <off> as a distance from the end
    """
    if whence == 0:
        self._tokens.set_offset(self._tokens.offset + off)
    elif whence == 1:
        assert off > 0
        self._tokens.set_offset(off)
    elif whence == 2:
        assert off > 0
        self._tokens.set_offset(len(self._text) - 1 - off)
    else:
        raise ValueError("Invalid value for whence: %s" % (whence,))
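# Usage sketch (not part of the original source): the three <whence> modes,
# assuming pyenchant is installed; the sample text is arbitrary.
import enchant.checker

chkr = enchant.checker.SpellChecker("en_US", "this is smoe sample txt")
chkr.set_offset(5)              # whence=0: advance 5 characters past the current offset
chkr.set_offset(8, whence=1)    # whence=1: jump to 8 characters from the start
chkr.set_offset(3, whence=2)    # whence=2: jump to 3 characters from the end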
import unittest

def buildtestsuite(recurse=True):
    from enchant.checker.tests import TestChecker
    from enchant.tokenize.tests import TestTokenization, TestFilters
    from enchant.tokenize.tests import TestTokenizeEN
    suite = unittest.TestSuite()
    if recurse:
        suite.addTest(unittest.makeSuite(TestInstallEnv))
        suite.addTest(unittest.makeSuite(TestPy2exe))
    suite.addTest(unittest.makeSuite(TestBroker))
    suite.addTest(unittest.makeSuite(TestDict))
    suite.addTest(unittest.makeSuite(TestPWL))
    suite.addTest(unittest.makeSuite(TestUtils))
    suite.addTest(unittest.makeSuite(TestDocStrings))
    suite.addTest(unittest.makeSuite(TestChecker))
    suite.addTest(unittest.makeSuite(TestTokenization))
    suite.addTest(unittest.makeSuite(TestTokenizeEN))
    return suite
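# Usage sketch (not part of the original source): running the assembled
# suite with the standard unittest text runner.
if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(buildtestsuite(recurse=False))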
def __init__(self, words):
    tokenize.__init__(self, '')
    self._words = words
def test_filters(self):
    """Test SpellChecker with the 'filters' argument."""
    text = """I contain WikiWords that ShouldBe skipped by the filters"""
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.WikiWordFilter])
    for err in chkr:
        # There are no errors once the WikiWords are skipped
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
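# Usage sketch (not part of the original source): WikiWordFilter applied
# directly to a tokenizer; the argument order follows the get_tokenizer()
# calls elsewhere in this module (tag, chunkers, filters).
from enchant.tokenize import get_tokenizer, WikiWordFilter

tknzr = get_tokenizer("en_US", None, [WikiWordFilter])
for word, pos in tknzr("plain words and some WikiWords"):
    print(word, pos)   # 'WikiWords' is filtered out and never yielded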
def test_chunkers(self):
    """Test SpellChecker with the 'chunkers' argument."""
    text = """I contain <html a=xjvf>tags</html> that should be skipped"""
    chkr = SpellChecker("en_US", text=text,
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        # There are no errors when the <html> tag is skipped
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
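# Usage sketch (not part of the original source): HTMLChunker applied
# directly to a tokenizer, mirroring the test above.
from enchant.tokenize import get_tokenizer, HTMLChunker

tknzr = get_tokenizer("en_US", [HTMLChunker])
for word, pos in tknzr("some <b>bold</b> text"):
    print(word, pos)   # the markup is chunked away; only real words are yielded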
def test_chunkers_and_filters(self):
    """Test SpellChecker with the 'chunkers' and 'filters' arguments."""
    text = """I contain <html a=xjvf>tags</html> that should be skipped
              along with a <a href='http://example.com/">link to
              http://example.com/</a> that should also be skipped"""
    # There are no errors when things are correctly skipped
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.URLFilter],
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
    # The "html" is an error when not using HTMLChunker
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.URLFilter])
    for err in chkr:
        self.assertEqual(err.word, "html")
        break
    self.assertEqual(chkr.get_text(), text)
    # The "http" from the URL is an error when not using URLFilter
    chkr = SpellChecker("en_US", text=text,
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        self.assertEqual(err.word, "http")
        break
    self.assertEqual(chkr.get_text(), text)
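# Usage sketch (not part of the original source): the same two arguments used
# outside a test, replacing each remaining error with its first suggestion.
from enchant.checker import SpellChecker
from enchant.tokenize import HTMLChunker, URLFilter

chkr = SpellChecker("en_US",
                    chunkers=[HTMLChunker],
                    filters=[URLFilter])
chkr.set_text("A <b>smal</b> example, see http://example.com/")
for err in chkr:
    suggestions = err.suggest()
    if suggestions:
        err.replace(suggestions[0])
print(chkr.get_text())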
def _check_docstrings(self, obj, errors):
    import enchant
    if hasattr(obj, "__doc__"):
        skip_errors = [w for w in getattr(obj, "_DOC_ERRORS", [])]
        chkr = enchant.checker.SpellChecker("en_AU", obj.__doc__,
                                            filters=[enchant.tokenize.URLFilter])
        for err in chkr:
            if len(err.word) == 1:
                continue
            if err.word.lower() in self.WORDS:
                continue
            if skip_errors and skip_errors[0] == err.word:
                skip_errors.pop(0)
                continue
            errors.append((obj, err.word, err.wordpos))
            msg = "\nDOCSTRING SPELLING ERROR: %s %s %d %s\n" % (obj, err.word, err.wordpos, chkr.suggest())
            printf([msg], file=sys.stderr)
    # Find and yield all child objects that should be checked
    for name in dir(obj):
        if name.startswith("__"):
            continue
        child = getattr(obj, name)
        if hasattr(child, "__file__"):
            if not hasattr(globals(), "__file__"):
                continue
            if not child.__file__.startswith(os.path.dirname(__file__)):
                continue
        else:
            cmod = getattr(child, "__module__", None)
            if not cmod:
                cclass = getattr(child, "__class__", None)
                cmod = getattr(cclass, "__module__", None)
            if cmod and not cmod.startswith("enchant"):
                continue
        yield child
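# Usage sketch (not part of the original source): _check_docstrings() appends
# spelling errors to <errors> and yields child objects to recurse into, so a
# caller can walk the object graph roughly like this (hypothetical driver method).
def _collect_docstring_errors(self, root):
    errors = []
    tocheck = [root]
    checked = []
    while tocheck:
        obj = tocheck.pop()
        checked.append(obj)
        for child in self._check_docstrings(obj, errors):
            if child not in checked:
                tocheck.append(child)
    return errors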
def __init__(self):
    tokenize.__init__(self, "")
def __init__(self, text):
    tokenize.__init__(self, text)
    self._done = False
def _try_tokenizer(modName):
    """Look for a tokenizer in the named module.

    Returns the function if found, None otherwise.
    """
    modBase = "enchant.tokenize."
    funcName = "tokenize"
    modName = modBase + modName
    try:
        mod = __import__(modName, globals(), {}, funcName)
        return getattr(mod, funcName)
    except ImportError:
        return None
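# Usage sketch (not part of the original source): a hypothetical helper built
# on _try_tokenizer(), falling back from the full tag to the bare language code.
def _find_tokenizer(tag):
    for name in (tag.lower(), tag.lower().split("_")[0]):
        tkn = _try_tokenizer(name)
        if tkn is not None:
            return tkn
    return None

# _find_tokenizer("en_US") would return enchant.tokenize.en.tokenize when available.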
def __init__(self, lang=None, text=None, tokenize=None, chunkers=None, filters=None):
    """Constructor for the SpellChecker class.

    SpellChecker objects can be created in two ways, depending on
    the nature of the first argument.  If it is a string, it
    specifies a language tag from which a dictionary is created.
    Otherwise, it must be an enchant Dict object to be used.

    Optional keyword arguments are:
        * text: to set the text to be checked at creation time
        * tokenize: a custom tokenization function to use
        * chunkers: a list of chunkers to apply during tokenization
        * filters: a list of filters to apply during tokenization

    If <tokenize> is not given and the first argument is a Dict,
    its 'tag' attribute must be a language tag so that a tokenization
    function can be created automatically.  If this attribute is missing
    the user's default language will be used.
    """
    if lang is None:
        lang = get_default_language()
    if isinstance(lang, basestring):
        dict = enchant.Dict(lang)
    else:
        dict = lang
        try:
            lang = dict.tag
        except AttributeError:
            lang = get_default_language()
    if lang is None:
        raise DefaultLanguageNotFoundError
    self.lang = lang
    self.dict = dict
    if tokenize is None:
        try:
            tokenize = get_tokenizer(lang, chunkers, filters)
        except TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tokenize = get_tokenizer(None, chunkers, filters)
    self._tokenize = tokenize
    self.word = None
    self.wordpos = None
    self._ignore_words = {}
    self._replace_words = {}
    # Default to the empty string as the text to be checked
    self._text = array.array('u')
    self._use_tostring = False
    self._tokens = iter([])
    if text is not None:
        self.set_text(text)
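# Usage sketch (not part of the original source): the two construction styles
# described in the docstring above.
import enchant
from enchant.checker import SpellChecker

chkr1 = SpellChecker("en_US", text="thsi text has an error")   # from a language tag
chkr2 = SpellChecker(enchant.Dict("en_US"))                    # from an existing Dict
chkr2.set_text("another peice of text")
for err in chkr1:
    print(err.word, err.wordpos, err.suggest())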
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number
    of misspelled words. Words that are only "almost" correct remain neutral
    (they are not included in the score).

    Returns:
        A tuple: (fixed text, score)
    """
    if os.name == "nt":
        assert not "check_spelling() not available on Windows"
        return
    with _ENCHANT_LOCK:
        # Maximum distance from the first suggestion from python-enchant
        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'spelling_lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # Immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if len(suggestions) <= 0:
                # This word is useless; it may even indicate a bad orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # Hmm, this word looks like it's in a bad shape
                continue
            logger.debug("Spell checking: Replacing: %s -> %s"
                         % (word, main_suggestion))
            # Replace the word with its suggestion
            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))
            # Fixed words may be a good hint for orientation
            score += 5
        return (txt, score)
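# Usage sketch (not part of the original source): comparing the score of a
# clean OCR result against a garbled one; the higher score wins.
clean_txt, clean_score = check_spelling("en_US", "this page scanned well")
noisy_txt, noisy_score = check_spelling("en_US", "tbis pagc scanmed wcll")
best_txt = clean_txt if clean_score >= noisy_score else noisy_txt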