import re
import unicodedata

# Assumed definition (not shown in the snippet): maximal runs of word
# characters with digits excluded by the negative lookahead.
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)


def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
"""
Iteratively yield tokens as unicode strings, optionally also lowercasing them
and removing accent marks.
Input text may be either unicode or utf8-encoded byte string.
The tokens on output are maximal contiguous sequences of alphabetic
characters (no digits!).
>>> list(tokenize('Nic nem?že let?t rychlostí vyšší, než 300 tisíc kilometr? za sekundu!', deacc = True))
[u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
"""
    # All three spellings of the flag are accepted for backward compatibility.
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group()
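

# The helpers below are minimal sketches, assuming `to_unicode` and `deaccent`
# are defined in the same module as tokenize(); the real implementations may
# differ in detail.

def to_unicode(text, encoding="utf8", errors="strict"):
    """Decode a utf8-encoded byte string to unicode; pass unicode text through."""
    if isinstance(text, bytes):
        return text.decode(encoding, errors=errors)
    return text


def deaccent(text):
    """Strip combining accent marks via NFD decomposition, e.g. 'letět' -> 'letet'."""
    norm = unicodedata.normalize("NFD", text)
    stripped = "".join(ch for ch in norm if unicodedata.category(ch) != "Mn")
    return unicodedata.normalize("NFC", stripped)


# Usage: list(tokenize('Staré Město'.encode('utf8'), lowercase=True, deacc=True))
# returns ['stare', 'mesto'].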