def tokenize(text, splits='COPZ'):
    """Yield tokens from *text*, splitting on Unicode category classes.

    A character acts as a delimiter when the first letter of its Unicode
    general category (``unicodedata.category``) is in *splits* — by default
    C (control), O (— no major class; kept for caller compatibility),
    P (punctuation) and Z (separators/whitespace). Runs of non-delimiter
    characters are yielded as tokens; empty tokens are never produced.

    :param text: ``str`` (unicode) or UTF-8 encoded ``bytes``.
    :param splits: string of Unicode major-category initials to split on.
    :returns: generator of unicode token strings.
    :raises UnicodeDecodeError: if *text* is ``bytes`` and not valid UTF-8.
    """
    # Decode only when we actually received bytes; the original
    # `str(text, 'utf-8')` crashed with TypeError on a Python 3 str.
    # isinstance(text, bytes) also covers Python 2 (`bytes is str` there),
    # so the duplicated PY2/PY3 branches are no longer needed.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    token = []
    for c in text:
        if category(c)[0] in splits:
            # Delimiter: flush any accumulated token, suppress empties.
            if token:
                yield u''.join(token)
                token = []
        else:
            token.append(c)
    # Flush the final token if the text did not end on a delimiter.
    if token:
        yield u''.join(token)
# NOTE(review): the two lines below were stray web-scrape artifacts
# ("评论列表" = "comment list", "文章目录" = "article table of contents")
# pasted in as bare text, which made the file a syntax error; preserved
# here as a comment instead of executable text.