def tokenize(text):
    """Split a paragraph into word tokens.

    :param text: a paragraph string (``str``; on Python 2, a UTF-8
        encoded byte string is decoded first)
    :return: a tuple ``(words, length)`` — the list of word tokens and
        its length. If *text* cannot be tokenized (e.g. ``None`` or a
        non-string value), returns the sentinel ``(['NA'], 0)``.
    """
    try:
        try:
            # Python 2: decode the UTF-8 byte string to unicode first.
            txt = unicode(text, 'utf-8')
        except NameError:
            # Python 3: `unicode` is undefined; text is already str.
            txt = text
        words = wordpunct_tokenize(txt)
        length = len(words)
    except TypeError:
        # Non-string input (e.g. None / NaN from a dataframe column):
        # fall back to the 'NA' sentinel rather than crashing.
        words, length = ['NA'], 0
    return words, length
# NOTE(review): removed stray page-scrape artifacts that trailed the file
# ("评论列表" = comment list, "文章目录" = article table of contents) —
# they were copied-in blog-page chrome, not valid Python.