def tokenize(text = u""):
    """
    Break the given text into a list of word tokens.

    The text is split with the module-level TOKEN_PATTERN; each non-empty
    fragment is then passed through TOKEN_REPLACE.sub('', ...) and kept
    only if something remains afterwards.

    @param text: the input text.
    @type text: unicode.
    @return: list of words (empty list when text is the empty string).
    @rtype: list.
    """
    # Guard clause: nothing to tokenize.
    if text == u'':
        return []

    tokens = []
    for fragment in TOKEN_PATTERN.split(text):
        # The split can yield empty pieces; skip them before cleaning.
        if not fragment:
            continue
        # NOTE(review): TOKEN_REPLACE is defined elsewhere; presumably it
        # strips unwanted characters while leaving newlines -- confirm.
        cleaned = TOKEN_REPLACE.sub('', fragment)
        # Cleaning may leave nothing behind; keep only non-empty tokens.
        if cleaned:
            tokens.append(cleaned)
    return tokens
评论列表
文章目录