def tokenize(text, level):
"""Tokenize a text into a list of strings.
Args:
text (str): An arbitrary string.
level (str): Either "char" or "word". For "char", the string is split into characters. For
"word", letters and numbers are glued to themselves and everything else is split.
Example: "asdf df!?123 as12" -> "asdf", " ", "df", "!", "?", "123", " ", "as", "12"
Returns:
list[str]: The tokens
Raises:
ValueError: If the level is not "char" or "word"
"""
if level == "char":
# No need for tokenizing
return list(text)
elif level == "word":
# Tokenize while keeping indentation. Glue letters and numbers to themselves but
# keep all other chars isolated.
tokenizer = RegexpTokenizer(r'\w+|\S|\s')
return tokenizer.tokenize(text)
else:
raise ValueError("Unknown token level: {}".format(level))
# NOTE(review): the two lines below ("评论列表" = "comment list", "文章目录" =
# "article table of contents") are stray text from a scraped web page, not part
# of the code; commented out so they cannot raise NameError if executed.
# 评论列表
# 文章目录