import re

from nltk.tokenize import word_tokenize


def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # Alternative: tokens = re.split(r'(\W)', sentence)
    # Pad each custom delimiter with spaces so word_tokenize treats it as a separate token
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " " + delimiter + " ", sentence)
    tokens = word_tokenize(sentence)
    if remove_puncs:
        # Drop single-character tokens that are not alphanumeric (i.e. punctuation)
        tokens = [token for token in tokens if
                  not ((len(token.strip()) == 1) and bool(re.search(r"[^a-zA-Z0-9]", token)))]
    # Drop empty tokens and any token that still contains whitespace
    tokens = [token for token in tokens if (not bool(re.search(r"\s", token)) and token != '')]
    # Remove duplicates (note: set() does not preserve token order)
    if get_unique:
        tokens = list(set(tokens))
    return tokens
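
A minimal usage sketch follows (it assumes NLTK is installed and its punkt tokenizer data has been downloaded, e.g. via nltk.download('punkt'); the sample sentences are illustrative only):

if __name__ == "__main__":
    # '|' and ',' are padded with spaces, then word_tokenize splits the sentence;
    # single-character punctuation tokens are dropped because remove_puncs defaults to True
    print(custom_tokenizer("hello,world|foo bar."))
    # expected (roughly): ['hello', 'world', 'foo', 'bar']

    # With get_unique=True, duplicate tokens are removed at the cost of token order
    print(custom_tokenizer("a,a|b b", get_unique=True))
    # expected (roughly): ['a', 'b'] in some order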