def rm_punctuation(data, pattern=r'[a-zA-Z]+-?[0-9]*', silent=1):
    """Drop punctuation by keeping only the substrings that match *pattern*.

    The input tokens are joined with single spaces into one text, and every
    non-overlapping match of *pattern* in that text is returned in order.
    Anything the pattern does not match (punctuation, whitespace, and with
    the default pattern also standalone numbers) is discarded.

    With the default pattern a token is a run of ASCII letters, optionally
    followed by one hyphen and then digits, e.g. ``"covid-19"``. Note that
    the hyphen is optional-but-greedy, so ``"well-known"`` is returned as
    ``["well-", "known"]``.

    Args:
        data: An iterable of string tokens, or a single string. A single
            string is used as-is instead of being joined character by
            character.
        pattern: Regular expression (string or compiled pattern) describing
            what counts as a token. If it contains capturing groups, the
            result follows ``re.findall`` semantics (groups are returned
            instead of whole matches).
        silent: When equal to ``0`` a progress message is printed; any other
            value keeps the function quiet.

    Returns:
        A list with the matched tokens, in order of appearance.

    Raises:
        TypeError: If *data* is an iterable containing non-string items.
    """
    import re

    if silent == 0:
        print("remove punctuation ...")

    # " ".join on a bare string would insert a space between every character
    # and tokenize it letter by letter, so treat a string as the full text.
    text = data if isinstance(data, str) else " ".join(data)

    # Accept a pre-compiled pattern as well by falling back to its source;
    # the flags below are the ones nltk's RegexpTokenizer applies by default,
    # which keeps the results identical to the previous nltk-based version.
    pattern = getattr(pattern, "pattern", pattern)
    token_re = re.compile(pattern, re.UNICODE | re.MULTILINE | re.DOTALL)
    return token_re.findall(text)
评论列表
文章目录