import re
import string
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize


def tiny_tokenize_xml(text, stem=False, stop_words=()):
    # Fold to ASCII (decode back so re.sub receives a str under Python 3),
    # blank out punctuation, tokenize, then drop digit-only and stop-word tokens.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ',
                  text.encode('ascii', errors='ignore').decode('ascii'))
    stemmer = EnglishStemmer()  # construct once instead of once per token
    return [stemmer.stem(token) if stem else token
            for token in wordpunct_tokenize(text)
            if not token.isdigit() and token not in stop_words]
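
# A minimal usage sketch, assuming NLTK is installed; the sample text and
# stop-word tuple below are purely illustrative.
if __name__ == '__main__':
    tokens = tiny_tokenize_xml('The 3 cats were <b>running</b>!',
                               stem=True, stop_words=('The', 'were'))
    print(tokens)  # expected roughly: ['cat', 'b', 'run', 'b']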