import json

import jsonpickle
import nltk
from nltk.corpus import stopwords


def tokenize(data, language="english", filterStopWords=False, tagging=False):
    tags = []
    # Punctuation tokens to drop from the word list
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&",
                   "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if w not in filterChars]
    if filterStopWords:
        # Stop-word filtering is case-sensitive: NLTK's lists are lowercase
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if w not in stop_words]
    if tagging:
        tags = nltk.pos_tag(word_token)
    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    # jsonpickle with unpicklable=False flattens the dict (POS tuples become
    # lists) so the return value is plain JSON-safe data
    return json.loads(jsonpickle.encode(result, unpicklable=False))
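
For reference, a minimal usage sketch follows. It assumes the NLTK punkt, stopwords, and averaged_perceptron_tagger resources have been downloaded (exact resource names can vary across NLTK versions), and the sample sentence is purely illustrative.

# One-time downloads of the NLTK resources the function relies on
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

text = "NLTK makes tokenization easy. It also supports POS tagging!"
output = tokenize(text, filterStopWords=True, tagging=True)
print(output["sent_token"])  # ['NLTK makes tokenization easy.', 'It also supports POS tagging!']
print(output["word_token"])  # punctuation and stop-word tokens dropped
print(output["pos_tag"])     # e.g. [['NLTK', 'NNP'], ...]; jsonpickle turns tuples into lists

Because jsonpickle strips Python-specific types when unpicklable=False, the returned structure contains only lists and strings, which makes it convenient to serialize again and serve directly from a web API.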