def tokenize(text):
    """Split *text* into lower-cased tokens and pass each through ``porter.stem``.

    Args:
        text: The input as either a byte string (decoded as UTF-8) or a
            text string.

    Returns:
        A list with one ``porter.stem`` result per token, in input order.
        Punctuation tokens and stop words are kept.
    """
    # Decide on the input type explicitly instead of relying on a bare
    # ``except`` to tell byte strings and text strings apart.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'replace')

    # Squash everything outside ASCII to '?', then go back to a text string so
    # the text-pattern regex below can be applied to it.
    text = text.encode('ascii', 'replace').decode('ascii').strip().lower()

    # A run of word characters, apostrophes and hyphens is one token (so
    # "don't" stays whole); any other non-space character is its own token.
    tokens = re.findall(r"[\w'-]+|[^\s\w]", text)
    return [porter.stem(token) for token in tokens]
# Comment list
# Table of contents