def tokenize(text):
    """Extract lowercase alphabetic tokens from *text*.

    Splits on anything that is not a letter, so punctuation and digits are
    discarded.  The input is lowercased first so words containing uppercase
    letters ("Hello") are kept whole instead of being truncated ("ello"),
    which matches the intent of the earlier (removed) TextBlob/NLTK pipeline.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lowercase word tokens in order of appearance.
    """
    time0 = time.time()
    # Lowercase before matching: the pattern only covers [a-z], so without
    # this, uppercase characters would split or drop tokens.
    stems = re.findall(r'[a-z]+', text.lower())
    print('%s seconds' % (time.time()-time0))
    print(stems)
    return stems
# NOTE(review): the two lines below ("评论列表" = comment list, "文章目录" =
# article table of contents) are leftover page chrome from the webpage this
# snippet was copied from — commented out so the module is importable.
# 评论列表
# 文章目录