def tokenize(text):
    """Extract lowercase alphabetic tokens from *text*.

    Splits on anything that is not a letter, so punctuation and digits are
    discarded.  The input is lowercased first so words containing uppercase
    letters ("Hello") are kept whole instead of being truncated ("ello"),
    which matches the intent of the earlier (removed) TextBlob/NLTK pipeline.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lowercase word tokens in order of appearance.
    """
    time0 = time.time()
    # Lowercase before matching: the pattern only covers [a-z], so without
    # this, uppercase characters would split or drop tokens.
    stems = re.findall(r'[a-z]+', text.lower())
    print('%s seconds' % (time.time()-time0))
    print(stems)
    return stems
# NOTE(review): the two lines below ("评论列表" = comment list, "文章目录" =
# article table of contents) are leftover page chrome from the webpage this
# snippet was copied from — commented out so the module is importable.
# 评论列表
# 文章目录