def LemNormalizeIt(text):
    """Normalize Italian text into a list of lemmatized tokens.

    Pipeline: strip non-ASCII characters, lower-case, replace every
    punctuation mark and digit with a space, tokenize with NLTK, then
    lemmatize each token via the morph-it dictionary, keeping only
    tokens longer than 3 characters.

    Args:
        text: raw input string.

    Returns:
        list[str]: lemmatized tokens (surface form is kept when a word
        is not found in the morph-it dictionary).
    """
    # Replace non-ASCII characters; 'replace' yields '?', which the
    # punctuation translation table below turns into a space anyway.
    text = text.encode('ascii', 'replace').decode()
    # Map punctuation and digits to spaces in one C-level translate pass.
    remove_punct_and_digits = {ord(ch): ' ' for ch in string.punctuation + string.digits}
    transformed = text.lower().translate(remove_punct_and_digits)
    # Tokenize the cleaned, lower-cased string.
    tokenized = nltk.word_tokenize(transformed)
    # NOTE(review): the morph-it dictionary is reloaded on every call —
    # consider caching it at module level if this function is hot.
    morph_it = load_morph_it()
    # Lemmatize via dict lookup (fallback: the word itself) and drop
    # short tokens (len <= 3).
    return [morph_it.get(w, w) for w in tokenized if len(w) > 3]