def preprocessing(text):
text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
tokens = [word for sent in nltk.sent_tokenize(text2) for word in
nltk.word_tokenize(sent)]
tokens = [word.lower() for word in tokens]
stopwds = stopwords.words('english')
tokens = [token for token in tokens if token not in stopwds]
tokens = [word for word in tokens if len(word)>=3]
stemmer = PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]
tagged_corpus = pos_tag(tokens)
Noun_tags = ['NN','NNP','NNPS','NNS']
Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
lemmatizer = WordNetLemmatizer()
def prat_lemmatize(token,tag):
if tag in Noun_tags:
return lemmatizer.lemmatize(token,'n')
elif tag in Verb_tags:
return lemmatizer.lemmatize(token,'v')
else:
return lemmatizer.lemmatize(token,'n')
pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])
return pre_proc_text
Chapter 05_KNN n Naive Bayes.py 文件源码
python
阅读 19
收藏 0
点赞 0
评论 0
评论列表
文章目录