Chapter 05_KNN n Naive Bayes.py 文件源码-python代码片段

Chapter 05_KNN n Naive Bayes.py 文件源码

python

阅读 25 收藏 0 点赞 0 评论 0

项目：Statistics-for-Machine-Learning 作者: PacktPublishing 项目源码文件源码

def preprocessing(text):
    """Normalize raw text into a single space-joined string of lemmas.

    The pipeline, applied in order:

    1. Replace every character in ``string.punctuation`` with a space and
       collapse runs of whitespace.
    2. Split into sentences and then words with NLTK's tokenizers.
    3. Lowercase each token, drop English stop words, and drop tokens
       shorter than three characters.
    4. Stem each surviving token with the Porter stemmer.
    5. POS-tag the stemmed tokens and lemmatize each one with WordNet,
       as a verb when its tag is a verb tag and as a noun otherwise.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # A set makes the per-character membership test O(1).
    punctuation = set(string.punctuation)
    without_punct = "".join(" " if ch in punctuation else ch for ch in text)
    # split()/join collapses any run of whitespace into one space.
    text2 = " ".join(without_punct.split())

    # Build the stop-word set once: testing each token against the raw list
    # would cost a linear scan per token.
    stopwds = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Lowercasing, stop-word removal, the length filter and stemming are all
    # per-token operations, so they are done in one pass, in that order.
    tokens = []
    for sent in nltk.sent_tokenize(text2):
        for word in nltk.word_tokenize(sent):
            word = word.lower()
            if word in stopwds or len(word) < 3:
                continue
            tokens.append(stemmer.stem(word))

    # NOTE(review): tagging runs on lowercased, stemmed tokens, which may
    # make the tags less reliable than tagging the raw text -- confirm that
    # this ordering is intended.
    tagged_corpus = pos_tag(tokens)

    # Only verb tags select the verb lemmatizer; every other tag (noun tags
    # included) is lemmatized as a noun.
    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    lemmatizer = WordNetLemmatizer()

    pre_proc_text = " ".join(
        lemmatizer.lemmatize(token, 'v' if tag in verb_tags else 'n')
        for token, tag in tagged_corpus
    )

    return pre_proc_text