def __init__(self, exclude_stopwords=False, lemmatize=True):
    # NLTK is optional: fall back gracefully when it is not installed.
    try:
        import nltk
        _NLTK_DISABLED = False
    except ImportError:
        _NLTK_DISABLED = True
    self.vocas = []         # id to word
    self.token2id = dict()  # word to id
    self.docfreq = []       # id to document frequency
    self.exclude_stopwords = exclude_stopwords
    self.stopwords_list = []
    if exclude_stopwords:
        stopwords_list = []
        # Too strict:
        # with open(os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
        #     stopwords_list = _f.read().replace('\n', '').split()
        if not _NLTK_DISABLED:
            # Requires the NLTK stopwords corpus: nltk.download('stopwords')
            stopwords_list += nltk.corpus.stopwords.words('english')
        self.stopwords_list = set(stopwords_list)
    if lemmatize:
        if not _NLTK_DISABLED:
            # Requires the WordNet corpus: nltk.download('wordnet')
            self.wlemm = nltk.WordNetLemmatizer()
        else:
            print('Warning: no lemmatizer!')
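A minimal usage sketch, assuming this `__init__` belongs to a vocabulary class named `Vocabulary` (the class name and its other methods are assumptions, not shown above); the NLTK corpora must be downloaded once before use:

import nltk
nltk.download('stopwords')  # needed for nltk.corpus.stopwords (assumption: not yet downloaded)
nltk.download('wordnet')    # needed for nltk.WordNetLemmatizer

vocab = Vocabulary(exclude_stopwords=True, lemmatize=True)
print('the' in vocab.stopwords_list)     # True when NLTK stopwords are available
print(vocab.wlemm.lemmatize('corpora'))  # -> 'corpus'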