def __init__(self, exclude_stopwords=False, lemmatize=True):
    # NLTK is optional: fall back gracefully when it is not installed.
    try:
        import nltk
        _NLTK_DISABLED = False
    except ImportError:
        _NLTK_DISABLED = True
    self.vocas = []         # id to word
    self.token2id = dict()  # word to id
    self.docfreq = []       # id to document frequency
    self.exclude_stopwords = exclude_stopwords
    self.stopwords_list = []
    if exclude_stopwords:
        stopwords_list = []
        # Too strict:
        # with open(os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
        #     stopwords_list = _f.read().replace('\n', '').split()
        if not _NLTK_DISABLED:
            # Requires the NLTK stopwords corpus: nltk.download('stopwords')
            stopwords_list += nltk.corpus.stopwords.words('english')
        self.stopwords_list = set(stopwords_list)
    if lemmatize:
        if not _NLTK_DISABLED:
            # Requires the WordNet corpus: nltk.download('wordnet')
            self.wlemm = nltk.WordNetLemmatizer()
        else:
            print('Warning: no lemmatizer!')
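A minimal usage sketch, assuming this `__init__` belongs to a vocabulary class named `Vocabulary` (the class name and its other methods are assumptions, not shown above); the NLTK corpora must be downloaded once before use:

import nltk
nltk.download('stopwords')  # needed for nltk.corpus.stopwords (assumption: not yet downloaded)
nltk.download('wordnet')    # needed for nltk.WordNetLemmatizer

vocab = Vocabulary(exclude_stopwords=True, lemmatize=True)
print('the' in vocab.stopwords_list)     # True when NLTK stopwords are available
print(vocab.wlemm.lemmatize('corpora'))  # -> 'corpus'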