def get_vocabulary(doc_set):
    """Build a vocabulary mapping each distinct token to a unique integer id.

    Tokens are produced by lowercasing each document and extracting maximal
    runs of word characters (``\\w+``) — the same tokenization the original
    ``nltk.RegexpTokenizer(r'\\w+')`` performed, done here with the stdlib
    ``re`` module so no third-party dependency is needed.

    Args:
        doc_set: iterable of strings (documents).

    Returns:
        dict mapping token -> integer id, ids assigned in first-occurrence
        order starting at 0 (insertion order of the dict reflects this).
    """
    import re  # local import: keeps the function self-contained

    vocabulary = {}
    for text in doc_set:
        # \w+ on the lowercased text reproduces RegexpTokenizer(r'\w+').tokenize()
        for word in re.findall(r'\w+', text.lower()):
            if word not in vocabulary:
                # len(vocabulary) is the next free id — no separate counter needed
                vocabulary[word] = len(vocabulary)
    return vocabulary
# NOTE(review): the two lines below are web-page scrape residue, not code —
# 评论列表 ("comment list") / 文章目录 ("article table of contents").
# Kept as a comment so the file remains valid Python.