def build_windex(self, sentences, wordlist=None):
    """
    Count word frequencies over all sentences and build the word <-> index mappings.

    Args:
        sentences: iterable of sentences, each an iterable of hashable words.
        wordlist: optional collection of words to index. When it is empty or
            None, every word whose corpus frequency is at least
            ``self.min_count`` is indexed instead. Repeated entries are
            indexed only once (first occurrence wins).

    Reads ``self.progress`` (a progress line is printed every
    ``self.progress`` sentences) and ``self.min_count``.

    Sets:
        self.wcounts: dict mapping each indexed word to its corpus frequency
            (0 for a ``wordlist`` entry that never occurs in ``sentences``).
        self.word2index: dict mapping each indexed word to its unique index.
        self.index2word: list holding the word stored at each index.
    """
    # get an overview of the vocabulary
    vocab = defaultdict(int)
    total_words = 0  # running total, so the corpus size is never re-summed
    n_sentences = 0  # stays 0 for an empty corpus (the loop variable would be unbound)
    for sentence_no, sentence in enumerate(sentences):
        if not sentence_no % self.progress:
            print("PROGRESS: at sentence #%i, processed %i words and %i unique words" % (sentence_no, total_words, len(vocab)))
        for word in sentence:
            vocab[word] += 1
            total_words += 1
        n_sentences = sentence_no + 1
    print("collected %i unique words from a corpus of %i words and %i sentences" % (len(vocab), total_words, n_sentences))
    # assign a unique index to each word and remove all words with freq < min_count
    self.wcounts, self.word2index, self.index2word = {}, {}, []
    if not wordlist:
        wordlist = [word for word, c in vocab.items() if c >= self.min_count]
    for word in wordlist:
        if word in self.word2index:
            # a repeated word must not get a second slot, otherwise
            # word2index and index2word stop agreeing with each other
            continue
        self.word2index[word] = len(self.index2word)
        self.index2word.append(word)
        # .get avoids growing the defaultdict for words absent from the corpus
        self.wcounts[word] = vocab.get(word, 0)
# (web-page residue, not code) Comment list
# (web-page residue, not code) Article table of contents