context2vec.py 文件源码-python代码片段

context2vec.py 文件源码
python
阅读 35 收藏 0 点赞 0 评论 0
def build_windex(self, sentences, wordlist=None):
    """
    Go through all the sentences, count how often each word occurs and
    assign an index to every word that is kept.

    Args:
        sentences: iterable of sentences, each itself an iterable of
            hashable words. It is traversed exactly once.
        wordlist: optional collection of words to index. When it is empty
            or None, every word occurring at least ``self.min_count``
            times in ``sentences`` is indexed instead.

    Reads ``self.progress`` (a progress line is printed whenever the
    sentence number is a multiple of it) and ``self.min_count``.

    Sets ``self.wcounts`` (word -> frequency in ``sentences``),
    ``self.word2index`` (word -> index) and ``self.index2word``
    (list of words, position == index). Returns nothing.
    """
    # get an overview of the vocabulary
    vocab = defaultdict(int)
    # running totals, so the messages below do not have to re-sum the
    # whole vocabulary and still work when `sentences` is empty
    total_words = 0
    num_sentences = 0
    for sentence_no, sentence in enumerate(sentences):
        if not sentence_no % self.progress:
            print("PROGRESS: at sentence #%i, processed %i words and %i unique words" % (sentence_no, total_words, len(vocab)))
        for word in sentence:
            vocab[word] += 1
            total_words += 1
        num_sentences = sentence_no + 1
    print("collected %i unique words from a corpus of %i words and %i sentences" % (len(vocab), total_words, num_sentences))
    # assign a unique index to each word and remove all words with freq < min_count
    self.wcounts, self.word2index, self.index2word = {}, {}, []
    if not wordlist:
        wordlist = [word for word, c in vocab.items() if c >= self.min_count]
    for word in wordlist:
        # a caller-supplied wordlist may contain duplicates; indexing a word
        # twice would leave word2index and index2word out of sync
        if word in self.word2index:
            continue
        self.word2index[word] = len(self.index2word)
        self.index2word.append(word)
        # words that never occurred in the corpus get a count of 0
        self.wcounts[word] = vocab.get(word, 0)