def buildCorpus(self):
    """
    Turn the tokenized documents into a tf-idf weighted corpus.

    Steps, in order:
      1. Keep only tokens whose count in ``self.frequency`` exceeds 1.
      2. Build ``self.dictionary`` from the pruned documents.
      3. Convert every pruned document to a bag-of-words vector.
      4. Empty ``self.documents`` -- the token lists are no longer needed.
      5. Fit ``self.tfidf_model`` on the bag-of-words corpus and store the
         transformed corpus in ``self.corpus_tfidf``.

    Reads ``self.documents`` and ``self.frequency``; sets ``self.documents``,
    ``self.dictionary``, ``self.tfidf_model`` and ``self.corpus_tfidf``.
    """
    # Step 1: filter out tokens seen only once.
    pruned_docs = []
    for tokens in self.documents:
        kept = [tok for tok in tokens if self.frequency[tok] > 1]
        pruned_docs.append(kept)
    self.documents = pruned_docs

    # Step 2: vocabulary built from the surviving tokens.
    self.dictionary = corpora.Dictionary(pruned_docs)

    # Step 3: one bag-of-words vector per document.
    to_bow = self.dictionary.doc2bow
    bow_corpus = []
    for tokens in pruned_docs:
        bow_corpus.append(to_bow(tokens))

    # Step 4: empty the token lists in place; only the vectors are used
    # from here on.
    self.documents[:] = []

    # Step 5: fit tf-idf on the bag-of-words corpus, then apply it.
    model = TfidfModel(bow_corpus)
    self.tfidf_model = model
    self.corpus_tfidf = model[bow_corpus]
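# NOTE: the two labels below are web-page navigation text captured with the
# snippet, not part of the program.
# Comment list
# Table of contents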