def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
    """Replace every word outside the most frequent ones with ``cls.UNKNOWN_TOKEN``.

    Words are ranked by corpus-wide frequency, highest first; ties are
    broken by the word itself in descending order. The top
    ``vocabulary_size - cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1`` ranked
    words are kept and every other token is replaced.

    Args:
        tokenized_corpus: Re-iterable collection of sentences (it is
            traversed twice), each sentence an iterable of hashable words.
        vocabulary_size: Target vocabulary size from which the number of
            kept words is derived.

    Returns:
        A new list of sentences (lists of words) with the same shape as the
        input, where uncommon words are replaced by ``cls.UNKNOWN_TOKEN``.
    """
    frequencies = nltk.FreqDist(itertools.chain.from_iterable(tokenized_corpus))
    ranked = sorted(
        (cls.WORD_COUNT_ITEM(word=word, count=count)
         for word, count in frequencies.items()),
        key=lambda item: (item.count, item.word),
        reverse=True,
    )

    # Clamp at zero: a negative slice bound would silently keep almost the
    # whole vocabulary (everything but the last few entries) instead of none.
    keep_count = max(
        0, vocabulary_size - cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1
    )

    # A set makes the per-token membership test O(1); the original list
    # lookup scanned the kept vocabulary for every token in the corpus.
    most_common_words = {item.word for item in ranked[:keep_count]}

    unknown_token = cls.UNKNOWN_TOKEN
    return [
        [word if word in most_common_words else unknown_token
         for word in sentence]
        for sentence in tokenized_corpus
    ]
评论列表
文章目录