def tokenize_sentences(self):
    # Tokenize the sentences into words and count the word frequencies,
    # then keep the most common words and build the index_to_word and
    # word_to_index lookups.
    self.tokenized_sentences = [nltk.word_tokenize(sent)
                                for sent in self.sentences]
    word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq))
    # Keep the most frequent words; one slot is reserved for the unknown token.
    vocab = word_freq.most_common(self.vocabulary_size - 1)
    self.index_to_word = [w for w, _ in vocab]
    self.index_to_word.append(self.unknown_token)
    self.word_to_index = {w: i for i, w in enumerate(self.index_to_word)}
    print("Using vocabulary size %d." % self.vocabulary_size)
    print("The least frequent word is '%s' appearing %d times."
          % (vocab[-1][0], vocab[-1][1]))
    # Replace all words not in our vocabulary with the unknown token.
    for i, sent in enumerate(self.tokenized_sentences):
        self.tokenized_sentences[i] = [w if w in self.word_to_index
                                       else self.unknown_token
                                       for w in sent]
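A minimal standalone sketch of the same vocabulary-capping idea on toy data, outside the class; the sentence list, the vocabulary size of 4, and the UNKNOWN_TOKEN string below are made up for illustration (the method above assumes nltk and itertools are imported at module level):

import itertools
import nltk

# nltk.download("punkt")  # uncomment on first run to fetch the tokenizer models

sentences = ["the cat sat on the mat", "the cat chased the dog"]
tokenized = [nltk.word_tokenize(s) for s in sentences]
word_freq = nltk.FreqDist(itertools.chain(*tokenized))

vocabulary_size = 4
unknown_token = "UNKNOWN_TOKEN"
# Keep the (vocabulary_size - 1) most frequent words; the last slot goes to the unknown token.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [w for w, _ in vocab] + [unknown_token]
word_to_index = {w: i for i, w in enumerate(index_to_word)}

# Map every out-of-vocabulary word to the unknown token.
tokenized = [[w if w in word_to_index else unknown_token for w in sent]
             for sent in tokenized]
print(index_to_word)  # e.g. ['the', 'cat', ..., 'UNKNOWN_TOKEN']
print(tokenized)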