def prepare_train_data(self):
    """Build the indexed corpus, split it into train/validation sets and cache it.

    Loads texts and labels via ``load_corpus()``, derives the vocabulary and
    per-text word lists, stores the vocabulary on the instance through
    ``set_volcabulary``, and converts the word lists to an index array with
    ``get_word_index``. Row positions are then split 80/20 with a fixed
    ``random_state`` so the split is reproducible across runs.

    Side effects:
        * Prints the shapes of the train and validation arrays.
        * Pickles the full, unsplit ``(words_index, labels)`` pair to
          ``output/zh_comments.pkl``. The ``output`` directory is not
          created here, so it must already exist.

    Returns:
        tuple: ``(train_data, train_labels, valid_data, valid_labels)``.
    """
    texts, labels = load_corpus()
    volcabulary, train_words = get_volcabulary_and_list_words(texts)
    self.set_volcabulary(volcabulary)
    # Drop the local references early; from here on the vocabulary is read
    # from self.volcabulary (presumably populated by set_volcabulary above).
    del volcabulary, texts

    words_index = self.get_word_index(
        train_words, self.volcabulary, self.max_words, self.max_length)

    # Split row positions rather than the data itself so the same partition
    # can be applied to both the index array and the labels.
    index = np.arange(words_index.shape[0])
    train_index, valid_index = train_test_split(
        index, train_size=0.8, random_state=520)

    train_data = words_index[train_index]
    valid_data = words_index[valid_index]

    # Labels must be an ndarray to support index-array selection.
    labels = np.asarray(labels)
    train_labels = labels[train_index]
    valid_labels = labels[valid_index]

    print(train_data.shape)
    print(valid_data.shape)

    # Use a context manager so the file is flushed and closed deterministically
    # instead of leaking the handle until garbage collection.
    with open("output/zh_comments.pkl", 'wb') as cache_file:
        pickle.dump((words_index, labels), cache_file)

    return train_data, train_labels, valid_data, valid_labels
评论列表
文章目录