def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    """Tokenize question pairs into padded integer sequences and cache them.

    Fits a Keras ``Tokenizer`` on all four text lists combined (so train and
    test share a single vocabulary), converts each list to integer id
    sequences padded/truncated to ``TrainConfig.MAX_SEQUENCE_LENGTH``, and
    caches the four arrays plus the tokenizer's word index to the paths
    configured in ``DirConfig``.

    Args:
        train_ori1: list of question-1 training texts.
        train_ori2: list of question-2 training texts.
        test_ori1: list of question-1 test texts.
        test_ori2: list of question-2 test texts.

    Returns:
        Tuple ``(train_x1, train_x2, test_x1, test_x2, word_index)`` where
        the first four are 2-D int arrays of shape
        ``(n_samples, MAX_SEQUENCE_LENGTH)`` and ``word_index`` maps
        word -> integer id.
    """
    # Fit tokenizer on the full corpus so ids are consistent across splits.
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    def _to_padded(texts):
        # Texts -> id sequences -> fixed-length (MAX_SEQUENCE_LENGTH) array.
        seqs = tk.texts_to_sequences(texts)
        return pad_sequences(seqs, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    train_x1 = _to_padded(train_ori1)
    train_x2 = _to_padded(train_ori2)
    test_x1 = _to_padded(test_ori1)
    test_x2 = _to_padded(test_ori2)

    # Cache everything. `with` closes each handle (the original leaked the
    # file objects returned by open()). We keep passing a file object to
    # np.save so the configured cache paths are used verbatim (np.save only
    # appends ".npy" when given a plain path string).
    cache_items = [
        (DirConfig.Q1_CACHE_TRAIN, train_x1),
        (DirConfig.Q2_CACHE_TRAIN, train_x2),
        (DirConfig.Q1_CACHE_TEST, test_x1),
        (DirConfig.Q2_CACHE_TEST, test_x2),
        # NOTE(review): word_index is a dict; np.save pickles it, so loading
        # that cache back requires np.load(..., allow_pickle=True).
        (DirConfig.WORD_INDEX_CACHE, word_index),
    ]
    for path, data in cache_items:
        with open(path, 'wb') as f:
            np.save(f, data)

    return train_x1, train_x2, test_x1, test_x2, word_index
# (removed non-code blog-page residue — "评论列表"/"文章目录" navigation text
#  left over from a web export; it made the file invalid Python)