def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    """Tokenize question pairs into padded integer sequences and cache them.

    Fits a Keras ``Tokenizer`` on all four text lists combined (so train and
    test share a single vocabulary), converts each list to integer id
    sequences padded/truncated to ``TrainConfig.MAX_SEQUENCE_LENGTH``, and
    caches the four arrays plus the tokenizer's word index to the paths
    configured in ``DirConfig``.

    Args:
        train_ori1: list of question-1 training texts.
        train_ori2: list of question-2 training texts.
        test_ori1: list of question-1 test texts.
        test_ori2: list of question-2 test texts.

    Returns:
        Tuple ``(train_x1, train_x2, test_x1, test_x2, word_index)`` where
        the first four are 2-D int arrays of shape
        ``(n_samples, MAX_SEQUENCE_LENGTH)`` and ``word_index`` maps
        word -> integer id.
    """
    # Fit tokenizer on the full corpus so ids are consistent across splits.
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    def _to_padded(texts):
        # Texts -> id sequences -> fixed-length (MAX_SEQUENCE_LENGTH) array.
        seqs = tk.texts_to_sequences(texts)
        return pad_sequences(seqs, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    train_x1 = _to_padded(train_ori1)
    train_x2 = _to_padded(train_ori2)
    test_x1 = _to_padded(test_ori1)
    test_x2 = _to_padded(test_ori2)

    # Cache everything. `with` closes each handle (the original leaked the
    # file objects returned by open()). We keep passing a file object to
    # np.save so the configured cache paths are used verbatim (np.save only
    # appends ".npy" when given a plain path string).
    cache_items = [
        (DirConfig.Q1_CACHE_TRAIN, train_x1),
        (DirConfig.Q2_CACHE_TRAIN, train_x2),
        (DirConfig.Q1_CACHE_TEST, test_x1),
        (DirConfig.Q2_CACHE_TEST, test_x2),
        # NOTE(review): word_index is a dict; np.save pickles it, so loading
        # that cache back requires np.load(..., allow_pickle=True).
        (DirConfig.WORD_INDEX_CACHE, word_index),
    ]
    for path, data in cache_items:
        with open(path, 'wb') as f:
            np.save(f, data)

    return train_x1, train_x2, test_x1, test_x2, word_index
# (removed non-code blog-page residue — "评论列表"/"文章目录" navigation text
#  left over from a web export; it made the file invalid Python)