data_util.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:BiMPM_keras 作者: ijinmao 项目源码 文件源码
def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    # q1, q2 training text sequence
    # (sentence_len, MAX_SEQUENCE_LENGTH)
    train_x1 = tk.texts_to_sequences(train_ori1)
    train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    train_x2 = tk.texts_to_sequences(train_ori2)
    train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 testing text sequence
    test_x1 = tk.texts_to_sequences(test_ori1)
    test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    test_x2 = tk.texts_to_sequences(test_ori2)
    test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1)
    np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2)
    np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1)
    np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2)
    np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index)
    return train_x1, train_x2, test_x1, test_x2, word_index
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号