python类pad_sequences()的实例源码-面圈网

tfRNN.py 文件源码项目：SNLI-Keras 作者: adamzjk 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def prep_data(self):
    # 1, Read raw Training,Validation and Test data
    self.train,self.validation,self.test = self.load_data()

    # 2, Prep Word Indexer: assign each word a number
    self.indexer = Tokenizer(lower=False, filters='')
    self.indexer.fit_on_texts(self.train[0] + self.train[1]) # todo remove test
    self.Vocab = len(self.indexer.word_counts) + 1

    # 3, Convert each word in sent to num and zero pad
    def padding(x, MaxLen):
      return pad_sequences(sequences=self.indexer.texts_to_sequences(x), maxlen=MaxLen)
    def pad_data(x):
      return padding(x[0], self.SentMaxLen), padding(x[1], self.SentMaxLen), x[2]

    self.train = pad_data(self.train)
    self.validation = pad_data(self.validation)
    self.test = pad_data(self.test)

lstm.py 文件源码项目：hyperas 作者: maxpumperla 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def data():
    maxlen = 100
    max_features = 20000

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, X_test, y_train, y_test, max_features, maxlen

pos-tagging-explore.py 文件源码项目：Deep-Learning-with-Keras 作者: PacktPublishing 项目源码文件源码阅读 37 收藏 0 点赞 0 评论 0

def build_tensor(filename, numrecs, word2index, maxlen, 
                 make_categorical=False):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().split():
            if word2index.has_key(word):
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(
                wids, num_classes=len(word2index))
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

pos_tagging_gru.py 文件源码项目：Deep-Learning-with-Keras 作者: PacktPublishing 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def generate_batch(s_sents, s_word2index, t_sents, t_word2index, 
                   batch_size, maxlen):
    while True:
        # shuffle the input
        indices = np.random.permutation(np.arange(len(s_sents)))
        ss_sents = [s_sents[ix] for ix in indices]
        ts_sents = [t_sents[ix] for ix in indices]
        # convert to word indices
        si_sents = [[get_or_else(s_word2index, word, s_word2index["UNK"]) 
                    for word in sent] 
                    for sent in ss_sents]
        ti_sents = [[t_word2index[word] for word in sent]
                    for sent in ts_sents]
        # inner loop should run for an epoch
        num_batches = len(s_sents) // batch_size
        for i in range(num_batches):
            s_batch = si_sents[i * batch_size : (i + 1) * batch_size]
            t_batch = ti_sents[i * batch_size : (i + 1) * batch_size]
            sp_batch = sequence.pad_sequences(s_batch, maxlen=maxlen)
            tp_batch = sequence.pad_sequences(t_batch, maxlen=maxlen)
            tpc_batch = np_utils.to_categorical(tp_batch.reshape(-1, 1), 
                num_classes=len(t_word2index)).reshape(batch_size, 
                -1, len(t_word2index))
            yield sp_batch, tpc_batch

tfRNN.py 文件源码项目：SNLI-Keras 作者: adamzjk 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def label_test_file(self):
    outfile = open("pred_vld.txt","w")
    prep_alfa = lambda X: pad_sequences(sequences=self.indexer.texts_to_sequences(X),
                                        maxlen=self.SentMaxLen)
    vld = json.loads(open('validation.json', 'r').read())
    for prem, hypo, label in zip(vld[0], vld[1], vld[2]):
      prem_pad, hypo_pad = prep_alfa([prem]), prep_alfa([hypo])
      ans = np.reshape(self.model.predict(x=[prem_pad, hypo_pad], batch_size = 1), -1)  # PREDICTION
      if np.argmax(ans) != label:
        outfile.write(prem + "\n" + hypo + "\n")
        outfile.write("Truth: " + self.rLabels[label] + "\n")
        outfile.write('Contradiction \t{:.1f}%\n'.format(float(ans[0]) * 100) +
                      'Neutral \t\t{:.1f}%\n'.format(float(ans[1]) * 100) +
                      'Entailment \t{:.1f}%\n'.format(float(ans[2]) * 100))
        outfile.write("-"*15 + "\n")
    outfile.close()

QnARecurAtteLatest2GRU.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest2GRU.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest3Atten.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest3Atten.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest2GRUUnidirect.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest2Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest2Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest2GRU1SATTE.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest2GRU1SATTE.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnA.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# Note: Need to download and unzip Glove pre-train model files into same file as this script

QnARecurAtteLatest1Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest1Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest3Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset

QnARecurAtteLatest3Attenenhance.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

QnARecurAtteLatest.py 文件源码项目：recurrent-attention-for-QA-SQUAD-based-on-keras 作者: wentaozhu 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective index and pad context to max context length and question to max question length.
       Answers vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

data_util.py 文件源码项目：BiMPM_keras 作者: ijinmao 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    # q1, q2 training text sequence
    # (sentence_len, MAX_SEQUENCE_LENGTH)
    train_x1 = tk.texts_to_sequences(train_ori1)
    train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    train_x2 = tk.texts_to_sequences(train_ori2)
    train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 testing text sequence
    test_x1 = tk.texts_to_sequences(test_ori1)
    test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    test_x2 = tk.texts_to_sequences(test_ori2)
    test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1)
    np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2)
    np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1)
    np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2)
    np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index)
    return train_x1, train_x2, test_x1, test_x2, word_index

data_util.py 文件源码项目：BiMPM_keras 作者: ijinmao 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs

preprocessors.py 文件源码项目：keras-image-captioning 作者: danieljl 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def preprocess_batch(self, captions_label_encoded):
        captions = keras_seq.pad_sequences(captions_label_encoded,
                                           padding='post')
        # Because the number of timesteps/words resulted by the model is
        # maxlen(captions) + 1 (because the first "word" is the image).
        captions_extended1 = keras_seq.pad_sequences(captions,
                                                maxlen=captions.shape[-1] + 1,
                                                padding='post')
        captions_one_hot = map(self._tokenizer.sequences_to_matrix,
                               np.expand_dims(captions_extended1, -1))
        captions_one_hot = np.array(captions_one_hot, dtype='int')

        # Decrease/shift word index by 1.
        # Shifting `captions_one_hot` makes the padding word
        # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
        # so its cross entropy loss will be zero.
        captions_decreased = captions.copy()
        captions_decreased[captions_decreased > 0] -= 1
        captions_one_hot_shifted = captions_one_hot[:, :, 1:]

        captions_input = captions_decreased
        captions_output = captions_one_hot_shifted
        return captions_input, captions_output

parse_features.py 文件源码项目：genre-erkennung-pipeline 作者: amirothman 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def build_vectors(keyword="",data_label="",lower_limit=None,upper_limit=None,folder_path="dataset"):
    # training
    training_vector,labels,maxlen_training = create_dataset(dataset_path = folder_path+"/train",keyword=keyword,lower_limit=lower_limit,upper_limit=upper_limit)

    # validation
    evaluation_training_vector,evaluation_labels,maxlen_evaluation = create_dataset(dataset_path = "{0}/test".format(folder_path),keyword=keyword,lower_limit=lower_limit,upper_limit=upper_limit)

    # # X_training
    training_vector = sequence.pad_sequences(training_vector, maxlen=np.max([maxlen_training,maxlen_evaluation]),dtype='float32')
    pickle.dump(training_vector,open("pickled_vectors/{1}{0}_training_vector.pickle".format(keyword,data_label),"wb"))
    #
    # # y
    #
    pickle.dump(labels,open("pickled_vectors/{1}{0}_label.pickle".format(keyword,data_label),"wb"))
    #
    #
    # # evaluation
    evaluation_training_vector = sequence.pad_sequences(evaluation_training_vector, maxlen=np.max([maxlen_training,maxlen_evaluation]),dtype='float32')
    pickle.dump(evaluation_training_vector,open("pickled_vectors/{1}{0}_evaluation_training_vector.pickle".format(keyword,data_label),"wb"))
    #
    # # evaluation
    pickle.dump(evaluation_labels,open("pickled_vectors/{1}{0}_evaluation_label.pickle".format(keyword,data_label),"wb"))
    with(open("maxlen_{0}".format(keyword),"w")) as _f:
        _f.write(str(np.max([maxlen_training,maxlen_evaluation])))

test_sequence.py 文件源码项目：keras 作者: GeekLiB 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def test_pad_sequences():
    a = [[1], [1, 2], [1, 2, 3]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[0, 1], [1, 2], [2, 3]])
    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[0, 1], [1, 2], [1, 2]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])

__init__.py 文件源码项目：text-classification-with-convnets 作者: osmanbaskaya 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def testset_read(fn, word_idx, maxlen):
    total_num_of_unk = 0
    tokenizer = TreebankWordTokenizer()
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    X = []
    sentences = []
    for line in lines:
        s = []
        for token in tokenizer.tokenize(line):
            idx = word_idx.get(token, 1)  # 1 is UNKNOWN word id
            if idx == 1:
                total_num_of_unk += 1
            s.append(idx)
        X.append(s)
        sentences.append(line)

    X = sequence.pad_sequences(X, maxlen=maxlen)

    print >> sys.stderr, "Total number of UNK={}, Avg. {}".format(total_num_of_unk, total_num_of_unk / float(len(sentences)))
    return X, sentences

conll2000.py 文件源码项目：keras-contrib 作者: farizrahman4u 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_pos = pad_sequences(y_pos, maxlen, value=-1)  # lef padded with -1. Indeed, any interger works as it will be masked
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk

data_util.py 文件源码项目：quora_duplicate 作者: ijinmao 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    # q1, q2 training text sequence
    # (sentence_len, MAX_SEQUENCE_LENGTH)
    train_x1 = tk.texts_to_sequences(train_ori1)
    train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    train_x2 = tk.texts_to_sequences(train_ori2)
    train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 testing text sequence
    test_x1 = tk.texts_to_sequences(test_ori1)
    test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    test_x2 = tk.texts_to_sequences(test_ori2)
    test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1)
    np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2)
    np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1)
    np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2)
    np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index)
    return train_x1, train_x2, test_x1, test_x2, word_index

data_util.py 文件源码项目：quora_duplicate 作者: ijinmao 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs

as_reader_tf.py 文件源码项目：attention-sum-reader 作者: cairoHy 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def preprocess_input_sequences(self, data, shuffle=True):
        """
        ??????
        shuffle
        PAD/TRUNC????????
        y_true????self.A_len????index=0??????one-hot??
        """
        documents, questions, answer, candidates = self.union_shuffle(data) if shuffle else data
        d_lens = [len(i) for i in documents]

        questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post")
        documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post")
        context_mask = K.eval(tf.sequence_mask(d_lens, self.d_len, dtype=tf.float32))
        candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post")
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, context_mask, candidates_ok, y_true