python类PAD_ID的实例源码

seq2seq_model.py 文件源码 项目:seq2seq_parser 作者: trangham283 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def get_decode_batch(self, data, bucket_id):
    """Build one batch out of every sample of a bucket, in stored order.

    No sampling happens here: each (encoder_input, decoder_input) pair found
    in data[bucket_id] is used exactly once, so the batch size equals the
    number of samples stored for that bucket.

    Args:
      data: indexable by bucket id; data[bucket_id] is a sequence of
        (encoder_input, decoder_input) pairs of token-id lists.
      bucket_id: integer, index into both self.buckets and data.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights). Each is a
      list holding one array per time step; every array has one entry per
      sample of the bucket.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    this_batch_size = len(data[bucket_id])

    # Length-major rows: one padded token list per sample.
    encoder_inputs, decoder_inputs = [], []
    for source_ids, target_ids in data[bucket_id]:
      # Source side: pad up to the bucket length, then flip the sequence.
      source_padding = [data_utils.PAD_ID] * (encoder_size - len(source_ids))
      encoder_inputs.append(list(reversed(source_ids + source_padding)))

      # Target side: GO first, then the tokens, then padding. One slot of
      # the bucket length is taken by GO.
      target_padding = [data_utils.PAD_ID] * (decoder_size - len(target_ids) - 1)
      decoder_inputs.append([data_utils.GO_ID] + target_ids + target_padding)

    # Re-index to batch-major: one int32 vector per encoder time step.
    batch_encoder_inputs = [
        np.array([row[step] for row in encoder_inputs], dtype=np.int32)
        for step in xrange(encoder_size)]

    batch_decoder_inputs, batch_weights = [], []
    final_step = decoder_size - 1
    for step in xrange(decoder_size):
      batch_decoder_inputs.append(
          np.array([row[step] for row in decoder_inputs], dtype=np.int32))

      # The target of a step is the decoder input one position later. A
      # weight drops to 0 when that target is PAD, and always at the final
      # step, which has no target.
      step_weights = np.ones(this_batch_size, dtype=np.float32)
      for row_idx, row in enumerate(decoder_inputs):
        if step == final_step or row[step + 1] == data_utils.PAD_ID:
          step_weights[row_idx] = 0.0
      batch_weights.append(step_weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
seq2seq_model.py 文件源码 项目:seq2seq_parser 作者: trangham283 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Sample a random batch from one bucket and lay it out batch-major.

    self.batch_size pairs are drawn with random.choice (so with replacement)
    from data[bucket_id]. Each pair is padded to the bucket's sizes, and the
    result is re-indexed so that every returned array covers one time step
    across the whole batch, which is the layout step(...) is fed with.

    Args:
      data: a tuple of size len(self.buckets); element i holds the
        (input, output) token-id list pairs belonging to bucket i.
      bucket_id: integer, which bucket to draw the batch from.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights): lists of
      per-time-step arrays of length self.batch_size.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]

    def time_major(rows, num_steps):
      # One int32 vector per time step holding that step's token of each row.
      return [np.array([row[step] for row in rows], dtype=np.int32)
              for step in xrange(num_steps)]

    encoder_inputs, decoder_inputs = [], []
    for _ in xrange(self.batch_size):
      source_ids, target_ids = random.choice(data[bucket_id])

      # Encoder row: pad to the bucket length, then reverse.
      source_padding = [data_utils.PAD_ID] * (encoder_size - len(source_ids))
      encoder_inputs.append(list(reversed(source_ids + source_padding)))

      # Decoder row: GO symbol, the tokens, then padding (GO uses one slot).
      num_target_pads = decoder_size - len(target_ids) - 1
      decoder_inputs.append(
          [data_utils.GO_ID] + target_ids + [data_utils.PAD_ID] * num_target_pads)

    batch_encoder_inputs = time_major(encoder_inputs, encoder_size)
    batch_decoder_inputs = time_major(decoder_inputs, decoder_size)

    # Target weights: the target of step t is the decoder input at t + 1.
    batch_weights = []
    for step in xrange(decoder_size):
      weights = np.ones(self.batch_size, dtype=np.float32)
      has_target = step < decoder_size - 1
      for row_idx, row in enumerate(decoder_inputs):
        # Zero weight when there is no target or the target is padding.
        if not has_target or row[step + 1] == data_utils.PAD_ID:
          weights[row_idx] = 0.0
      batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
seq2seq_model.py 文件源码 项目:seq2seq_parser 作者: trangham283 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def get_mix_batch(self, bucketed_data, bucket_id, this_batch_size):
    """Sample a batch of a caller-chosen size from an already-selected bucket.

    Pairs are drawn with random.choice directly from bucketed_data;
    bucket_id is only used to look up the padding sizes in self.buckets.

    Args:
      bucketed_data: sequence of (encoder_input, decoder_input) pairs of
        token-id lists to sample from.
      bucket_id: integer, index into self.buckets giving the
        (encoder_size, decoder_size) to pad to.
      this_batch_size: integer, number of pairs to draw.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights): lists of
      per-time-step arrays, each of length this_batch_size.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]

    # Draw the samples and pad every one of them to the bucket's shape.
    encoder_inputs = []
    decoder_inputs = []
    for _ in xrange(this_batch_size):
      source_ids, target_ids = random.choice(bucketed_data)

      # Padded first, reversed afterwards.
      missing_source = encoder_size - len(source_ids)
      padded_source = source_ids + [data_utils.PAD_ID] * missing_source
      encoder_inputs.append(list(reversed(padded_source)))

      # GO symbol in front, padding behind; GO occupies one decoder slot.
      missing_target = decoder_size - len(target_ids) - 1
      decoder_inputs.append(
          [data_utils.GO_ID] + target_ids + [data_utils.PAD_ID] * missing_target)

    # Encoder side re-indexed so that each array is one time step.
    batch_encoder_inputs = []
    for step in xrange(encoder_size):
      tokens_at_step = [row[step] for row in encoder_inputs]
      batch_encoder_inputs.append(np.array(tokens_at_step, dtype=np.int32))

    # Decoder side re-indexed the same way, plus the matching loss weights.
    batch_decoder_inputs = []
    batch_weights = []
    for step in xrange(decoder_size):
      tokens_at_step = [row[step] for row in decoder_inputs]
      batch_decoder_inputs.append(np.array(tokens_at_step, dtype=np.int32))

      step_weights = np.ones(this_batch_size, dtype=np.float32)
      if step == decoder_size - 1:
        # The final step has no following token to predict.
        step_weights[:] = 0.0
      else:
        # The target is the decoder input shifted one step forward; padding
        # targets get weight 0.
        for row_idx, row in enumerate(decoder_inputs):
          if row[step + 1] == data_utils.PAD_ID:
            step_weights[row_idx] = 0.0
      batch_weights.append(step_weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
ranker.py 文件源码 项目:deeplearning4chatbot 作者: liangjz92 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def train2vec(self, dialogs, iters):
        """Turn a batch of dialogue sessions into padded, batch-major arrays.

        Every session is first truncated IN PLACE (the caller's list is
        modified) to at most 2 * self.get_max(iters) turns. Sentences are
        padded with data_utils.PAD_ID to self.max_sentence_size and reversed;
        sessions are padded to self.max_dialogue_size turns.

        Args:
            dialogs: list of sessions. Each session is a list of turns where
                turn[0] is the token-id list used as the history sentence
                (and, on odd turns, as the true candidate) and turn[1] is the
                false candidate read on odd turns.
            iters: passed unchanged to self.get_max to obtain the turn limit.

        Returns:
            (batch_history, batch_true, batch_false), or (None, None, None)
            when dialogs is None or empty. batch_history has one entry per
            turn slot; batch_true / batch_false have one entry per odd turn
            slot. Each entry is a list of self.max_sentence_size arrays with
            one element per session.
        """
        # Reject None before anything dereferences it; len(None) below would
        # raise TypeError and the guard would never be reached.
        if dialogs is None:
            return None, None, None
        batch_size = len(dialogs)
        max_border = self.get_max(iters)
        if batch_size == 0:
            return None, None, None

        # Truncate each session in place to the current turn limit.
        for i in range(batch_size):
            border = min(len(dialogs[i]), max_border * 2)
            dialogs[i] = dialogs[i][:border]

        dialogue_len = self.max_dialogue_size
        sentence_len = self.max_sentence_size

        def pad_reversed(tokens):
            # Pad to the sentence length first, then reverse the sequence.
            padding = [data_utils.PAD_ID] * (sentence_len - len(tokens))
            return list(reversed(tokens + padding))

        history_inputs, true_inputs, false_inputs = [], [], []
        for i in range(batch_size):
            one_session = dialogs[i]
            num_turns = len(one_session)

            # History: one sentence per turn slot, all-PAD for missing turns.
            history_inputs.append([
                pad_reversed(one_session[j][0]) if j < num_turns
                else [data_utils.PAD_ID] * sentence_len
                for j in range(dialogue_len)])

            # Candidates exist only on odd turn slots (1, 3, 5, ...).
            true_cache, false_cache = [], []
            for j in range(1, dialogue_len, 2):
                if j < num_turns:
                    true_cache.append(pad_reversed(one_session[j][0]))
                    false_cache.append(pad_reversed(one_session[j][1]))
                else:
                    # NOTE(review): missing true candidates are filled with
                    # EOS_ID while false ones use PAD_ID; presumably
                    # intentional, confirm with the model code.
                    true_cache.append([data_utils.EOS_ID] * sentence_len)
                    false_cache.append([data_utils.PAD_ID] * sentence_len)
            true_inputs.append(true_cache)
            false_inputs.append(false_cache)

        def batch_major(inputs, slot):
            # One array per token position, spanning all sessions.
            return [np.array([session[slot][pos] for session in inputs])
                    for pos in range(sentence_len)]

        batch_history, batch_true, batch_false = [], [], []
        for sent_index in range(dialogue_len):
            batch_history.append(batch_major(history_inputs, sent_index))
            if sent_index % 2 != 0:
                # Odd turn k is stored at candidate slot k // 2.
                slot = sent_index // 2
                batch_true.append(batch_major(true_inputs, slot))
                batch_false.append(batch_major(false_inputs, slot))

        return batch_history, batch_true, batch_false
ranker.py 文件源码 项目:deeplearning4chatbot 作者: liangjz92 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test2vec(self,history):
        #?????????????
        #???????????
        history_inputs =[]
        candidate_inputs =[]
        if (history ==None) or len(history)==0 : #??????
            return None,None
        #print(history)
        candidate_size = len(history[1])
        #print('candidate_size',candidate_size)
        cache = []
        for j in range(self.max_dialogue_size): #????????????????
            if j< len(history):
                encoder_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[j][0]))   #0??????
                cache.append(list(reversed(history[j][0]+encoder_pad))) #????
            else:
                cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
        history_inputs = cache
        #print(history_inputs)
        true_cache =[]
        for i in range(self.max_dialogue_size):   #candidate part
            if i %2==0: #?0,2,4,..??????
                continue
            if i<len(history):  #????????
                for j in range(candidate_size):
                    true_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[i][j]))
                    true_cache.append(list(reversed(history[i][j] + true_pad)))# true candidate
            else:
                for j in range(candidate_size):
                    true_cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
            candidate_inputs.append(true_cache)
            true_cache =[]


        ######################################################
        batch_history, batch_candidate = [], []

        for sent_index in range(self.max_dialogue_size):
            history_cache = []
            for length_index in range(self.max_sentence_size):
                history_cache.append(np.array( [history_inputs[sent_index][length_index]]))
            batch_history.append(history_cache)

            if sent_index % 2 != 0:
                candidate_cache = []
                for length_index in range(self.max_sentence_size):
                    candidate_cache.append(np.array([candidate_inputs[int(sent_index/2)][batch_index][length_index] for batch_index in range(candidate_size)]))
                batch_candidate.append(candidate_cache)
        return batch_history, batch_candidate

############################################################################
ranker_b.py 文件源码 项目:deeplearning4chatbot 作者: liangjz92 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def train2vec(self, dialogs, iters):
        """Build padded, batch-major training arrays from dialogue sessions.

        Each session is truncated IN PLACE (the caller's list is modified) to
        at most 2 * self.get_max(iters) turns. Sentences are padded with
        data_utils.PAD_ID up to self.max_sentence_size and then reversed;
        sessions are padded up to self.max_dialogue_size turns.

        Args:
            dialogs: list of sessions; a session is a list of turns. turn[0]
                is the token-id list used for the history and, on odd turns,
                for the true candidate; turn[1] is the false candidate read
                on odd turns.
            iters: forwarded to self.get_max, whose result limits the number
                of turns kept per session.

        Returns:
            (batch_history, batch_true, batch_false), or (None, None, None)
            when dialogs is None or empty. Each element of batch_history
            (one per turn slot) and of batch_true / batch_false (one per odd
            turn slot) is a list of self.max_sentence_size arrays holding one
            value per session.
        """
        # The None check has to come first: calling len() on None raises
        # TypeError, which would make the guard unreachable.
        if dialogs is None:
            return None, None, None
        batch_size = len(dialogs)
        max_border = self.get_max(iters)
        if batch_size == 0:
            return None, None, None

        # In-place truncation of every session to the allowed turn count.
        for i in range(batch_size):
            border = min(len(dialogs[i]), max_border * 2)
            dialogs[i] = dialogs[i][:border]

        dialogue_len = self.max_dialogue_size
        sentence_len = self.max_sentence_size

        def pad_reversed(tokens):
            # Append PAD up to the sentence length, then reverse.
            padding = [data_utils.PAD_ID] * (sentence_len - len(tokens))
            return list(reversed(tokens + padding))

        history_inputs, true_inputs, false_inputs = [], [], []
        for i in range(batch_size):
            one_session = dialogs[i]
            num_turns = len(one_session)

            # Turn slots beyond the session's length become all-PAD rows.
            history_inputs.append([
                pad_reversed(one_session[j][0]) if j < num_turns
                else [data_utils.PAD_ID] * sentence_len
                for j in range(dialogue_len)])

            # Only odd turn slots carry (true, false) candidate pairs.
            true_cache, false_cache = [], []
            for j in range(1, dialogue_len, 2):
                if j < num_turns:
                    true_cache.append(pad_reversed(one_session[j][0]))
                    false_cache.append(pad_reversed(one_session[j][1]))
                else:
                    # NOTE(review): the true filler uses EOS_ID and the false
                    # filler PAD_ID; looks deliberate, confirm with the
                    # model code.
                    true_cache.append([data_utils.EOS_ID] * sentence_len)
                    false_cache.append([data_utils.PAD_ID] * sentence_len)
            true_inputs.append(true_cache)
            false_inputs.append(false_cache)

        def batch_major(inputs, slot):
            # For one sentence slot: an array per token position over sessions.
            return [np.array([session[slot][pos] for session in inputs])
                    for pos in range(sentence_len)]

        batch_history, batch_true, batch_false = [], [], []
        for sent_index in range(dialogue_len):
            batch_history.append(batch_major(history_inputs, sent_index))
            if sent_index % 2 != 0:
                # Candidates of odd turn k live at index k // 2.
                slot = sent_index // 2
                batch_true.append(batch_major(true_inputs, slot))
                batch_false.append(batch_major(false_inputs, slot))

        return batch_history, batch_true, batch_false
ranker_c.py 文件源码 项目:deeplearning4chatbot 作者: liangjz92 项目源码 文件源码 阅读 13 收藏 0 点赞 0 评论 0
def train2vec(self, dialogs, iters):
        """Pad a batch of dialogue sessions and re-index it batch-major.

        Sessions are truncated IN PLACE (the caller's list is modified) to at
        most 2 * self.get_max(iters) turns. Each sentence is padded with
        data_utils.PAD_ID to self.max_sentence_size and reversed; each
        session is padded to self.max_dialogue_size turns.

        Args:
            dialogs: list of sessions, each a list of turns. turn[0] is the
                token-id list used for the history and, on odd turns, as the
                true candidate; turn[1] is read on odd turns as the false
                candidate.
            iters: argument handed to self.get_max to obtain the turn limit.

        Returns:
            (batch_history, batch_true, batch_false), or (None, None, None)
            when dialogs is None or empty. batch_history holds one entry per
            turn slot, batch_true / batch_false one per odd turn slot; every
            entry is a list of self.max_sentence_size arrays with one element
            per session.
        """
        # Handle None up front; len(None) would raise TypeError before any
        # later guard could run.
        if dialogs is None:
            return None, None, None
        batch_size = len(dialogs)
        max_border = self.get_max(iters)
        if batch_size == 0:
            return None, None, None

        # Cut every session down, in place, to the current turn limit.
        for i in range(batch_size):
            border = min(len(dialogs[i]), max_border * 2)
            dialogs[i] = dialogs[i][:border]

        dialogue_len = self.max_dialogue_size
        sentence_len = self.max_sentence_size

        def pad_reversed(tokens):
            # Padding is appended before the reversal.
            padding = [data_utils.PAD_ID] * (sentence_len - len(tokens))
            return list(reversed(tokens + padding))

        history_inputs, true_inputs, false_inputs = [], [], []
        for i in range(batch_size):
            one_session = dialogs[i]
            num_turns = len(one_session)

            # One history sentence per turn slot; missing turns are all PAD.
            history_inputs.append([
                pad_reversed(one_session[j][0]) if j < num_turns
                else [data_utils.PAD_ID] * sentence_len
                for j in range(dialogue_len)])

            # Candidate pairs are taken from odd turn slots only.
            true_cache, false_cache = [], []
            for j in range(1, dialogue_len, 2):
                if j < num_turns:
                    true_cache.append(pad_reversed(one_session[j][0]))
                    false_cache.append(pad_reversed(one_session[j][1]))
                else:
                    # NOTE(review): EOS_ID filler for the true side versus
                    # PAD_ID for the false side; presumably on purpose,
                    # confirm with the model code.
                    true_cache.append([data_utils.EOS_ID] * sentence_len)
                    false_cache.append([data_utils.PAD_ID] * sentence_len)
            true_inputs.append(true_cache)
            false_inputs.append(false_cache)

        def batch_major(inputs, slot):
            # An array per token position, each spanning all sessions.
            return [np.array([session[slot][pos] for session in inputs])
                    for pos in range(sentence_len)]

        batch_history, batch_true, batch_false = [], [], []
        for sent_index in range(dialogue_len):
            batch_history.append(batch_major(history_inputs, sent_index))
            if sent_index % 2 != 0:
                # Odd turn k maps to candidate index k // 2.
                slot = sent_index // 2
                batch_true.append(batch_major(true_inputs, slot))
                batch_false.append(batch_major(false_inputs, slot))

        return batch_history, batch_true, batch_false
ranker_c.py 文件源码 项目:deeplearning4chatbot 作者: liangjz92 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def test2vec(self,history):
        #?????????????
        #???????????
        history_inputs =[]
        candidate_inputs =[]
        if (history ==None) or len(history)==0 : #??????
            return None,None
        candidate_size = len(history[1])
        cache = []
        for j in range(self.max_dialogue_size): #????????????????
            if j< len(history):
                encoder_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[j][0]))   #0??????
                cache.append(list(reversed(history[j][0]+encoder_pad))) #????
            else:
                cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
        history_inputs = cache
        true_cache =[]
        for i in range(self.max_dialogue_size):   #candidate part
            if i %2==0: #?0,2,4,..??????
                continue
            if i<len(history):  #????????
                for j in range(candidate_size):
                    true_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[i][j]))
                    true_cache.append(list(reversed(history[i][j] + true_pad)))# true candidate
            else:
                for j in range(candidate_size):
                    true_cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
            candidate_inputs.append(true_cache)
            true_cache =[]


        ######################################################
        batch_history, batch_candidate = [], []

        for sent_index in range(self.max_dialogue_size):
            history_cache = []
            for length_index in range(self.max_sentence_size):
                history_cache.append(np.array( [history_inputs[sent_index][length_index]]))
            batch_history.append(history_cache)

            if sent_index % 2 != 0:
                candidate_cache = []
                for length_index in range(self.max_sentence_size):
                    candidate_cache.append(np.array([candidate_inputs[int(sent_index/2)][batch_index][length_index] for batch_index in range(candidate_size)]))
                batch_candidate.append(candidate_cache)
        return batch_history, batch_candidate

############################################################################
seq2seq_model.py 文件源码 项目:Seq2Seq-Tensorflow-1.0-Chatbot 作者: igorvishnevskiy 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Draw a random training batch from a bucket in the format step() takes.

    The stored examples are length-major (one token list per example), while
    step(...) is fed batch-major vectors (one vector per time step). This
    method samples self.batch_size examples with random.choice, pads them to
    the bucket's sizes and performs that re-indexing.

    Args:
      data: a tuple of size len(self.buckets); each element is a list of
        (input, output) token-id list pairs for the corresponding bucket.
      bucket_id: integer, the bucket to sample from.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights), each a
      list of arrays of length self.batch_size, one array per time step.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]

    encoder_inputs = []
    decoder_inputs = []
    for _ in xrange(self.batch_size):
      source_ids, target_ids = random.choice(data[bucket_id])

      # Encoder row is padded to the bucket length and then reversed.
      padded_source = source_ids + (
          [data_utils.PAD_ID] * (encoder_size - len(source_ids)))
      encoder_inputs.append(list(reversed(padded_source)))

      # Decoder row starts with GO; the remaining slots are the tokens
      # followed by padding.
      trailing_pads = [data_utils.PAD_ID] * (decoder_size - len(target_ids) - 1)
      decoder_inputs.append([data_utils.GO_ID] + target_ids + trailing_pads)

    # Batch-major encoder inputs: plain re-indexing of the rows above.
    batch_encoder_inputs = [
        np.array([row[position] for row in encoder_inputs], dtype=np.int32)
        for position in xrange(encoder_size)]

    # Batch-major decoder inputs, built together with the target weights.
    batch_decoder_inputs = []
    batch_weights = []
    last_position = decoder_size - 1
    for position in xrange(decoder_size):
      batch_decoder_inputs.append(
          np.array([row[position] for row in decoder_inputs], dtype=np.int32))

      position_weights = np.ones(self.batch_size, dtype=np.float32)
      if position == last_position:
        # Nothing follows the last decoder input, so it carries no target.
        position_weights[:] = 0.0
      else:
        for row_idx, row in enumerate(decoder_inputs):
          # The target is the next decoder input; PAD targets get weight 0.
          if row[position + 1] == data_utils.PAD_ID:
            position_weights[row_idx] = 0.0
      batch_weights.append(position_weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
parser.py 文件源码 项目:DeepDeepParser 作者: janmbuys 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def read_mrs_data(buckets, source_paths, target_paths, max_size=None,
    any_length=False, offset_target=-1):
  """Read parallel source/target id files and assign examples to buckets.

  Every path is loaded with data_utils.read_ids_file(path, max_size)
  (presumably yielding one token-id list per example; verify against
  data_utils). All files are assumed to be aligned example by example.

  Args:
    buckets: list of (source_size, target_size) pairs.
    source_paths: paths of the source-side files, one per source type.
    target_paths: paths of the target-side files, one per target type.
    max_size: forwarded to data_utils.read_ids_file.
    any_length: if True, examples fitting no bucket are cropped to the last
      bucket's sizes and stored there instead of being dropped.
    offset_target: if > 0, the target type at this index of target_paths is
      additionally prefixed with data_utils.PAD_ID.

  Returns:
    (data_set, data_list): data_set[b] is the list of
    [source_ids, target_ids] examples of bucket b; data_list is a flat list
    of [source_ids, target_ids, bucket_id] in reading order.
  """
  # Each file is read separately: sources first, then targets.
  source_inputs = [data_utils.read_ids_file(source_path, max_size)
                   for source_path in source_paths]
  target_inputs = [data_utils.read_ids_file(target_path, max_size)
                   for target_path in target_paths]

  data_set = [[] for _ in buckets]
  data_list = []
  for example_idx in xrange(len(source_inputs[0])):
    # One sequence per source type for this example.
    source_ids = [sequences[example_idx] for sequences in source_inputs]

    # The first target type is terminated with EOS; every other type gets a
    # trailing PAD. Pointer ranges are not validated here.
    target_ids = [target_inputs[0][example_idx] + [data_utils.EOS_ID]]
    for type_idx, sequences in enumerate(target_inputs[1:], 1):
      padded = sequences[example_idx] + [data_utils.PAD_ID]
      if offset_target > 0 and type_idx == offset_target:
        # The offset target type also receives a leading PAD.
        padded = [data_utils.PAD_ID] + padded
      target_ids.append(padded)

    # First bucket strictly larger than the first source and first target
    # sequences, or None when there is no such bucket.
    source_len, target_len = len(source_ids[0]), len(target_ids[0])
    bucket_id = next(
        (candidate for candidate, (source_size, target_size)
         in enumerate(buckets)
         if source_len < source_size and target_len < target_size),
        None)

    if bucket_id is None:
      if not any_length:
        continue
      # Crop examples that exceed the largest bucket.
      # NOTE(review): cropped sequences end up exactly as long as the bucket
      # size, whereas regular examples are strictly shorter; confirm that
      # consumers accept this.
      source_size, target_size = buckets[-1][0], buckets[-1][1]
      if source_len >= source_size:
        source_ids = [sequence[:source_size] for sequence in source_ids]
      if target_len >= target_size:
        target_ids = [sequence[:target_size] for sequence in target_ids]
      bucket_id = len(buckets) - 1

    data_set[bucket_id].append([source_ids, target_ids])
    data_list.append([source_ids, target_ids, bucket_id])
  return data_set, data_list
seq2seq_model.py 文件源码 项目:seq2seq-chinese-textsum 作者: zpppy 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Pick a random batch from the given bucket and prepare it for step().

    self.batch_size examples are chosen with random.choice from
    data[bucket_id], padded to the bucket's sizes and converted from
    length-major rows (one list per example) to batch-major vectors (one
    vector per time step).

    Args:
      data: a tuple of size len(self.buckets) whose elements are lists of
        (input, output) token-id list pairs.
      bucket_id: integer, which bucket to take the batch from.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights) in the
      format expected by step(...).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs = [], []

    for _ in xrange(self.batch_size):
      source_ids, target_ids = random.choice(data[bucket_id])

      # Pad the encoder side to the bucket length, then reverse it.
      num_source_pads = encoder_size - len(source_ids)
      encoder_inputs.append(
          list(reversed(source_ids + [data_utils.PAD_ID] * num_source_pads)))

      # Prepend GO on the decoder side and pad the rest of the bucket.
      num_target_pads = decoder_size - len(target_ids) - 1
      decoder_inputs.append(
          [data_utils.GO_ID] + target_ids + [data_utils.PAD_ID] * num_target_pads)

    # encoder_inputs is a list of batch_size rows of length encoder_size;
    # batch_encoder_inputs is a list of encoder_size arrays of shape
    # (batch_size,).
    batch_encoder_inputs = []
    for time_step in xrange(encoder_size):
      column = [row[time_step] for row in encoder_inputs]
      batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    # The decoder side is re-indexed the same way; batch_decoder_inputs and
    # batch_weights both become lists of decoder_size arrays of shape
    # (batch_size,).
    batch_decoder_inputs, batch_weights = [], []
    for time_step in xrange(decoder_size):
      column = [row[time_step] for row in decoder_inputs]
      batch_decoder_inputs.append(np.array(column, dtype=np.int32))

      # The target of a time step is the decoder input shifted one step
      # forward. Its weight is zeroed when that target is PAD or when the
      # step is the last one and therefore has no target.
      is_last_step = time_step == decoder_size - 1
      step_weights = np.ones(self.batch_size, dtype=np.float32)
      for row_idx, row in enumerate(decoder_inputs):
        if is_last_step or row[time_step + 1] == data_utils.PAD_ID:
          step_weights[row_idx] = 0.0
      batch_weights.append(step_weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
gst_rnn_model.py 文件源码 项目:Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow 作者: liuyuemaicha 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def get_batch(self, train_data, bucket_id):
        """Sample a random batch from one bucket and lay it out per time step.

        Args:
          train_data: indexable by bucket_id; the selected entry is a sequence
            of (encoder_input, decoder_input) pairs of token-id lists.
          bucket_id: index into self.buckets giving the
            (encoder_size, decoder_size) that samples are padded to.

        Returns:
          A 5-tuple (batch_encoder_inputs, batch_decoder_inputs,
          batch_weights, batch_source_encoder, batch_source_decoder).
          The first three are lists holding one numpy array of length
          self.batch_size per time step (int32 ids, int32 ids, float32
          weights). The last two are the sampled sequences exactly as drawn,
          before any padding.
        """
        enc_len, dec_len = self.buckets[bucket_id]
        pad_id, go_id = data_utils.PAD_ID, data_utils.GO_ID

        batch_source_encoder, batch_source_decoder = [], []
        enc_rows, dec_rows = [], []
        for _ in range(self.batch_size):
            src_tokens, tgt_tokens = random.choice(train_data[bucket_id])
            batch_source_encoder.append(src_tokens)
            batch_source_decoder.append(tgt_tokens)

            # Source side: pad up to the bucket length, then flip the order.
            src_padding = [pad_id] * (enc_len - len(src_tokens))
            enc_rows.append((src_tokens + src_padding)[::-1])

            # Target side: GO marker, the tokens, then PAD up to the bucket
            # length (one slot is taken by GO).
            tgt_padding = [pad_id] * (dec_len - len(tgt_tokens) - 1)
            dec_rows.append([go_id] + tgt_tokens + tgt_padding)

        # Re-index from one row per example to one array per time step.
        batch_encoder_inputs = [
            np.array([row[step] for row in enc_rows], dtype=np.int32)
            for step in range(enc_len)]
        batch_decoder_inputs = [
            np.array([row[step] for row in dec_rows], dtype=np.int32)
            for step in range(dec_len)]

        # The target for a step is the decoder input one step ahead. The
        # final step has no target, and PAD targets carry no weight.
        batch_weights = []
        final_step = dec_len - 1
        for step in range(dec_len):
            if step == final_step:
                step_weights = np.zeros(self.batch_size, dtype=np.float32)
            else:
                step_weights = np.array(
                    [0.0 if row[step + 1] == pad_id else 1.0
                     for row in dec_rows],
                    dtype=np.float32)
            batch_weights.append(step_weights)

        return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
                batch_source_encoder, batch_source_decoder)
grl_rnn_model.py 文件源码 项目:Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow 作者: liuyuemaicha 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def get_batch(self, train_data, bucket_id, type=0):
        """Build one padded batch from a bucket, laid out per time step.

        Args:
          train_data: indexable by bucket_id. What the selected entry must
            look like depends on `type`:
              0 or 1: a sequence of (encoder_input, decoder_input) pairs.
              2: its first element is a pair (encoder_inputs, decoder_input),
                 where encoder_inputs is indexed by batch position and the
                 single decoder_input is reused for every position.
          bucket_id: index into self.buckets giving the
            (encoder_size, decoder_size) that samples are padded to.
          type: how samples are picked (the name shadows the builtin but is
            kept because callers may pass it by keyword).
              0: each sample is drawn with random.choice.
              1: batch position i uses entry i of the bucket.
              2: see `train_data` above.

        Returns:
          A 5-tuple (batch_encoder_inputs, batch_decoder_inputs,
          batch_weights, batch_source_encoder, batch_source_decoder).
          The first three are lists holding one numpy array of length
          self.batch_size per time step (int32 ids, int32 ids, float32
          weights). The last two are the selected sequences before padding.

        Raises:
          ValueError: if `type` is not 0, 1 or 2.
        """
        # Fail fast on an unknown mode. Without this check the selection
        # chain below matches nothing and the first use of encoder_input
        # fails with an unrelated-looking unbound-variable error.
        if type not in (0, 1, 2):
            raise ValueError(
                "unsupported batch type %r (expected 0, 1 or 2)" % (type,))

        encoder_size, decoder_size = self.buckets[bucket_id]
        encoder_inputs, decoder_inputs = [], []
        batch_source_encoder, batch_source_decoder = [], []

        for batch_i in range(self.batch_size):
            if type == 1:
                # feed_data = {bucket_id: zip(tokens_a, tokens_b)}
                encoder_input, decoder_input = train_data[bucket_id][batch_i]
            elif type == 2:
                # feed_data = {bucket_id: [(resp_tokens, [])]}
                encoder_input_a, decoder_input = train_data[bucket_id][0]
                encoder_input = encoder_input_a[batch_i]
            else:
                encoder_input, decoder_input = random.choice(train_data[bucket_id])
                # NOTE(review): per-sample output on the random-sampling path;
                # kept because it is existing runtime behavior.
                print("train en: %s, de: %s" % (encoder_input, decoder_input))

            batch_source_encoder.append(encoder_input)
            batch_source_decoder.append(decoder_input)

            # Encoder inputs are padded and then reversed.
            encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
            encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))

            # Decoder inputs get a leading GO symbol and are then padded.
            decoder_pad_size = decoder_size - len(decoder_input) - 1
            decoder_inputs.append([data_utils.GO_ID] + decoder_input +
                                  [data_utils.PAD_ID] * decoder_pad_size)

        # Re-index from one row per example to one array per time step.
        batch_encoder_inputs = [
            np.array([row[length_idx] for row in encoder_inputs], dtype=np.int32)
            for length_idx in range(encoder_size)]

        batch_decoder_inputs, batch_weights = [], []
        last_idx = decoder_size - 1
        for length_idx in range(decoder_size):
            batch_decoder_inputs.append(
                np.array([row[length_idx] for row in decoder_inputs],
                         dtype=np.int32))

            # The target for a step is the decoder input shifted forward by
            # one. The last step has no target, and PAD targets get weight 0.
            if length_idx == last_idx:
                batch_weight = np.zeros(self.batch_size, dtype=np.float32)
            else:
                batch_weight = np.array(
                    [0.0 if row[length_idx + 1] == data_utils.PAD_ID else 1.0
                     for row in decoder_inputs],
                    dtype=np.float32)
            batch_weights.append(batch_weight)

        return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
                batch_source_encoder, batch_source_decoder)
seq2seq_model.py 文件源码 项目:tf_chatbot_seq2seq_antilm 作者: Marsan-Ma 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Draw a random batch from one bucket and format it for step(...).

    Samples in `data` are stored one sequence per example, whereas step(...)
    is fed one vector per time step covering the whole batch. This method
    samples self.batch_size pairs, pads them to the bucket's sizes and
    re-indexes them into that per-time-step layout.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of (input, output) pairs to sample from.
      bucket_id: integer, which bucket to build the batch for.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights); each is a
      list with one numpy array of length self.batch_size per time step.
    """
    enc_len, dec_len = self.buckets[bucket_id]
    pad_id, go_id = data_utils.PAD_ID, data_utils.GO_ID
    enc_rows, dec_rows = [], []

    for _ in range(self.batch_size):
      source, reply = random.choice(data[bucket_id])

      # Source: pad to the bucket length, then flip.
      enc_rows.append((source + [pad_id] * (enc_len - len(source)))[::-1])

      # Reply: GO first, then the tokens, then PAD for the remaining slots.
      dec_rows.append([go_id] + reply + [pad_id] * (dec_len - len(reply) - 1))

    # One int32 vector per time step, each spanning the whole batch.
    batch_encoder_inputs = [
        np.array([row[step] for row in enc_rows], dtype=np.int32)
        for step in range(enc_len)]
    batch_decoder_inputs = [
        np.array([row[step] for row in dec_rows], dtype=np.int32)
        for step in range(dec_len)]

    # A step's target is the decoder input one position later; the final
    # step has none, and PAD targets are weighted 0.
    batch_weights = []
    for step in range(dec_len):
      if step == dec_len - 1:
        step_weights = np.zeros(self.batch_size, dtype=np.float32)
      else:
        step_weights = np.array(
            [0.0 if row[step + 1] == pad_id else 1.0 for row in dec_rows],
            dtype=np.float32)
      batch_weights.append(step_weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
seq2seq_model.py 文件源码 项目:tf-tutorial 作者: zchen0211 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Assemble a randomly sampled batch for the given bucket.

    Each sampled (input, output) pair is padded to the bucket's sizes; the
    batch is then transposed so that every returned list element covers one
    time step across all self.batch_size examples, which is the layout
    step(...) is fed.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of pairs of input and output data to draw from.
      bucket_id: integer, the bucket to draw the batch from.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights) for the
      constructed batch, ready to be passed to step(...).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    padded_sources, padded_targets = [], []

    for _ in range(self.batch_size):
      src, tgt = random.choice(data[bucket_id])

      # Pad the source to the bucket length and store it back-to-front.
      src_row = src + [data_utils.PAD_ID] * (encoder_size - len(src))
      src_row.reverse()
      padded_sources.append(src_row)

      # The target row starts with GO; PAD fills whatever room is left.
      tgt_row = [data_utils.GO_ID] + tgt
      tgt_row += [data_utils.PAD_ID] * (decoder_size - len(tgt) - 1)
      padded_targets.append(tgt_row)

    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []

    # Transpose the encoder rows: one int32 vector per position.
    for pos in range(encoder_size):
      column = [seq[pos] for seq in padded_sources]
      batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    # Transpose the decoder rows and derive the per-position weights.
    for pos in range(decoder_size):
      column = [seq[pos] for seq in padded_targets]
      batch_decoder_inputs.append(np.array(column, dtype=np.int32))

      weights = np.ones(self.batch_size, dtype=np.float32)
      if pos + 1 >= decoder_size:
        # Nothing follows the last position, so it is never a target.
        weights[:] = 0.0
      else:
        # The target is the next decoder input; PAD targets weigh nothing.
        for row, seq in enumerate(padded_targets):
          if seq[pos + 1] == data_utils.PAD_ID:
            weights[row] = 0.0
      batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
seq2seq_model.py 文件源码 项目:dnnQuery 作者: richardxiong 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def get_batch(self, data, bucket_id):
    """Pick a random batch from a bucket and reshape it for step(...).

    `data` stores examples one sequence at a time, while step(...) is fed a
    list with one batch-wide vector per time step. The work here is sampling,
    padding to the bucket sizes, and re-indexing into that layout.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of pairs of input and output data used to create a batch.
      bucket_id: integer, which bucket to get the batch for.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights) for the
      constructed batch, in the format step(...) is called with.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    enc_examples, dec_examples = [], []

    def column(examples, t):
      # Token at time step t of every example, as one int32 vector.
      return np.array([ex[t] for ex in examples], dtype=np.int32)

    def weights_at(t):
      # The target at step t is the decoder input at t + 1. The last step has
      # no target, and PAD targets are masked out with weight 0.
      if t == decoder_size - 1:
        return np.zeros(self.batch_size, dtype=np.float32)
      mask = [0.0 if ex[t + 1] == data_utils.PAD_ID else 1.0
              for ex in dec_examples]
      return np.array(mask, dtype=np.float32)

    for _ in range(self.batch_size):
      question, answer = random.choice(data[bucket_id])

      # Encoder side: pad first, reverse afterwards.
      missing = encoder_size - len(question)
      enc_examples.append(
          list(reversed(question + [data_utils.PAD_ID] * missing)))

      # Decoder side: GO, the tokens, then PAD (GO uses one slot).
      missing = decoder_size - len(answer) - 1
      dec_examples.append(
          [data_utils.GO_ID] + answer + [data_utils.PAD_ID] * missing)

    batch_encoder_inputs = [column(enc_examples, t)
                            for t in range(encoder_size)]
    batch_decoder_inputs = [column(dec_examples, t)
                            for t in range(decoder_size)]
    batch_weights = [weights_at(t) for t in range(decoder_size)]
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights


问题


面经


文章

微信
公众号

扫码关注公众号