def get_decode_batch(self, data, bucket_id):
    """Build one batch out of every sample of a bucket, in stored order.

    Unlike a randomly sampled batch, the batch size here is simply the number
    of samples found in ``data[bucket_id]``.

    Args:
      data: indexable by bucket id; ``data[bucket_id]`` is a sized iterable of
        (encoder token list, decoder token list) pairs.
      bucket_id: index into ``self.buckets`` and ``data``.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has one entry per sample (int32 for the inputs, float32 for weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]
    samples = data[bucket_id]
    this_batch_size = len(samples)

    enc_rows, dec_rows = [], []
    for source, target in samples:
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(this_batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
python类PAD_ID的实例源码
def get_batch(self, data, bucket_id):
    """Sample a random batch from one bucket and lay it out per position.

    ``self.batch_size`` pairs are drawn (with replacement) from
    ``data[bucket_id]``. Encoder tokens are padded to the bucket size and
    reversed; decoder tokens are prefixed with GO and padded. The rows are
    then re-indexed so that entry ``t`` of each returned list holds position
    ``t`` of every batch member.

    Args:
      data: indexable by bucket id; each entry is a sequence of
        (encoder token list, decoder token list) pairs.
      bucket_id: integer, index into ``self.buckets`` and ``data``.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has ``self.batch_size`` entries (int32 for the inputs, float32 for the
      weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    for _ in range(self.batch_size):
        source, target = random.choice(data[bucket_id])
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_mix_batch(self, bucketed_data, bucket_id, this_batch_size):
    """Sample a random batch of a caller-chosen size from one bucket's data.

    Works like a regular random batch, except that the samples come straight
    from ``bucketed_data`` and the batch size is given by the caller.

    Args:
      bucketed_data: non-empty sequence of (encoder token list, decoder token
        list) pairs to draw from (with replacement).
      bucket_id: integer, index into ``self.buckets``; only used to look up
        the encoder and decoder sizes.
      this_batch_size: number of pairs to draw.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has ``this_batch_size`` entries (int32 for the inputs, float32 for the
      weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    for _ in range(this_batch_size):
        source, target = random.choice(bucketed_data)
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(this_batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def train2vec(self, dialogs, iters):
    """Turn a batch of training dialogues into padded, re-indexed arrays.

    Every dialogue is first truncated IN PLACE (``dialogs[i]`` is replaced by
    a slice) to at most ``2 * self.get_max(iters)`` turns. Element 0 of each
    turn is read as a token list; odd-numbered turns also supply a second
    token list at element 1, used here as the "false" candidate. Token lists
    are padded with PAD_ID up to ``self.max_sentence_size`` and reversed.
    Turns beyond the end of a dialogue are filled with constant rows.

    Args:
      dialogs: list of dialogues (each a list of turns), or None.
      iters: forwarded unchanged to ``self.get_max``.

    Returns:
      (batch_history, batch_true, batch_false), or (None, None, None) when
      ``dialogs`` is None or empty. ``batch_history[s][t]`` is a numpy array
      holding token ``t`` of turn ``s`` for every dialogue. ``batch_true`` and
      ``batch_false`` use the same layout but only contain the odd-numbered
      turns (``max_dialogue_size // 2`` entries).
    """
    # Checked before len() so that None yields the sentinel triple instead of
    # a TypeError.
    if dialogs is None:
        return None, None, None
    batch_size = len(dialogs)
    max_border = self.get_max(iters)
    if batch_size == 0:
        return None, None, None

    sent_len = self.max_sentence_size
    n_turns = self.max_dialogue_size

    def pad_and_reverse(tokens):
        # Pad on the right up to sent_len, then flip the sentence.
        padding = [data_utils.PAD_ID] * (sent_len - len(tokens))
        return list(reversed(tokens + padding))

    # Limit every dialogue to the allowed number of turns (mutates dialogs).
    for i in range(batch_size):
        border = min(len(dialogs[i]), max_border * 2)
        dialogs[i] = dialogs[i][:border]

    history_inputs, true_inputs, false_inputs = [], [], []
    for i in range(batch_size):
        one_session = dialogs[i]
        history_rows, true_rows, false_rows = [], [], []
        for j in range(n_turns):
            has_turn = j < len(one_session)
            if has_turn:
                history_rows.append(pad_and_reverse(one_session[j][0]))
            else:
                history_rows.append([data_utils.PAD_ID] * sent_len)
            if j % 2 == 0:
                # Only odd-numbered turns carry candidates.
                continue
            if has_turn:
                true_rows.append(pad_and_reverse(one_session[j][0]))
                false_rows.append(pad_and_reverse(one_session[j][1]))
            else:
                # NOTE(review): missing "true" turns are filled with EOS_ID
                # while "false" ones use PAD_ID; kept as found - confirm
                # this asymmetry is intentional.
                true_rows.append([data_utils.EOS_ID] * sent_len)
                false_rows.append([data_utils.PAD_ID] * sent_len)
        history_inputs.append(history_rows)
        true_inputs.append(true_rows)
        false_inputs.append(false_rows)

    # Re-index from [dialogue][turn][token] to [turn][token] -> array over
    # dialogues.
    n_dialogs = len(history_inputs)
    batch_history, batch_true, batch_false = [], [], []
    for sent_index in range(n_turns):
        batch_history.append([
            np.array([history_inputs[b][sent_index][t]
                      for b in range(n_dialogs)])
            for t in range(sent_len)
        ])
        if sent_index % 2 != 0:
            cand_index = sent_index // 2
            batch_true.append([
                np.array([true_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
            batch_false.append([
                np.array([false_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
    return batch_history, batch_true, batch_false
def test2vec(self,history):
#?????????????
#???????????
history_inputs =[]
candidate_inputs =[]
if (history ==None) or len(history)==0 : #??????
return None,None
#print(history)
candidate_size = len(history[1])
#print('candidate_size',candidate_size)
cache = []
for j in range(self.max_dialogue_size): #????????????????
if j< len(history):
encoder_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[j][0])) #0??????
cache.append(list(reversed(history[j][0]+encoder_pad))) #????
else:
cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
history_inputs = cache
#print(history_inputs)
true_cache =[]
for i in range(self.max_dialogue_size): #candidate part
if i %2==0: #?0,2,4,..??????
continue
if i<len(history): #????????
for j in range(candidate_size):
true_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[i][j]))
true_cache.append(list(reversed(history[i][j] + true_pad)))# true candidate
else:
for j in range(candidate_size):
true_cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
candidate_inputs.append(true_cache)
true_cache =[]
######################################################
batch_history, batch_candidate = [], []
for sent_index in range(self.max_dialogue_size):
history_cache = []
for length_index in range(self.max_sentence_size):
history_cache.append(np.array( [history_inputs[sent_index][length_index]]))
batch_history.append(history_cache)
if sent_index % 2 != 0:
candidate_cache = []
for length_index in range(self.max_sentence_size):
candidate_cache.append(np.array([candidate_inputs[int(sent_index/2)][batch_index][length_index] for batch_index in range(candidate_size)]))
batch_candidate.append(candidate_cache)
return batch_history, batch_candidate
############################################################################
def train2vec(self, dialogs, iters):
    """Turn a batch of training dialogues into padded, re-indexed arrays.

    Every dialogue is first truncated IN PLACE (``dialogs[i]`` is replaced by
    a slice) to at most ``2 * self.get_max(iters)`` turns. Element 0 of each
    turn is read as a token list; odd-numbered turns also supply a second
    token list at element 1, used here as the "false" candidate. Token lists
    are padded with PAD_ID up to ``self.max_sentence_size`` and reversed.
    Turns beyond the end of a dialogue are filled with constant rows.

    Args:
      dialogs: list of dialogues (each a list of turns), or None.
      iters: forwarded unchanged to ``self.get_max``.

    Returns:
      (batch_history, batch_true, batch_false), or (None, None, None) when
      ``dialogs`` is None or empty. ``batch_history[s][t]`` is a numpy array
      holding token ``t`` of turn ``s`` for every dialogue. ``batch_true`` and
      ``batch_false`` use the same layout but only contain the odd-numbered
      turns (``max_dialogue_size // 2`` entries).
    """
    # Checked before len() so that None yields the sentinel triple instead of
    # a TypeError.
    if dialogs is None:
        return None, None, None
    batch_size = len(dialogs)
    max_border = self.get_max(iters)
    if batch_size == 0:
        return None, None, None

    sent_len = self.max_sentence_size
    n_turns = self.max_dialogue_size

    def pad_and_reverse(tokens):
        # Pad on the right up to sent_len, then flip the sentence.
        padding = [data_utils.PAD_ID] * (sent_len - len(tokens))
        return list(reversed(tokens + padding))

    # Limit every dialogue to the allowed number of turns (mutates dialogs).
    for i in range(batch_size):
        border = min(len(dialogs[i]), max_border * 2)
        dialogs[i] = dialogs[i][:border]

    history_inputs, true_inputs, false_inputs = [], [], []
    for i in range(batch_size):
        one_session = dialogs[i]
        history_rows, true_rows, false_rows = [], [], []
        for j in range(n_turns):
            has_turn = j < len(one_session)
            if has_turn:
                history_rows.append(pad_and_reverse(one_session[j][0]))
            else:
                history_rows.append([data_utils.PAD_ID] * sent_len)
            if j % 2 == 0:
                # Only odd-numbered turns carry candidates.
                continue
            if has_turn:
                true_rows.append(pad_and_reverse(one_session[j][0]))
                false_rows.append(pad_and_reverse(one_session[j][1]))
            else:
                # NOTE(review): missing "true" turns are filled with EOS_ID
                # while "false" ones use PAD_ID; kept as found - confirm
                # this asymmetry is intentional.
                true_rows.append([data_utils.EOS_ID] * sent_len)
                false_rows.append([data_utils.PAD_ID] * sent_len)
        history_inputs.append(history_rows)
        true_inputs.append(true_rows)
        false_inputs.append(false_rows)

    # Re-index from [dialogue][turn][token] to [turn][token] -> array over
    # dialogues.
    n_dialogs = len(history_inputs)
    batch_history, batch_true, batch_false = [], [], []
    for sent_index in range(n_turns):
        batch_history.append([
            np.array([history_inputs[b][sent_index][t]
                      for b in range(n_dialogs)])
            for t in range(sent_len)
        ])
        if sent_index % 2 != 0:
            cand_index = sent_index // 2
            batch_true.append([
                np.array([true_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
            batch_false.append([
                np.array([false_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
    return batch_history, batch_true, batch_false
def train2vec(self, dialogs, iters):
    """Turn a batch of training dialogues into padded, re-indexed arrays.

    Every dialogue is first truncated IN PLACE (``dialogs[i]`` is replaced by
    a slice) to at most ``2 * self.get_max(iters)`` turns. Element 0 of each
    turn is read as a token list; odd-numbered turns also supply a second
    token list at element 1, used here as the "false" candidate. Token lists
    are padded with PAD_ID up to ``self.max_sentence_size`` and reversed.
    Turns beyond the end of a dialogue are filled with constant rows.

    Args:
      dialogs: list of dialogues (each a list of turns), or None.
      iters: forwarded unchanged to ``self.get_max``.

    Returns:
      (batch_history, batch_true, batch_false), or (None, None, None) when
      ``dialogs`` is None or empty. ``batch_history[s][t]`` is a numpy array
      holding token ``t`` of turn ``s`` for every dialogue. ``batch_true`` and
      ``batch_false`` use the same layout but only contain the odd-numbered
      turns (``max_dialogue_size // 2`` entries).
    """
    # Checked before len() so that None yields the sentinel triple instead of
    # a TypeError.
    if dialogs is None:
        return None, None, None
    batch_size = len(dialogs)
    max_border = self.get_max(iters)
    if batch_size == 0:
        return None, None, None

    sent_len = self.max_sentence_size
    n_turns = self.max_dialogue_size

    def pad_and_reverse(tokens):
        # Pad on the right up to sent_len, then flip the sentence.
        padding = [data_utils.PAD_ID] * (sent_len - len(tokens))
        return list(reversed(tokens + padding))

    # Limit every dialogue to the allowed number of turns (mutates dialogs).
    for i in range(batch_size):
        border = min(len(dialogs[i]), max_border * 2)
        dialogs[i] = dialogs[i][:border]

    history_inputs, true_inputs, false_inputs = [], [], []
    for i in range(batch_size):
        one_session = dialogs[i]
        history_rows, true_rows, false_rows = [], [], []
        for j in range(n_turns):
            has_turn = j < len(one_session)
            if has_turn:
                history_rows.append(pad_and_reverse(one_session[j][0]))
            else:
                history_rows.append([data_utils.PAD_ID] * sent_len)
            if j % 2 == 0:
                # Only odd-numbered turns carry candidates.
                continue
            if has_turn:
                true_rows.append(pad_and_reverse(one_session[j][0]))
                false_rows.append(pad_and_reverse(one_session[j][1]))
            else:
                # NOTE(review): missing "true" turns are filled with EOS_ID
                # while "false" ones use PAD_ID; kept as found - confirm
                # this asymmetry is intentional.
                true_rows.append([data_utils.EOS_ID] * sent_len)
                false_rows.append([data_utils.PAD_ID] * sent_len)
        history_inputs.append(history_rows)
        true_inputs.append(true_rows)
        false_inputs.append(false_rows)

    # Re-index from [dialogue][turn][token] to [turn][token] -> array over
    # dialogues.
    n_dialogs = len(history_inputs)
    batch_history, batch_true, batch_false = [], [], []
    for sent_index in range(n_turns):
        batch_history.append([
            np.array([history_inputs[b][sent_index][t]
                      for b in range(n_dialogs)])
            for t in range(sent_len)
        ])
        if sent_index % 2 != 0:
            cand_index = sent_index // 2
            batch_true.append([
                np.array([true_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
            batch_false.append([
                np.array([false_inputs[b][cand_index][t]
                          for b in range(n_dialogs)])
                for t in range(sent_len)
            ])
    return batch_history, batch_true, batch_false
def test2vec(self,history):
#?????????????
#???????????
history_inputs =[]
candidate_inputs =[]
if (history ==None) or len(history)==0 : #??????
return None,None
candidate_size = len(history[1])
cache = []
for j in range(self.max_dialogue_size): #????????????????
if j< len(history):
encoder_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[j][0])) #0??????
cache.append(list(reversed(history[j][0]+encoder_pad))) #????
else:
cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
history_inputs = cache
true_cache =[]
for i in range(self.max_dialogue_size): #candidate part
if i %2==0: #?0,2,4,..??????
continue
if i<len(history): #????????
for j in range(candidate_size):
true_pad = [data_utils.PAD_ID]*(self.max_sentence_size-len(history[i][j]))
true_cache.append(list(reversed(history[i][j] + true_pad)))# true candidate
else:
for j in range(candidate_size):
true_cache.append(list([data_utils.PAD_ID]*self.max_sentence_size))
candidate_inputs.append(true_cache)
true_cache =[]
######################################################
batch_history, batch_candidate = [], []
for sent_index in range(self.max_dialogue_size):
history_cache = []
for length_index in range(self.max_sentence_size):
history_cache.append(np.array( [history_inputs[sent_index][length_index]]))
batch_history.append(history_cache)
if sent_index % 2 != 0:
candidate_cache = []
for length_index in range(self.max_sentence_size):
candidate_cache.append(np.array([candidate_inputs[int(sent_index/2)][batch_index][length_index] for batch_index in range(candidate_size)]))
batch_candidate.append(candidate_cache)
return batch_history, batch_candidate
############################################################################
seq2seq_model.py 文件源码
项目:Seq2Seq-Tensorflow-1.0-Chatbot
作者: igorvishnevskiy
项目源码
文件源码
阅读 32
收藏 0
点赞 0
评论 0
def get_batch(self, data, bucket_id):
    """Sample a random batch from one bucket and lay it out per position.

    ``self.batch_size`` pairs are drawn (with replacement) from
    ``data[bucket_id]``. Encoder tokens are padded to the bucket size and
    reversed; decoder tokens are prefixed with GO and padded. The rows are
    then re-indexed so that entry ``t`` of each returned list holds position
    ``t`` of every batch member.

    Args:
      data: indexable by bucket id; each entry is a sequence of
        (encoder token list, decoder token list) pairs.
      bucket_id: integer, index into ``self.buckets`` and ``data``.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has ``self.batch_size`` entries (int32 for the inputs, float32 for the
      weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    for _ in range(self.batch_size):
        source, target = random.choice(data[bucket_id])
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def read_mrs_data(buckets, source_paths, target_paths, max_size=None,
                  any_length=False, offset_target=-1):
    """Read parallel id files and distribute the examples over buckets.

    Every path is loaded with ``data_utils.read_ids_file(path, max_size)``
    (presumably one id sequence per example - verify against data_utils).
    The files are assumed to be aligned; the number of examples is taken
    from the first source file.

    For each example the first target sequence gets EOS_ID appended and every
    further target sequence gets PAD_ID appended. When ``offset_target`` is
    positive, the target sequence at that index also gets PAD_ID prepended.
    The example goes into the first bucket whose sizes are strictly larger
    than the lengths of its first source and first target sequence.

    Args:
      buckets: sequence of (source_size, target_size) pairs.
      source_paths: paths of the source id files.
      target_paths: paths of the target id files.
      max_size: passed through to ``data_utils.read_ids_file``.
      any_length: if true, an example fitting no bucket is cropped to the
        sizes of the last bucket and stored there; otherwise it is dropped.
      offset_target: index of the target sequence to shift by one PAD_ID;
        values <= 0 disable the shift.

    Returns:
      (data_set, data_list). ``data_set[b]`` lists ``[source_ids, target_ids]``
      for bucket ``b``; ``data_list`` lists
      ``[source_ids, target_ids, bucket_id]`` in reading order.
    """
    # Read in all files separately.
    source_inputs = [data_utils.read_ids_file(path, max_size)
                     for path in source_paths]
    target_inputs = [data_utils.read_ids_file(path, max_size)
                     for path in target_paths]

    data_set = [[] for _ in buckets]
    data_list = []

    for i in range(len(source_inputs[0])):
        source_ids = [sequences[i] for sequences in source_inputs]

        # Only the first target type predicts EOS. Pointer ranges are not
        # checked here.
        target_ids = [target_inputs[0][i] + [data_utils.EOS_ID]]
        for position, sequences in enumerate(target_inputs[1:], 1):
            ids = sequences[i] + [data_utils.PAD_ID]
            if offset_target > 0 and position == offset_target:
                ids = [data_utils.PAD_ID] + ids
            target_ids.append(ids)

        for bucket_id, (source_size, target_size) in enumerate(buckets):
            if (len(source_ids[0]) < source_size
                    and len(target_ids[0]) < target_size):
                data_set[bucket_id].append([source_ids, target_ids])
                data_list.append([source_ids, target_ids, bucket_id])
                break
        else:
            # No bucket was large enough.
            if any_length:
                # Crop to the last bucket's sizes and store it there.
                source_size, target_size = buckets[-1][0], buckets[-1][1]
                if len(source_ids[0]) >= source_size:
                    source_ids = [ids[:source_size] for ids in source_ids]
                if len(target_ids[0]) >= target_size:
                    target_ids = [ids[:target_size] for ids in target_ids]
                bucket_id = len(buckets) - 1
                data_set[bucket_id].append([source_ids, target_ids])
                data_list.append([source_ids, target_ids, bucket_id])
    return data_set, data_list
def get_batch(self, data, bucket_id):
    """Sample a random batch from one bucket and lay it out per position.

    ``self.batch_size`` pairs are drawn (with replacement) from
    ``data[bucket_id]``. Encoder tokens are padded to the bucket size and
    reversed; decoder tokens are prefixed with GO and padded. The rows are
    then re-indexed so that entry ``t`` of each returned list holds position
    ``t`` of every batch member.

    Args:
      data: indexable by bucket id; each entry is a sequence of
        (encoder token list, decoder token list) pairs.
      bucket_id: integer, index into ``self.buckets`` and ``data``.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has ``self.batch_size`` entries (int32 for the inputs, float32 for the
      weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    for _ in range(self.batch_size):
        source, target = random.choice(data[bucket_id])
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: enc_rows is laid out [sample][position], the result is laid
    # out [position][sample].
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        # One weight array per decoder position.
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
gst_rnn_model.py 文件源码
项目:Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow
作者: liuyuemaicha
项目源码
文件源码
阅读 15
收藏 0
点赞 0
评论 0
def get_batch(self, train_data, bucket_id):
    """Sample a random batch and also return the untouched sampled pairs.

    ``self.batch_size`` pairs are drawn (with replacement) from
    ``train_data[bucket_id]``. Encoder tokens are padded to the bucket size
    and reversed; decoder tokens are prefixed with GO and padded. The rows
    are then re-indexed so that entry ``t`` of each returned list holds
    position ``t`` of every batch member.

    Args:
      train_data: indexable by bucket id; each entry is a sequence of
        (encoder token list, decoder token list) pairs.
      bucket_id: integer, index into ``self.buckets`` and ``train_data``.

    Returns:
      (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
      batch_source_encoder, batch_source_decoder). The first three are lists
      with one numpy array per sequence position (int32 inputs, float32
      weights). The last two are the sampled encoder and decoder token lists
      exactly as drawn, in sampling order.
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    batch_source_encoder, batch_source_decoder = [], []
    for _ in range(self.batch_size):
        source, target = random.choice(train_data[bucket_id])
        batch_source_encoder.append(source)
        batch_source_decoder.append(target)
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_source_encoder, batch_source_decoder)
grl_rnn_model.py 文件源码
项目:Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow
作者: liuyuemaicha
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def get_batch(self, train_data, bucket_id, type=0):
    """Build a batch from one bucket; ``type`` selects how pairs are picked.

    Encoder tokens are padded to the bucket size and reversed; decoder tokens
    are prefixed with GO and padded. The rows are then re-indexed so that
    entry ``t`` of each returned list holds position ``t`` of every batch
    member.

    Args:
      train_data: indexable by bucket id; the expected layout of
        ``train_data[bucket_id]`` depends on ``type``.
      bucket_id: integer, index into ``self.buckets`` and ``train_data``.
      type: selection mode.
        0 - draw a random pair for every batch slot and print it;
        1 - use pair number ``batch_i`` for batch slot ``batch_i``;
        2 - the first entry is (list of encoder inputs, decoder input);
            encoder input number ``batch_i`` is combined with that one
            shared decoder input.

    Returns:
      (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
      batch_source_encoder, batch_source_decoder). The first three are lists
      with one numpy array per sequence position (int32 inputs, float32
      weights). The last two are the selected encoder and decoder token lists
      before padding, in batch order.
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    batch_source_encoder, batch_source_decoder = [], []
    for batch_i in range(self.batch_size):
        if type == 1:
            source, target = train_data[bucket_id][batch_i]
        elif type == 2:
            sources, target = train_data[bucket_id][0]
            source = sources[batch_i]
        elif type == 0:
            source, target = random.choice(train_data[bucket_id])
            print("train en: %s, de: %s" % (source, target))

        batch_source_encoder.append(source)
        batch_source_decoder.append(target)
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_source_encoder, batch_source_decoder)
def get_batch(self, data, bucket_id):
    """Sample a random batch from one bucket and lay it out per position.

    ``self.batch_size`` pairs are drawn (with replacement) from
    ``data[bucket_id]``. Encoder tokens are padded to the bucket size and
    reversed; decoder tokens are prefixed with GO and padded. The rows are
    then re-indexed so that entry ``t`` of each returned list holds position
    ``t`` of every batch member.

    Args:
      data: indexable by bucket id; each entry is a sequence of
        (encoder token list, decoder token list) pairs.
      bucket_id: integer, index into ``self.buckets`` and ``data``.

    Returns:
      The triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights).
      Each is a list with one numpy array per sequence position; every array
      has ``self.batch_size`` entries (int32 for the inputs, float32 for the
      weights).
    """
    enc_len, dec_len = self.buckets[bucket_id]

    enc_rows, dec_rows = [], []
    for _ in range(self.batch_size):
        source, target = random.choice(data[bucket_id])
        # Pad the source up to the bucket length, then flip it.
        padded_source = source + [data_utils.PAD_ID] * (enc_len - len(source))
        enc_rows.append(padded_source[::-1])
        # GO + target, followed by PAD up to the bucket length.
        filler = [data_utils.PAD_ID] * (dec_len - len(target) - 1)
        dec_rows.append([data_utils.GO_ID] + target + filler)

    # Re-index: one array per position instead of one list per sample.
    batch_encoder_inputs = [
        np.array([row[pos] for row in enc_rows], dtype=np.int32)
        for pos in range(enc_len)
    ]

    batch_decoder_inputs, batch_weights = [], []
    final_pos = dec_len - 1
    for pos in range(dec_len):
        batch_decoder_inputs.append(
            np.array([row[pos] for row in dec_rows], dtype=np.int32))
        # The target of a step is the decoder input one step ahead; its
        # weight is zeroed when it is PAD or when there is no next step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row_idx, row in enumerate(dec_rows):
            if pos == final_pos or row[pos + 1] == data_utils.PAD_ID:
                weights[row_idx] = 0.0
        batch_weights.append(weights)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, data, bucket_id):
    """Get a random batch of data from the specified bucket, prepare for step.

    To feed data in step(..) it must be a list of batch-major vectors, while
    data here contains single length-major cases. So the main logic of this
    function is to re-index data cases to be in the proper format for feeding.

    NOTE(review): a second definition of get_batch with the same signature
    appears later in this file; when both live in the same scope the later
    one is the one that stays bound.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of pairs of input and output data that we use to create a batch.
      bucket_id: integer, which bucket to get the batch for.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights) for
      the constructed batch that has the proper format to call step(...) later.
      encoder_inputs and decoder_inputs are lists (one entry per time step) of
      np.int32 vectors of length self.batch_size; target_weights is a list
      (one entry per decoder time step) of np.float32 vectors of the same
      length.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    batch_size = self.batch_size
    pad_id = data_utils.PAD_ID
    bucket = data[bucket_id]

    # Sample with replacement, one random.choice call per batch slot, so a
    # seeded RNG yields the same sequence of examples as before.
    encoder_rows, decoder_rows = [], []
    for _ in range(batch_size):
        encoder_input, decoder_input = random.choice(bucket)

        # Encoder inputs are padded and then reversed.
        encoder_pad = [pad_id] * (encoder_size - len(encoder_input))
        reversed_input = list(reversed(encoder_input + encoder_pad))
        # Only the first encoder_size positions were ever read by the
        # re-indexing step, so trimming here keeps every row rectangular
        # without changing the result for over-long inputs.
        encoder_rows.append(reversed_input[:encoder_size])

        # Decoder inputs get an extra "GO" symbol, and are padded then.
        decoder_pad_size = decoder_size - len(decoder_input) - 1
        decoder_row = ([data_utils.GO_ID] + decoder_input +
                       [pad_id] * decoder_pad_size)
        decoder_rows.append(decoder_row[:decoder_size])

    # Example-major matrices of shape (batch_size, length). The explicit
    # reshape keeps the shape well defined even when batch_size is 0.
    encoder_matrix = np.array(encoder_rows, dtype=np.int32).reshape(
        batch_size, encoder_size)
    decoder_matrix = np.array(decoder_rows, dtype=np.int32).reshape(
        batch_size, decoder_size)

    # The target for step t is the decoder input at step t + 1. A weight is 1
    # only when that target exists and is not PAD; the final step has no
    # target, so its weight column stays 0.
    weight_matrix = np.zeros((batch_size, decoder_size), dtype=np.float32)
    weight_matrix[:, :-1] = decoder_matrix[:, 1:] != pad_id

    # Re-index to batch-major: one vector of length batch_size per time step.
    batch_encoder_inputs = list(np.ascontiguousarray(encoder_matrix.T))
    batch_decoder_inputs = list(np.ascontiguousarray(decoder_matrix.T))
    batch_weights = list(np.ascontiguousarray(weight_matrix.T))
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, data, bucket_id):
    """Get a random batch of data from the specified bucket, prepare for step.

    To feed data in step(..) it must be a list of batch-major vectors, while
    data here contains single length-major cases. So the main logic of this
    function is to re-index data cases to be in the proper format for feeding.

    NOTE(review): an earlier definition of get_batch with the same signature
    precedes this one in the file; when both live in the same scope this
    later definition is the one that stays bound.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of pairs of input and output data that we use to create a batch.
      bucket_id: integer, which bucket to get the batch for.

    Returns:
      The triple (encoder_inputs, decoder_inputs, target_weights) for
      the constructed batch that has the proper format to call step(...) later.
      encoder_inputs and decoder_inputs are lists (one entry per time step) of
      np.int32 vectors of length self.batch_size; target_weights is a list
      (one entry per decoder time step) of np.float32 vectors of the same
      length.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    batch_size = self.batch_size
    pad_id = data_utils.PAD_ID
    bucket = data[bucket_id]

    # Sample with replacement, one random.choice call per batch slot, so a
    # seeded RNG yields the same sequence of examples as before.
    encoder_rows, decoder_rows = [], []
    for _ in range(batch_size):
        encoder_input, decoder_input = random.choice(bucket)

        # Encoder inputs are padded and then reversed.
        encoder_pad = [pad_id] * (encoder_size - len(encoder_input))
        reversed_input = list(reversed(encoder_input + encoder_pad))
        # Only the first encoder_size positions were ever read by the
        # re-indexing step, so trimming here keeps every row rectangular
        # without changing the result for over-long inputs.
        encoder_rows.append(reversed_input[:encoder_size])

        # Decoder inputs get an extra "GO" symbol, and are padded then.
        decoder_pad_size = decoder_size - len(decoder_input) - 1
        decoder_row = ([data_utils.GO_ID] + decoder_input +
                       [pad_id] * decoder_pad_size)
        decoder_rows.append(decoder_row[:decoder_size])

    # Example-major matrices of shape (batch_size, length). The explicit
    # reshape keeps the shape well defined even when batch_size is 0.
    encoder_matrix = np.array(encoder_rows, dtype=np.int32).reshape(
        batch_size, encoder_size)
    decoder_matrix = np.array(decoder_rows, dtype=np.int32).reshape(
        batch_size, decoder_size)

    # The target for step t is the decoder input at step t + 1. A weight is 1
    # only when that target exists and is not PAD; the final step has no
    # target, so its weight column stays 0.
    weight_matrix = np.zeros((batch_size, decoder_size), dtype=np.float32)
    weight_matrix[:, :-1] = decoder_matrix[:, 1:] != pad_id

    # Re-index to batch-major: one vector of length batch_size per time step.
    batch_encoder_inputs = list(np.ascontiguousarray(encoder_matrix.T))
    batch_decoder_inputs = list(np.ascontiguousarray(decoder_matrix.T))
    batch_weights = list(np.ascontiguousarray(weight_matrix.T))
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights