def _compute_y_and_t(self, exp_batch, gamma):
    batch_size = exp_batch['reward'].shape[0]

    # Compute Q-values for current states
    batch_state = exp_batch['state']
    qout = self.model(batch_state)

    batch_actions = exp_batch['action']
    batch_q = F.reshape(qout.evaluate_actions(
        batch_actions), (batch_size, 1))

    with chainer.no_backprop_mode():
        batch_q_target = F.reshape(
            self._compute_target_values(exp_batch, gamma),
            (batch_size, 1))

    return batch_q, batch_q_target
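# For context: the `_compute_target_values` helper called above is not shown
# in this snippet. A minimal sketch of what such a helper typically computes,
# assuming a standard one-step TD target with a target network -- the
# `self.target_model` attribute and the body below are assumptions, not the
# verbatim implementation:
def _compute_target_values(self, exp_batch, gamma):
    batch_next_state = exp_batch['next_state']
    with chainer.no_backprop_mode():
        target_qout = self.target_model(batch_next_state)
        next_q_max = target_qout.max  # max_a Q_target(s', a)
    batch_rewards = exp_batch['reward']
    batch_terminal = exp_batch['is_state_terminal']
    # r + gamma * max_a Q_target(s', a), zeroed at terminal states
    return batch_rewards + gamma * (1.0 - batch_terminal) * next_q_max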
def compute_policy_gradient_full_correction(
        action_distrib, action_distrib_mu, action_value, v,
        truncation_threshold):
    """Compute off-policy bias correction term wrt all actions."""
    assert truncation_threshold is not None
    assert np.isscalar(v)
    with chainer.no_backprop_mode():
        rho_all_inv = compute_full_importance(action_distrib_mu,
                                              action_distrib)
        correction_weight = (
            np.maximum(1 - truncation_threshold * rho_all_inv,
                       np.zeros_like(rho_all_inv)) *
            action_distrib.all_prob.data[0])
        correction_advantage = action_value.q_values.data[0] - v
    return -F.sum(correction_weight *
                  action_distrib.all_log_prob *
                  correction_advantage, axis=1)
def compute_policy_gradient_sample_correction(
        action_distrib, action_distrib_mu, action_value, v,
        truncation_threshold):
    """Compute off-policy bias correction term wrt a sampled action."""
    assert np.isscalar(v)
    assert truncation_threshold is not None
    with chainer.no_backprop_mode():
        sample_action = action_distrib.sample().data
        rho_dash_inv = compute_importance(
            action_distrib_mu, action_distrib, sample_action)
        if (truncation_threshold > 0 and
                rho_dash_inv >= 1 / truncation_threshold):
            return chainer.Variable(np.asarray([0], dtype=np.float32))
        correction_weight = max(0, 1 - truncation_threshold * rho_dash_inv)
        assert correction_weight <= 1
        q = float(action_value.evaluate_actions(sample_action).data[0])
        correction_advantage = q - v
    return -(correction_weight *
             action_distrib.log_prob(sample_action) *
             correction_advantage)
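# Both correction terms hinge on the truncated weight max(0, 1 - c / rho),
# where rho = pi(a|s) / mu(a|s) and c is the truncation threshold (the code
# above works with the inverse ratio rho_inv = mu / pi). A self-contained
# NumPy sketch of just that weighting, with made-up probabilities:
import numpy as np

def truncated_correction_weight(pi_prob, mu_prob, c):
    rho_inv = mu_prob / pi_prob  # inverse importance ratio, as above
    return max(0.0, 1.0 - c * rho_inv)

# The weight vanishes once rho = pi / mu <= c, and approaches 1 as the
# target policy puts far more mass on the action than the behavior policy.
print(truncated_correction_weight(pi_prob=0.9, mu_prob=0.1, c=10))   # 0.0
print(truncated_correction_weight(pi_prob=0.9, mu_prob=0.01, c=10))  # ~0.89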
def update_on_policy(self, statevar):
    assert self.t_start < self.t

    if not self.disable_online_update:
        if statevar is None:
            R = 0
        else:
            with chainer.no_backprop_mode():
                with state_kept(self.model):
                    action_distrib, action_value, v = self.model(statevar)
            R = float(v.data)
        self.update(
            t_start=self.t_start, t_stop=self.t, R=R,
            states=self.past_states,
            actions=self.past_actions,
            rewards=self.past_rewards,
            values=self.past_values,
            action_values=self.past_action_values,
            action_distribs=self.past_action_distrib,
            action_distribs_mu=None,
            avg_action_distribs=self.past_avg_action_distrib)

    self.init_history_data_for_online_update()
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data[0:100]), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t[0].tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = self.model.translate(sources, self.max_length)
            ys = [y.tolist() for y in ys]
            hypotheses.extend(ys)
        source, target = zip(*self.test_data[0:100])
        loss = self.model.CalculateValLoss(source, target)
    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    reporter.report({self.key[0]: bleu})
    reporter.report({self.key[1]: loss})
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)
    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    reporter.report({self.key: bleu})
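# The `bleu_score` module used by these trainer extensions is
# `nltk.translate.bleu_score`. A minimal standalone check of the same call,
# with made-up word tokens (the snippets pass lists of token ids instead):
from nltk.translate import bleu_score

# corpus_bleu expects a list of reference *lists* per hypothesis, which is
# why each target above is wrapped as [t.tolist()].
references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'a', 'mat']]
print(bleu_score.corpus_bleu(
    references, hypotheses,
    smoothing_function=bleu_score.SmoothingFunction().method1))  # in [0, 1]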
def __call__(self, trainer):
    print('## Calculate BLEU')
    with chainer.no_backprop_mode():
        with chainer.using_config('train', False):
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])
                sources = [
                    chainer.dataset.to_device(self.device, x)
                    for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)
    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1) * 100
    print('BLEU:', bleu)
    reporter.report({self.key: bleu})
def plot_scatter():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", "-m", type=str, default="model.hdf5")
    args = parser.parse_args()

    dataset_train, dataset_test = chainer.datasets.get_mnist()
    images_train, labels_train = dataset_train._datasets
    images_test, labels_test = dataset_test._datasets

    model = Model()
    assert model.load(args.model)

    # normalize to [-1, 1]
    images_train = (images_train - 0.5) * 2
    images_test = (images_test - 0.5) * 2

    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        z = model.encode_x_yz(images_test)[1].data
        plot.scatter_labeled_z(z, labels_test, "scatter_gen.png")
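# These plot helpers stack `no_backprop_mode` with `using_config`. Note that
# the two context managers must be joined with a comma: `with A and B:`
# evaluates the expression `A and B` and enters only the last truthy
# operand, so `no_backprop_mode` would silently be skipped. A quick
# demonstration with hypothetical stand-in managers:
from contextlib import contextmanager

@contextmanager
def ctx(name):
    print('enter', name)
    yield
    print('exit', name)

with ctx('A') and ctx('B'):  # only B is entered: prints "enter B" / "exit B"
    pass

with ctx('A'), ctx('B'):     # both entered, in order A then B
    pass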
def plot_representation():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", "-m", type=str, default="model.hdf5")
    args = parser.parse_args()

    dataset_train, dataset_test = chainer.datasets.get_mnist()
    images_train, labels_train = dataset_train._datasets
    images_test, labels_test = dataset_test._datasets

    model = Model()
    assert model.load(args.model)

    # normalize to [-1, 1]
    images_train = (images_train - 0.5) * 2
    images_test = (images_test - 0.5) * 2

    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        y_onehot, z = model.encode_x_yz(images_test, apply_softmax_y=True)
        representation = model.encode_yz_representation(y_onehot, z).data
        plot.scatter_labeled_z(representation, labels_test, "scatter_r.png")
def plot_z():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", "-m", type=str, default="model.hdf5")
    args = parser.parse_args()

    dataset_train, dataset_test = chainer.datasets.get_mnist()
    images_train, labels_train = dataset_train._datasets
    images_test, labels_test = dataset_test._datasets

    model = Model()
    assert model.load(args.model)

    # normalize to [-1, 1]
    images_train = (images_train - 0.5) * 2
    images_test = (images_test - 0.5) * 2

    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        z = model.encode_x_yz(images_test)[1].data
        plot.scatter_labeled_z(z, labels_test, "scatter_z.png")
def plot_scatter():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", "-m", type=str, default="model.hdf5")
    args = parser.parse_args()

    dataset_train, dataset_test = chainer.datasets.get_mnist()
    images_train, labels_train = dataset_train._datasets
    images_test, labels_test = dataset_test._datasets

    model = Model()
    assert model.load(args.model)

    # normalize to [-1, 1]
    images_train = (images_train - 0.5) * 2
    images_test = (images_test - 0.5) * 2

    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        z = model.encode_x_z(images_test).data
        plot.scatter_labeled_z(z, labels_test, "scatter_z.png")
def translate(self, sentence: np.ndarray, max_length: int = 30) -> List[int]:
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        sentence = sentence[::-1]

        embedded_xs = self._embed_input(sentence)
        hidden_states, cell_states, attentions = self._encoder(
            None, None, [embedded_xs])

        wid = EOS
        result = []
        for i in range(max_length):
            output, hidden_states, cell_states = self._translate_one_word(
                wid, hidden_states, cell_states, attentions)

            wid = np.argmax(output.data)
            if wid == EOS:
                break
            result.append(wid)

    return result
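# A hedged usage sketch of the greedy decoder above; the `model` instance
# and the token ids are made up for illustration:
import numpy as np

sentence = np.array([4, 17, 8, 2], dtype=np.int32)  # source token ids
token_ids = model.translate(sentence, max_length=30)
# Returns a plain list of token ids; decoding stops at EOS or after
# max_length steps, whichever comes first.
print(token_ids)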
def _forward(self, *args, calc_score=False):
    """Forward computation without backward.

    Predicts by returning the `predictor`'s output.
    """
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        if calc_score:
            self(*args)
            return self.y
        else:
            if self.predictor is None:
                print("[ERROR] predictor is not set or not built yet.")
                return
            # TODO: it passes all the args, sometimes (x, y), which is too
            # many arguments. Consider how to deal with the number of inputs.
            if hasattr(self.predictor, '_forward'):
                fn = self.predictor._forward
            else:
                fn = self.predictor
            return fn(*filter_args(fn, args))
def select_action(self, t, greedy_action_func, action_value=None):
    assert action_value is not None
    assert isinstance(action_value,
                      chainerrl.action_value.DiscreteActionValue)
    n_actions = action_value.q_values.shape[1]
    with chainer.no_backprop_mode():
        probs = chainer.cuda.to_cpu(
            F.softmax(action_value.q_values / self.T).data).ravel()
    return np.random.choice(np.arange(n_actions), p=probs)
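# This is Boltzmann (softmax) exploration: actions are sampled from a
# softmax over Q-values divided by a temperature T. A self-contained NumPy
# sketch of the same sampling rule (values hypothetical):
import numpy as np

def boltzmann_sample(q_values, T, rng=np.random):
    logits = np.asarray(q_values, dtype=np.float64) / T
    logits -= logits.max()          # numerically stable softmax
    probs = np.exp(logits)
    probs /= probs.sum()
    # Higher T flattens the distribution (more exploration); lower T
    # approaches greedy action selection.
    return rng.choice(len(probs), p=probs)

print(boltzmann_sample([1.0, 2.0, 0.5], T=1.0))  # index 1 is most likely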
def _compute_y_and_t(self, exp_batch, gamma):
    batch_state = exp_batch['state']
    batch_size = len(exp_batch['reward'])

    qout = self.q_function(batch_state)

    batch_actions = exp_batch['action']
    # Q(s_t,a_t)
    batch_q = F.reshape(qout.evaluate_actions(
        batch_actions), (batch_size, 1))

    with chainer.no_backprop_mode():
        # Compute target values
        target_qout = self.target_q_function(batch_state)

        # Q'(s_t,a_t)
        target_q = F.reshape(target_qout.evaluate_actions(
            batch_actions), (batch_size, 1))

        # LQ'(s_t,a)
        target_q_expect = F.reshape(
            self._l_operator(target_qout), (batch_size, 1))

        # r + g * LQ'(s_{t+1},a)
        batch_q_target = F.reshape(
            self._compute_target_values(exp_batch, gamma), (batch_size, 1))

        # Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a)
        t = target_q + batch_q_target - target_q_expect

    return batch_q, t
def act(self, obs):
    # Use the process-local model for acting
    with chainer.no_backprop_mode():
        statevar = self.batch_states([obs], np, self.phi)
        pout, _ = self.model.pi_and_v(statevar)
        if self.act_deterministically:
            return pout.most_probable.data[0]
        else:
            return pout.sample().data[0]
def act(self, obs):
    # Use the process-local model for acting
    with chainer.no_backprop_mode():
        statevar = self.batch_states([obs], self.xp, self.phi)
        action_distrib, _ = self.model(statevar)
        if self.act_deterministically:
            return chainer.cuda.to_cpu(
                action_distrib.most_probable.data)[0]
        else:
            return chainer.cuda.to_cpu(action_distrib.sample().data)[0]
def _compute_y_and_t(self, exp_batch, gamma):
    batch_state = exp_batch['state']
    batch_size = len(exp_batch['reward'])

    qout = self.q_function(batch_state)

    batch_actions = exp_batch['action']
    batch_q = qout.evaluate_actions(batch_actions)

    # Compute target values
    with chainer.no_backprop_mode():
        target_qout = self.target_q_function(batch_state)

        batch_next_state = exp_batch['next_state']

        with state_kept(self.target_q_function):
            target_next_qout = self.target_q_function(batch_next_state)
        next_q_max = F.reshape(target_next_qout.max, (batch_size,))

        batch_rewards = exp_batch['reward']
        batch_terminal = exp_batch['is_state_terminal']

        # T Q: Bellman operator
        t_q = batch_rewards + self.gamma * \
            (1.0 - batch_terminal) * next_q_max

        # T_PAL Q: persistent advantage learning operator
        cur_advantage = F.reshape(
            target_qout.compute_advantage(batch_actions), (batch_size,))
        next_advantage = F.reshape(
            target_next_qout.compute_advantage(batch_actions),
            (batch_size,))
        tpal_q = t_q + self.alpha * \
            F.maximum(cur_advantage, next_advantage)

    return batch_q, tpal_q
def _act(self, state):
    xp = self.xp
    with chainer.using_config('train', False):
        b_state = batch_states([state], xp, self.phi)
        with chainer.no_backprop_mode():
            action_distrib, v = self.model(b_state)
        action = action_distrib.sample()
        return cuda.to_cpu(action.data)[0], cuda.to_cpu(v.data)[0]
def update(self):
    xp = self.xp

    if self.standardize_advantages:
        all_advs = xp.array([b['adv'] for b in self.memory])
        mean_advs = xp.mean(all_advs)
        std_advs = xp.std(all_advs)

    target_model = copy.deepcopy(self.model)
    dataset_iter = chainer.iterators.SerialIterator(
        self.memory, self.minibatch_size)

    dataset_iter.reset()
    while dataset_iter.epoch < self.epochs:
        batch = dataset_iter.__next__()
        states = batch_states([b['state'] for b in batch], xp, self.phi)
        actions = xp.array([b['action'] for b in batch])
        distribs, vs_pred = self.model(states)
        with chainer.no_backprop_mode():
            target_distribs, _ = target_model(states)

        advs = xp.array([b['adv'] for b in batch], dtype=xp.float32)
        if self.standardize_advantages:
            advs = (advs - mean_advs) / std_advs

        self.optimizer.update(
            self._lossfun,
            distribs, vs_pred, distribs.log_prob(actions),
            vs_pred_old=xp.array(
                [b['v_pred'] for b in batch], dtype=xp.float32),
            target_log_probs=target_distribs.log_prob(actions),
            advs=advs,
            vs_teacher=xp.array(
                [b['v_teacher'] for b in batch], dtype=xp.float32),
        )
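# Advantage standardization above divides by the raw standard deviation,
# which can blow up when every advantage in memory is (near-)identical; a
# common defensive variant adds a small epsilon (a sketch, not the
# snippet's actual behavior):
import numpy as np

def standardize(advs, eps=1e-8):
    advs = np.asarray(advs, dtype=np.float32)
    # Zero mean, unit variance; eps guards against a zero std.
    return (advs - advs.mean()) / (advs.std() + eps)

print(standardize([1.0, 2.0, 3.0]))  # [-1.2247, 0.0, 1.2247]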
def act(self, state):
    with chainer.using_config('train', False):
        with chainer.no_backprop_mode():
            action_value = self.model(
                self.batch_states([state], self.xp, self.phi))
            q = float(action_value.max.data)
            action = cuda.to_cpu(action_value.greedy_actions.data)[0]

    # Update stats
    self.average_q *= self.average_q_decay
    self.average_q += (1 - self.average_q_decay) * q

    self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)
    return action
def act(self, obs):
    with chainer.no_backprop_mode():
        batch_obs = self.batch_states([obs], self.xp, self.phi)
        action_distrib = self.model(batch_obs)
        if self.act_deterministically:
            return chainer.cuda.to_cpu(
                action_distrib.most_probable.data)[0]
        else:
            return chainer.cuda.to_cpu(action_distrib.sample().data)[0]
def CalculateValLoss(self, xs, ys):
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        loss = self.CalcLoss(xs, ys)
    return loss.data