def set_parameters(self, params):
    # Keep only trainable parameters; `optim` refers to torch.optim,
    # imported at module level.
    self.params = [p for p in params if p.requires_grad]
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.lr)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.lr)
        # Initialise Adagrad's per-parameter accumulator ('sum') to the
        # configured starting value rather than the default of zero.
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.lr)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.lr,
                                    betas=self.betas, eps=1e-9)
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
# We use the default parameters for Adam that are suggested by
# the original paper https://arxiv.org/pdf/1412.6980.pdf
# These values are also used by other established implementations,
# e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
# and https://keras.io/optimizers/
# More recently, slightly different values were used in the paper
# "Attention is all you need" (https://arxiv.org/pdf/1706.03762.pdf),
# in particular beta2=0.98. However, beta2=0.999 is still arguably the
# more established value, so we use that here as well.
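For reference, here is a minimal sketch (not from the original source) of the two
beta settings discussed in the comment above, passed directly to torch.optim.Adam;
the parameter list and learning rate are placeholders chosen only for this example.

import torch
from torch import optim

# Throwaway parameter list, only so the optimizers can be constructed.
params = [torch.nn.Parameter(torch.zeros(8))]

# Defaults suggested by Kingma & Ba (2014); these are also PyTorch's defaults.
adam_default = optim.Adam(params, lr=1e-3, betas=(0.9, 0.999), eps=1e-9)

# Variant from "Attention is all you need" (beta2=0.98), shown for comparison.
adam_attention = optim.Adam(params, lr=1e-3, betas=(0.9, 0.98), eps=1e-9)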