def __init__(self,
action_space,
observation_space,
batch_size=128,
learning_rate=1e-3,
discount=1.0,
epsilon=0.05):
if not isinstance(action_space, spaces.Discrete):
raise TypeError("Action space type should be Discrete.")
self._action_space = action_space
self._batch_size = batch_size
self._discount = discount
self._epsilon = epsilon
self._q_network = FCNet(
input_size=reduce(lambda x, y: x * y, observation_space.shape),
output_size=action_space.n)
self._optimizer = optim.RMSprop(
self._q_network.parameters(), lr=learning_rate)
self._memory = ReplayMemory(100000)
python类optim()的实例源码
def train_epoch(self, X, y, show_bar=True):
optimizer = optim.Adam(self.parameters())
if show_bar:
bar = Progbar(len(X))
for ix, (elem, tags) in enumerate(zip(X, y)):
self.zero_grad()
sentence, feature_vector, sentence_markers = self.get_sentence_feature_vector(elem)
if self.GPU:
targets = torch.LongTensor(tags).cuda()
else:
targets = torch.LongTensor(tags)
neg_log_likelihood = self.neg_log_likelihood(sentence, feature_vector, targets)
neg_log_likelihood.backward()
optimizer.step()
if show_bar:
bar.update(ix + 1)
if show_bar:
print ''
sys.stdout.flush()
def test_sgd(self):
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3),
wrap_old_fn(old_optim.sgd, learningRate=1e-3)
)
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
dampening=0, weight_decay=1e-4),
wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
dampening=0, weightDecay=1e-4)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD(
self._build_params_dict_single(weight, bias, lr=1e-2),
lr=1e-3)
)
def test_adam(self):
self._test_rosenbrock(
lambda params: optim.Adam(params, lr=1e-2),
wrap_old_fn(old_optim.adam, learningRate=1e-2)
)
self._test_rosenbrock(
lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adam([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.Adam(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
def test_adadelta(self):
self._test_rosenbrock(
lambda params: optim.Adadelta(params),
wrap_old_fn(old_optim.adadelta)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, rho=0.95),
wrap_old_fn(old_optim.adadelta, rho=0.95)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, weight_decay=1e-2),
wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta([weight, bias])
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta(
self._build_params_dict(weight, bias, rho=0.95))
)
def test_adagrad(self):
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1)
)
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)
)
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-1)
)
def test_rmsprop(self):
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad(
self._build_params_dict(weight, bias, lr=1e-3),
lr=1e-2)
)
def test_rprop(self):
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3),
wrap_old_fn(old_optim.rprop, stepsize=1e-3)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
def fit(self, observations, labels):
def closure():
predicted = self.predict(observations)
loss = self.loss_fn(predicted, labels)
self.optimizer.zero_grad()
loss.backward()
return loss
old_params = parameters_to_vector(self.model.parameters())
for lr in self.lr * .5**np.arange(10):
self.optimizer = optim.LBFGS(self.model.parameters(), lr=lr)
self.optimizer.step(closure)
current_params = parameters_to_vector(self.model.parameters())
if any(np.isnan(current_params.data.cpu().numpy())):
print("LBFGS optimization diverged. Rolling back update...")
vector_to_parameters(old_params, self.model.parameters())
else:
return
def init_optimizer(self, state_dict=None):
"""Initialize an optimizer for the free parameters of the network.
Args:
state_dict: network parameters
"""
if self.args.fix_embeddings:
for p in self.network.embedding.parameters():
p.requires_grad = False
parameters = [p for p in self.network.parameters() if p.requires_grad]
if self.args.optimizer == 'sgd':
self.optimizer = optim.SGD(parameters, self.args.learning_rate,
momentum=self.args.momentum,
weight_decay=self.args.weight_decay)
elif self.args.optimizer == 'adamax':
self.optimizer = optim.Adamax(parameters,
weight_decay=self.args.weight_decay)
else:
raise RuntimeError('Unsupported optimizer: %s' %
self.args.optimizer)
# --------------------------------------------------------------------------
# Learning
# --------------------------------------------------------------------------
def __init__(self, model, **kwargs):
super(Seq2SeqTrainerPyTorch, self).__init__()
self.steps = 0
self.gpu = bool(kwargs.get('gpu', True))
optim = kwargs.get('optim', 'adam')
eta = float(kwargs.get('eta', 0.01))
mom = float(kwargs.get('mom', 0.9))
self.clip = float(kwargs.get('clip', 5))
if optim == 'adadelta':
self.optimizer = torch.optim.Adadelta(model.parameters(), lr=eta)
elif optim == 'adam':
self.optimizer = torch.optim.Adam(model.parameters(), lr=eta)
elif optim == 'rmsprop':
self.optimizer = torch.optim.RMSprop(model.parameters(), lr=eta)
else:
self.optimizer = torch.optim.SGD(model.parameters(), lr=eta, momentum=mom)
self.model = model
self._input = model.make_input
self.crit = model.create_loss()
if self.gpu:
self.model = torch.nn.DataParallel(model).cuda()
self.crit.cuda()
def train(rank, args, model):
torch.manual_seed(args.seed + rank)
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1)
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
for epoch in range(1, args.epochs + 1):
train_epoch(epoch, args, model, train_loader, optimizer)
test_epoch(model, test_loader)
def update_lr(self,max_j):
for param_group in self.optim.param_groups:
param_group['lr'] = (0.5 * self.lr) * (1 + np.cos(np.pi * self.j / max_j))
# Optionally anneal the width settings throughout training.
# self.min_width = 0.25 + 0.25 * min(self.j / (max_j * 0.5), 1.0)
# self.max_width = 0.50 + 0.50 * min(self.j / (max_j * 0.5), 1.0)
# self.max_paths = [min(float(self.j) / (max_j * 0.5), 1.0)] * 3
# self.min_budget = 0.25 + 0.25 * min(self.j / (max_j * 0.5), 1.0)
self.max_budget = 0.50 + 0.50 * min(self.j / (max_j * 0.5), 1.0)
# Anneal kernel sizes towards max kernel size
self.max_kernel = 3 + int(((self.final_max_kernel - 3)//2) * min(self.j / (max_j * 0.5), 1.0) * 2)
self.j += 1
def test_sgd(self):
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3),
wrap_old_fn(old_optim.sgd, learningRate=1e-3)
)
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
dampening=0, weight_decay=1e-4),
wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
dampening=0, weightDecay=1e-4)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD(
self._build_params_dict_single(weight, bias, lr=1e-2),
lr=1e-3)
)
def test_adadelta(self):
self._test_rosenbrock(
lambda params: optim.Adadelta(params),
wrap_old_fn(old_optim.adadelta)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, rho=0.95),
wrap_old_fn(old_optim.adadelta, rho=0.95)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, weight_decay=1e-2),
wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta([weight, bias])
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta(
self._build_params_dict(weight, bias, rho=0.95))
)
def test_adamax(self):
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1),
wrap_old_fn(old_optim.adamax, learningRate=1e-1)
)
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
)
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-1)
)
def test_rmsprop(self):
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad(
self._build_params_dict(weight, bias, lr=1e-3),
lr=1e-2)
)
def test_rprop(self):
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3),
wrap_old_fn(old_optim.rprop, stepsize=1e-3)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
def __init__(self):
super(Generator, self).__init__()
self.main = nn.Sequential(
nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
nn.BatchNorm2d(ngf * 8),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 4),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 2),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 2, ngf * 1, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 1),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 1, nc, 4, 2, 1, bias=False),
nn.Tanh()
)
self.apply(weights_init)
self.optimizer = optim.Adam(self.parameters(), lr=learning_rate, betas=(beta_1, beta_2))
#self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate, alpha=beta_2)
def __init__(self):
super(Discriminator, self).__init__()
self.main = nn.Sequential(
nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 2),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 4),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 8),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
nn.Sigmoid()
)
self.apply(weights_init)
self.optimizer = optim.Adam(self.parameters(), lr=learning_rate, betas=(beta_1, beta_2))
#self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate, alpha=beta_2)