from typing import List, Optional, Union

import torch
from torch.optim import Optimizer


class Eve(Optimizer): |
    r"""
    Implements the Eve algorithm. This is a modified version of AdamW with a special
    way of setting the weight-decay / shrinkage-factor, which is designed to make the
    rms of the parameters approach a particular target_rms (default: 0.1). This is
    for use with networks with 'scaled' versions of modules (see scaling.py), which
    will be close to invariant to the absolute scale on the parameter matrix.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
    Eve is unpublished so far.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.98))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 1e-3).
            With this value the weights decay significantly after roughly 1k
            minibatches. The decay is not multiplied by the learning rate, and
            is applied only while the RMS value of the parameter is above
            target_rms.
        target_rms (float, optional): target root-mean-square value of the
            parameters; once a parameter's RMS falls below this value, weight
            decay is no longer applied to it (default: 0.1).

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.98),
        eps=1e-8,
        weight_decay=1e-3,
        target_rms=0.1,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0 <= weight_decay <= 0.1:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0 < target_rms <= 10.0:
            raise ValueError("Invalid target_rms value: {}".format(target_rms))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            target_rms=target_rms,
        )
        super(Eve, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Eve, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("Eve does not support sparse gradients")

                state = self.state[p]
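
                # Lazy state initialization: the first time we see this
                # parameter, create its step counter and the exponential
                # moving averages of the gradient and the squared gradient.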
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]
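
                # Standard Adam/AdamW update: advance the step count, form the
                # bias-correction terms, and update the first and second moment
                # estimates of the gradient.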
                state["step"] += 1
                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
                    group["eps"]
                )

                step_size = group["lr"] / bias_correction1
                target_rms = group["target_rms"]
                weight_decay = group["weight_decay"]
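
                # Apply weight decay only to non-scalar parameters, and only
                # while the parameter's RMS value (norm / sqrt(numel)) is above
                # target_rms, so the decay pulls parameters toward the target
                # scale rather than shrinking them indefinitely.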
                if p.numel() > 1:
                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
                    p.mul_(1 - (weight_decay * is_above_target_rms))
                p.addcdiv_(exp_avg, denom, value=-step_size)
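
                # Constrain scalar parameters (p.numel() == 1) to a bounded
                # range after the update.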
                if p.numel() == 1:
                    p.clamp_(min=-10, max=2)

        return loss


class LRScheduler(object):
    """
    Base-class for learning rate schedulers where the learning-rate depends on both the
    batch and the epoch.
    """

    def __init__(self, optimizer: Optimizer, verbose: bool = False):
        if not isinstance(optimizer, Optimizer):
            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
        self.optimizer = optimizer
        self.verbose = verbose

        for group in optimizer.param_groups:
            group.setdefault("initial_lr", group["lr"])

        self.base_lrs = [group["initial_lr"] for group in optimizer.param_groups]

        self.epoch = 0
        self.batch = 0

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which
        is not the optimizer.
        """
        return {
            "base_lrs": self.base_lrs,
            "epoch": self.epoch,
            "batch": self.batch,
        }

    def load_state_dict(self, state_dict):
        """Loads the scheduler's state.

        Args:
            state_dict (dict): scheduler state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_last_lr(self) -> List[float]:
        """Return last computed learning rate by current scheduler. Will be a list of float."""
        return self._last_lr

    def get_lr(self):
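        # Compute a list of learning rates, one per parameter group, from
        # self.epoch, self.batch and self.base_lrs.  Subclasses must override
        # this; see Eden.get_lr() below for an example.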
        raise NotImplementedError

    def step_batch(self, batch: Optional[int] = None) -> None:
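        # Advance the batch count by one, or set it to `batch` if that is
        # given (normally the batch index counted from the start of training),
        # then recompute the learning rates.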
        if batch is not None:
            self.batch = batch
        else:
            self.batch = self.batch + 1
        self._set_lrs()

    def step_epoch(self, epoch: Optional[int] = None):
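        # Advance the epoch count by one, or set it to `epoch` if that is
        # given, then recompute the learning rates.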
        if epoch is not None:
            self.epoch = epoch
        else:
            self.epoch = self.epoch + 1
        self._set_lrs()

    def _set_lrs(self):
        values = self.get_lr()
        assert len(values) == len(self.optimizer.param_groups)

        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
            param_group, lr = data
            param_group["lr"] = lr
            self.print_lr(self.verbose, i, lr)
        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]

    def print_lr(self, is_verbose, group, lr):
        """Display the current learning rate."""
        if is_verbose:
            print(
                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
                f" of group {group} to {lr:.4e}."
            )


class Eden(LRScheduler):
    """
    Eden scheduler.

    lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
                       ((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)

    E.g. suggest initial-lr = 0.003 (passed to the optimizer).

    Args:
        optimizer: the optimizer to change the learning rates on
        lr_batches: the number of batches after which we start significantly
            decreasing the learning rate, suggest 5000.
        lr_epochs: the number of epochs after which we start significantly
            decreasing the learning rate, suggest 6 if you plan to run e.g.
            20 to 40 epochs; a smaller value may be needed if the dataset is
            huge and you will run only a few epochs.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        lr_batches: Union[int, float],
        lr_epochs: Union[int, float],
        verbose: bool = False,
    ):
        super(Eden, self).__init__(optimizer, verbose)
        self.lr_batches = lr_batches
        self.lr_epochs = lr_epochs

    def get_lr(self):
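        # The factor is a product of two terms, one decaying with the batch
        # count and one with the epoch count.  Each term is 1.0 at the start,
        # about 0.84 (i.e. 2 ** -0.25) when the count reaches lr_batches /
        # lr_epochs, and behaves like count ** -0.5 far beyond that point.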
        factor = (
            (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
        ) ** -0.25 * (
            ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25
        )
        return [x * factor for x in self.base_lrs]


def _test_eden():
    m = torch.nn.Linear(100, 100)
    optim = Eve(m.parameters(), lr=0.003)

    scheduler = Eden(optim, lr_batches=30, lr_epochs=2, verbose=True)

    for epoch in range(10):
        scheduler.step_epoch(epoch)

        for step in range(20):
            x = torch.randn(200, 100).detach()
            x.requires_grad = True
            y = m(x)
            dy = torch.randn(200, 100).detach()
            f = (y * dy).sum()
            f.backward()

            optim.step()
            scheduler.step_batch()
            optim.zero_grad()

    print("last lr = ", scheduler.get_last_lr())
    print("state dict = ", scheduler.state_dict())


if __name__ == "__main__":
    _test_eden()
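    _test_eve()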
|
|