initial commit

46455cd over 1 year ago

11.9 kB

	# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
	#
	# See ../LICENSE for clarification regarding multiple authors
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	from typing import List, Optional, Union

	import torch
	from torch.optim import Optimizer


	class Eve(Optimizer):
	    r"""
	    Implements the Eve algorithm.

	    This is a modified version of AdamW with a special way of setting the
	    weight-decay / shrinkage-factor, which is designed to make the rms of the
	    parameters approach a particular target_rms (default: 0.1). This is for
	    use with networks with 'scaled' versions of modules (see scaling.py),
	    which will be close to invariant to the absolute scale on the parameter
	    matrix.

	    The original Adam algorithm was proposed in
	    `Adam: A Method for Stochastic Optimization`_.
	    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
	    Eve is unpublished so far.

	    Arguments:
	        params (iterable): iterable of parameters to optimize or dicts defining
	            parameter groups
	        lr (float, optional): learning rate (default: 1e-3)
	        betas (Tuple[float, float], optional): coefficients used for computing
	            running averages of gradient and its square (default: (0.9, 0.98))
	        eps (float, optional): term added to the denominator to improve
	            numerical stability (default: 1e-8)
	        weight_decay (float, optional): weight decay coefficient, must lie in
	            [0, 0.1] (default: 1e-3; with this value the weights would decay
	            significantly after about 1k minibatches). It is not multiplied
	            by the learning rate, but is only applied while the RMS value of
	            the parameter is > target_rms.
	        target_rms (float, optional): target root-mean-square value of
	            parameters, must lie in (0, 10]; if they fall below this we stop
	            applying weight decay (default: 0.1).

	    Parameters with a single element ("scaling factors") are never decayed;
	    instead they are clamped to the range [-10, 2] after every update.

	    Raises:
	        ValueError: if any hyper-parameter is outside its permitted range.

	    .. _Adam\: A Method for Stochastic Optimization:
	        https://arxiv.org/abs/1412.6980
	    .. _Decoupled Weight Decay Regularization:
	        https://arxiv.org/abs/1711.05101
	    .. _On the Convergence of Adam and Beyond:
	        https://openreview.net/forum?id=ryQu7f-RZ
	    """

	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.98),
	        eps=1e-8,
	        weight_decay=1e-3,
	        target_rms=0.1,
	    ):
	        # The `not a <= b` form (rather than `b < a`) also rejects NaN.
	        if not 0.0 <= lr:
	            raise ValueError("Invalid learning rate: {}".format(lr))
	        if not 0.0 <= eps:
	            raise ValueError("Invalid epsilon value: {}".format(eps))
	        if not 0.0 <= betas[0] < 1.0:
	            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
	        if not 0.0 <= betas[1] < 1.0:
	            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
	        if not 0 <= weight_decay <= 0.1:
	            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
	        if not 0 < target_rms <= 10.0:
	            raise ValueError("Invalid target_rms value: {}".format(target_rms))
	        defaults = dict(
	            lr=lr,
	            betas=betas,
	            eps=eps,
	            weight_decay=weight_decay,
	            target_rms=target_rms,
	        )
	        super(Eve, self).__init__(params, defaults)

	    def __setstate__(self, state):
	        super(Eve, self).__setstate__(state)

	    @torch.no_grad()
	    def step(self, closure=None):
	        """Performs a single optimization step.

	        Arguments:
	            closure (callable, optional): A closure that reevaluates the model
	                and returns the loss.

	        Returns:
	            The value returned by ``closure``, or None if no closure was given.

	        Raises:
	            RuntimeError: if any parameter has a sparse gradient.
	        """
	        loss = None
	        if closure is not None:
	            # step() itself runs under no_grad; the closure needs gradients.
	            with torch.enable_grad():
	                loss = closure()

	        for group in self.param_groups:
	            # Hyper-parameters are per group, so read them once per group
	            # instead of once per parameter.
	            beta1, beta2 = group["betas"]
	            eps = group["eps"]
	            lr = group["lr"]
	            target_rms = group["target_rms"]
	            weight_decay = group["weight_decay"]

	            for p in group["params"]:
	                if p.grad is None:
	                    continue

	                grad = p.grad
	                if grad.is_sparse:
	                    raise RuntimeError("Eve does not support sparse gradients")

	                state = self.state[p]

	                # Lazy state initialization on the first update of `p`.
	                if len(state) == 0:
	                    state["step"] = 0
	                    # Exponential moving average of gradient values
	                    state["exp_avg"] = torch.zeros_like(
	                        p, memory_format=torch.preserve_format
	                    )
	                    # Exponential moving average of squared gradient values
	                    state["exp_avg_sq"] = torch.zeros_like(
	                        p, memory_format=torch.preserve_format
	                    )

	                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

	                state["step"] += 1
	                bias_correction1 = 1 - beta1 ** state["step"]
	                bias_correction2 = 1 - beta2 ** state["step"]

	                # Decay the first and second moment running average coefficient
	                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
	                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
	                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(eps)

	                step_size = lr / bias_correction1

	                if p.numel() > 1:
	                    # avoid applying this weight-decay on "scaling factors"
	                    # (which are scalar).
	                    # norm > target_rms * sqrt(numel)  <=>  rms > target_rms.
	                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
	                    # The boolean tensor acts as a 0/1 switch on the shrinkage.
	                    p.mul_(1 - (weight_decay * is_above_target_rms))
	                p.addcdiv_(exp_avg, denom, value=-step_size)

	                # Constrain the range of scalar weights
	                if p.numel() == 1:
	                    p.clamp_(min=-10, max=2)

	        return loss


	class LRScheduler(object):
	    """
	    Base-class for learning rate schedulers where the learning-rate depends on both the
	    batch and the epoch.

	    Subclasses must override :meth:`get_lr`. The schedule is driven by calling
	    :meth:`step_batch` and :meth:`step_epoch`; each call recomputes the
	    learning rates and writes them into the optimizer's param groups.
	    """

	    def __init__(self, optimizer: Optimizer, verbose: bool = False):
	        """
	        Args:
	            optimizer: the optimizer whose param groups' "lr" entries are updated.
	            verbose: if True, print every learning-rate update.

	        Raises:
	            TypeError: if ``optimizer`` is not a torch ``Optimizer``.
	        """
	        # Attach optimizer
	        if not isinstance(optimizer, Optimizer):
	            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
	        self.optimizer = optimizer
	        self.verbose = verbose

	        # Remember the starting lr of each group; setdefault keeps a value
	        # that is already stored in the group.
	        for group in optimizer.param_groups:
	            group.setdefault("initial_lr", group["lr"])

	        self.base_lrs = [group["initial_lr"] for group in optimizer.param_groups]

	        self.epoch = 0
	        self.batch = 0

	        # Initialise so that get_last_lr() is usable before the first
	        # step_batch()/step_epoch() call instead of raising AttributeError.
	        self._last_lr = [group["lr"] for group in optimizer.param_groups]

	    def state_dict(self):
	        """Returns the state of the scheduler as a :class:`dict`.

	        It contains the entries "base_lrs", "epoch" and "batch"; the optimizer
	        is not included.
	        """
	        return {
	            "base_lrs": self.base_lrs,
	            "epoch": self.epoch,
	            "batch": self.batch,
	        }

	    def load_state_dict(self, state_dict):
	        """Loads the schedulers state.

	        Args:
	            state_dict (dict): scheduler state. Should be an object returned
	                from a call to :meth:`state_dict`.
	        """
	        self.__dict__.update(state_dict)

	    def get_last_lr(self) -> List[float]:
	        """Return last computed learning rate by current scheduler. Will be a list of float."""
	        return self._last_lr

	    def get_lr(self):
	        # Compute list of learning rates from self.epoch and self.batch and
	        # self.base_lrs; this must be overloaded by the user.
	        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
	        raise NotImplementedError

	    def step_batch(self, batch: Optional[int] = None) -> None:
	        # Step the batch index, or just set it. If `batch` is specified, it
	        # must be the batch index from the start of training, i.e. summed over
	        # all epochs.
	        # You can call this in any order; if you don't provide 'batch', it should
	        # of course be called once per batch.
	        if batch is not None:
	            self.batch = batch
	        else:
	            self.batch = self.batch + 1
	        self._set_lrs()

	    def step_epoch(self, epoch: Optional[int] = None):
	        # Step the epoch index, or just set it. If you provide the 'epoch' arg,
	        # you should call this at the start of the epoch; if you don't provide the 'epoch'
	        # arg, you should call it at the end of the epoch.
	        if epoch is not None:
	            self.epoch = epoch
	        else:
	            self.epoch = self.epoch + 1
	        self._set_lrs()

	    def _set_lrs(self):
	        """Recompute the learning rates and write them into the optimizer."""
	        values = self.get_lr()
	        groups = self.optimizer.param_groups
	        # Explicit check (not `assert`, which is stripped under -O): zip()
	        # below would otherwise silently ignore a length mismatch.
	        if len(values) != len(groups):
	            raise ValueError(
	                "get_lr() returned {} values for {} param groups".format(
	                    len(values), len(groups)
	                )
	            )

	        for i, (param_group, lr) in enumerate(zip(groups, values)):
	            param_group["lr"] = lr
	            self.print_lr(self.verbose, i, lr)
	        self._last_lr = [group["lr"] for group in groups]

	    def print_lr(self, is_verbose, group, lr):
	        """Display the current learning rate."""
	        if is_verbose:
	            print(
	                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
	                f" of group {group} to {lr:.4e}."
	            )


	class Eden(LRScheduler):
	    """
	    Eden scheduler.

	     lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
	                       (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25))

	    Both factors equal 1 at batch == 0 / epoch == 0 and shrink as the batch
	    and epoch counts grow past lr_batches / lr_epochs.

	    E.g. suggest initial-lr = 0.003 (passed to optimizer).

	    Args:
	      optimizer: the optimizer to change the learning rates on
	      lr_batches: the number of batches after which we start significantly
	            decreasing the learning rate, suggest 5000.
	      lr_epochs: the number of epochs after which we start significantly
	            decreasing the learning rate, suggest 6 if you plan to do e.g.
	            20 to 40 epochs, but may need smaller number if dataset is huge
	            and you will do few epochs.
	      verbose: if True, print every learning-rate update.
	    """

	    def __init__(
	        self,
	        optimizer: Optimizer,
	        lr_batches: Union[int, float],
	        lr_epochs: Union[int, float],
	        verbose: bool = False,
	    ):
	        super(Eden, self).__init__(optimizer, verbose)
	        self.lr_batches = lr_batches
	        self.lr_epochs = lr_epochs

	    def get_lr(self):
	        """Return one learning rate per param group for the current batch/epoch."""
	        # Each term is ((n^2 + N^2) / N^2) ** -0.25, i.e. 1 at n == 0 and
	        # decaying roughly like (n / N) ** -0.5 once n is much larger than N.
	        batch_factor = (
	            (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
	        ) ** -0.25
	        epoch_factor = (
	            (self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2
	        ) ** -0.25
	        factor = batch_factor * epoch_factor
	        return [x * factor for x in self.base_lrs]


	def _test_eden():
	    """Smoke test: train a small linear layer with Eve while Eden drives the lr."""
	    model = torch.nn.Linear(100, 100)
	    optimizer = Eve(model.parameters(), lr=0.003)
	    scheduler = Eden(optimizer, lr_batches=30, lr_epochs=2, verbose=True)

	    num_epochs, batches_per_epoch = 10, 20
	    for epoch in range(num_epochs):
	        scheduler.step_epoch(epoch)  # sets epoch to `epoch`

	        for _ in range(batches_per_epoch):
	            # Random input and a random upstream gradient give an arbitrary
	            # scalar objective to back-propagate through the model.
	            inputs = torch.randn(200, 100).requires_grad_()
	            outputs = model(inputs)
	            upstream = torch.randn(200, 100)
	            objective = (outputs * upstream).sum()
	            objective.backward()

	            optimizer.step()
	            scheduler.step_batch()
	            optimizer.zero_grad()

	    print("last lr = ", scheduler.get_last_lr())
	    print("state dict = ", scheduler.state_dict())


	# Run the scheduler smoke test when this file is executed as a script.
	if __name__ == "__main__":
	    _test_eden()