Source code for qhoptim.tf.qhm

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops

from ..common import param_conv
from .util import call_if_callable


class QHMOptimizer(optimizer.Optimizer):
    r"""Implements the quasi-hyperbolic momentum (QHM) optimization algorithm
    `(Ma and Yarats, 2019)`_.

    Note that many other optimization algorithms are accessible via specific
    parameterizations of QHM. See :func:`from_accsgd()`,
    :func:`from_robust_momentum()`, etc. for details.

    Args:
        learning_rate (float): learning rate (:math:`\alpha` from the paper)
        momentum (float): momentum factor (:math:`\beta` from the paper)
        nu (float): immediate discount factor (:math:`\nu` from the paper)
        use_locking (bool): whether or not to use locking for parameter updates
        name (str): name of the optimizer

    Example:
        >>> optimizer = qhoptim.tf.QHMOptimizer(
        ...     learning_rate=1.0, nu=0.7, momentum=0.999)

    .. _`(Ma and Yarats, 2019)`: https://arxiv.org/abs/1810.06801

    .. note::

        Mathematically, QHM is a simple interpolation between plain SGD and
        momentum:

        .. math::

            \begin{align*}
                g_{t + 1} &\leftarrow
                    \beta \cdot g_t + (1 - \beta) \cdot \nabla_t \\
                \theta_{t + 1} &\leftarrow
                    \theta_t - \alpha \left[ (1 - \nu) \cdot \nabla_t +
                                             \nu \cdot g_{t + 1} \right]
            \end{align*}

        Here, :math:`\alpha` is the learning rate, :math:`\beta` is the
        momentum factor, and :math:`\nu` is the "immediate discount" factor
        which controls the interpolation between plain SGD and momentum.
        :math:`g_t` is the momentum buffer, :math:`\theta_t` is the parameter
        vector, and :math:`\nabla_t` is the gradient with respect to
        :math:`\theta_t`.

    .. note::

        QHM uses **dampened** momentum. This means that when converting from
        plain momentum to QHM, the learning rate must be scaled by
        :math:`\frac{1}{1 - \beta}`. For example, momentum with learning rate
        :math:`\alpha = 0.1` and momentum :math:`\beta = 0.9` should be
        converted to QHM with learning rate :math:`\alpha = 1.0`.
    """

    def __init__(self, learning_rate, momentum, nu, use_locking=False, name="QHM"):
        super().__init__(use_locking, name)
        self._learning_rate = learning_rate
        self._learning_rate_tensor = None
        self._momentum = momentum
        self._momentum_tensor = None
        self._nu = nu
        self._nu_tensor = None

    def _create_slots(self, var_list):
        for v in var_list:
            self._zeros_slot(v, "momentum", self._name)

    def _prepare(self):
        learning_rate = call_if_callable(self._learning_rate)
        self._learning_rate_tensor = ops.convert_to_tensor(
            learning_rate, dtype=dtypes.float64, name="learning_rate"
        )

        momentum = call_if_callable(self._momentum)
        self._momentum_tensor = ops.convert_to_tensor(momentum, dtype=dtypes.float64, name="momentum")

        nu = call_if_callable(self._nu)
        self._nu_tensor = ops.convert_to_tensor(nu, dtype=dtypes.float64, name="nu")

    def _apply_dense(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.apply_momentum(
            var,
            momentum_buffer,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        ).op

        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.apply_gradient_descent(
                var, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
            ).op

        return control_flow_ops.group(momentum_op, gd_op)

    def _resource_apply_dense(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.resource_apply_momentum(
            var.handle,
            momentum_buffer.handle,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        )

        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.resource_apply_gradient_descent(
                var.handle, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
            )

        return control_flow_ops.group(momentum_op, gd_op)

    def _apply_sparse(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.sparse_apply_momentum(
            var,
            momentum_buffer,
            nu * (1.0 - momentum) * learning_rate,
            grad.values,
            grad.indices,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        ).op

        with ops.control_dependencies([momentum_op]):
            delta = ops.IndexedSlices((nu - 1.0) * learning_rate * grad.values, grad.indices, grad.dense_shape)
            gd_op = var.scatter_add(delta, use_locking=self._use_locking)

        return control_flow_ops.group(momentum_op, gd_op)

    def _resource_apply_sparse(self, grad, var, indices):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.resource_sparse_apply_momentum(
            var.handle,
            momentum_buffer.handle,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            indices,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        )

        with ops.control_dependencies([momentum_op]):
            delta = (nu - 1.0) * learning_rate * grad
            gd_op = resource_variable_ops.resource_scatter_add(var.handle, indices, delta)

        return control_flow_ops.group(momentum_op, gd_op)

    @classmethod
    def _params_to_dict(cls, params):
        return {"learning_rate": params.alpha, "nu": params.nu, "momentum": params.beta}
    @classmethod
    def from_pid(cls, k_p, k_i, k_d):
        r"""Calculates the QHM hyperparameters required to recover a PID
        optimizer as described in `Recht (2018)`_.

        Args:
            k_p (float): proportional gain (see reference)
            k_i (float): integral gain (see reference)
            k_d (float): derivative gain (see reference)

        Returns:
            Three-element ``dict`` containing ``learning_rate``, ``momentum``,
            and ``nu`` to use in QHM.

        Example:
            >>> optimizer = qhoptim.tf.QHMOptimizer(
            ...     **qhoptim.tf.QHMOptimizer.from_pid(
            ...         k_p=-0.1, k_i=1.0, k_d=3.0))

        .. _`Recht (2018)`:
            https://web.archive.org/web/20181027184056/http://www.argmin.net/2018/04/19/pid/
        """
        return cls._params_to_dict(param_conv.from_pid(k_p, k_i, k_d))

    @classmethod
    def from_synthesized_nesterov(cls, alpha, beta1, beta2):
        r"""Calculates the QHM hyperparameters required to recover the
        synthesized Nesterov optimizer (Section 6 of `Lessard et al. (2016)`_).

        Args:
            alpha (float): learning rate
            beta1 (float): first momentum (see reference)
            beta2 (float): second momentum (see reference)

        Returns:
            Three-element ``dict`` containing ``learning_rate``, ``momentum``,
            and ``nu`` to use in QHM.

        Example:
            >>> optimizer = qhoptim.tf.QHMOptimizer(
            ...     **qhoptim.tf.QHMOptimizer.from_synthesized_nesterov(
            ...         alpha=0.1, beta1=0.9, beta2=0.6))

        .. _`Lessard et al. (2016)`: https://arxiv.org/abs/1408.3595
        """
        return cls._params_to_dict(param_conv.from_synthesized_nesterov(alpha, beta1, beta2))

    @classmethod
    def from_robust_momentum(cls, l, kappa, rho=None):
        r"""Calculates the QHM hyperparameters required to recover the Robust
        Momentum `(Cyrus et al., 2018)`_ or Triple Momentum
        `(Scoy et al., 2018)`_ optimizers.

        Args:
            l (float): Lipschitz constant of gradient (see reference)
            kappa (float): condition ratio (see reference)
            rho (float, optional): noise-free convergence rate. If None, will
                return the parameters for the Triple Momentum optimizer.

        Returns:
            Three-element ``dict`` containing ``learning_rate``, ``momentum``,
            and ``nu`` to use in QHM.

        Example:
            >>> optimizer = qhoptim.tf.QHMOptimizer(
            ...     **qhoptim.tf.QHMOptimizer.from_robust_momentum(
            ...         l=5.0, kappa=15.0))

        .. _`(Cyrus et al., 2018)`: https://arxiv.org/abs/1710.04753

        .. _`(Scoy et al., 2018)`:
            http://www.optimization-online.org/DB_FILE/2017/03/5908.pdf
        """
        return cls._params_to_dict(param_conv.from_robust_momentum(l, kappa, rho))

    @classmethod
    def from_accsgd(cls, delta, kappa, xi, eps=0.7):
        r"""Calculates the QHM hyperparameters required to recover the AccSGD
        optimizer `(Kidambi et al., 2018)`_.

        Args:
            delta (float): short step (see reference)
            kappa (float): long step parameter (see reference)
            xi (float): statistical advantage parameter (see reference)
            eps (float, optional): arbitrary value, between 0 and 1 exclusive
                (see reference) (default: 0.7)

        Returns:
            Three-element ``dict`` containing ``learning_rate``, ``momentum``,
            and ``nu`` to use in QHM.

        Example:
            >>> optimizer = qhoptim.tf.QHMOptimizer(
            ...     **qhoptim.tf.QHMOptimizer.from_accsgd(
            ...         delta=0.1, kappa=1000.0, xi=10.0))

        .. _`(Kidambi et al., 2018)`: https://arxiv.org/abs/1803.05591
        """
        return cls._params_to_dict(param_conv.from_accsgd(delta, kappa, xi, eps))

    @classmethod
    def from_two_state_optimizer(cls, h, k, l, m, q, z):
        r"""Calculates the QHM hyperparameters required to recover the
        following optimizer (named "TSO" in `Ma and Yarats (2019)`_):

        .. math::

            \begin{align*}
                a_{t + 1} &\leftarrow
                    h \cdot a_t + k \cdot \theta_t + l \cdot \nabla_t \\
                \theta_{t + 1} &\leftarrow
                    m \cdot a_t + q \cdot \theta_t + z \cdot \nabla_t
            \end{align*}

        Here, :math:`a_t` and :math:`\theta_t` are the two states and
        :math:`\nabla_t` is the gradient with respect to :math:`\theta_t`.

        Be careful that your coefficients satisfy the regularity conditions
        from the reference.

        Args:
            h (float): see description
            k (float): see description
            l (float): see description
            m (float): see description
            q (float): see description
            z (float): see description

        Returns:
            Three-element ``dict`` containing ``learning_rate``, ``momentum``,
            and ``nu`` to use in QHM.

        Example:
            >>> optimizer = qhoptim.tf.QHMOptimizer(
            ...     **qhoptim.tf.QHMOptimizer.from_two_state_optimizer(
            ...         h=0.9, k=0.0, l=0.1, m=-0.09, q=1.0, z=-0.01))

        .. _`Ma and Yarats (2019)`: https://arxiv.org/abs/1810.06801
        """
        return cls._params_to_dict(param_conv.from_two_state_optimizer(h, k, l, m, q, z))
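
The parameter update in the ``_apply_*`` methods is split across two kernels: an apply-momentum step with effective step size ``nu * (1 - momentum) * learning_rate``, followed by a plain gradient-descent step with step size ``(1 - nu) * learning_rate``. The NumPy sketch below is not part of qhoptim and its names are purely illustrative; it checks on a toy quadratic that this two-op decomposition reproduces the dampened QHM update rule from the class docstring.

import numpy as np

alpha, beta, nu = 1.0, 0.999, 0.7
theta_ref = theta_ops = np.array([5.0, -3.0])
g = np.zeros(2)       # dampened momentum buffer (docstring formulation)
accum = np.zeros(2)   # undampened slot buffer (as used by the TF momentum kernel)

for _ in range(50):
    # Reference QHM update on f(theta) = ||theta||^2, gradient 2 * theta:
    #   g     <- beta * g + (1 - beta) * grad
    #   theta <- theta - alpha * [(1 - nu) * grad + nu * g]
    grad = 2.0 * theta_ref
    g = beta * g + (1.0 - beta) * grad
    theta_ref = theta_ref - alpha * ((1.0 - nu) * grad + nu * g)

    # Two-kernel decomposition: undampened momentum step, then plain GD step.
    grad_ops = 2.0 * theta_ops
    accum = beta * accum + grad_ops
    theta_ops = theta_ops - nu * (1.0 - beta) * alpha * accum   # momentum kernel
    theta_ops = theta_ops - (1.0 - nu) * alpha * grad_ops       # gradient-descent kernel

assert np.allclose(theta_ref, theta_ops)

The equivalence rests on the identity g_t = (1 - beta) * accum_t when both buffers start at zero, which is why the momentum kernel's step size carries the extra (1 - beta) factor.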
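
A minimal end-to-end usage sketch, assuming a TF1-style graph-mode session (reached here through ``tensorflow.compat.v1``) and that ``QHMOptimizer`` is importable from ``qhoptim.tf`` as in the docstring examples; the toy variable and loss are illustrative placeholders, not part of the library.

import tensorflow.compat.v1 as tf
from qhoptim.tf import QHMOptimizer

tf.disable_eager_execution()

# Toy problem: minimize ||x||^2 from a fixed starting point.
x = tf.get_variable("x", initializer=[5.0, -3.0])
loss = tf.reduce_sum(tf.square(x))

# Note the dampened-momentum convention: learning_rate=1.0 with momentum=0.999
# plays the role of learning_rate=0.001 in an undampened momentum optimizer.
optimizer = QHMOptimizer(learning_rate=1.0, momentum=0.999, nu=0.7)
train_op = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        _, loss_value = sess.run([train_op, loss])
    print(loss_value)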