!18126 add comments for master thor api

Merge pull request !18126 from wangshuangling/master
i-robot 2021-06-10 20:21:49 +08:00 committed by Gitee
commit 22181af515
7 changed files with 193 additions and 58 deletions


@ -552,7 +552,7 @@ class EmbeddingThor(Cell):
Tensor of output shape :math:`(\text{batch_size}, \text{input_length}, \text{embedding_size})`.
Examples:
>>> net = nn.Embedding(20000, 768, True)
>>> net = nn.EmbeddingThor(20000, 768, True)
>>> input_data = Tensor(np.ones([8, 128]), mindspore.int32)
>>>
>>> # Maps the input word IDs to word embedding.
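The hunk truncates the example after the comment; assuming it mirrors the standard Embedding usage, the continuation would look roughly like this (the output shape follows from the (8, 128) input and embedding_size=768):

>>> output = net(input_data)
>>> print(output.shape)
(8, 128, 768)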


@ -27,7 +27,7 @@ from mindspore import context
from mindspore.context import ParallelMode
from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor
from mindspore.nn.wrap import DistributedGradReducer
from mindspore.train.train_thor.convert_utils import ConvertNetUntils
from mindspore.train.train_thor.convert_utils import ConvertNetUtils
from mindspore.parallel._auto_parallel_context import auto_parallel_context
# Enumerates types of Layer
@ -101,6 +101,17 @@ def clip_gradient(enable_clip_grad, gradients):
C0 = 16
def _check_param(momentum, frequency, lr, cls_name):
"""Check param."""
Validator.check_value_type("momentum", momentum, [float], cls_name)
if isinstance(momentum, float) and momentum < 0.0:
raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
Validator.check_value_type("frequency", frequency, [int], cls_name)
if isinstance(frequency, int) and frequency < 2:
raise ValueError("frequency should be at least 2, but got frequency {}".format(frequency))
Validator.check_value_type("learning rate", lr, [Tensor], cls_name)
def caculate_device_shape(matrix_dim, channel, is_a):
ll = (0)
if is_a:
@ -188,13 +199,103 @@ def thor(net, learning_rate, damping, momentum, weight_decay=0.0, loss_scale=1.0
use_nesterov=False, decay_filter=lambda x: x.name not in [], split_indices=None, enable_clip_grad=False,
frequency=100):
r"""
Updates gradients by the THOR algorithm.
Updates gradients by the second-order algorithm THOR.
The Trace-based Hardware-driven layer-ORiented Natural Gradient Descent Computation (THOR) algorithm is proposed in:
`THOR: Trace-based Hardware-driven layer-ORiented Natural Gradient Descent Computation
<https://www.aaai.org/AAAI21Papers/AAAI-6611.ChenM.pdf>`_
The updating formulas are as follows,
.. math::
\begin{array}{ll} \\
A_i = a_i{a_i}^T \\
G_i = D_{s_i}{ D_{s_i}}^T \\
m_i = \beta * m_i + ({G_i^{(k)}}+\lambda I)^{-1} g_i ({\overline A_{i-1}^{(k)}}+\lambda I)^{-1} \\
w_i = w_i - \alpha * m_i \\
\end{array}
:math:`D_{s_i}` represents the derivative of the loss function with respect to the output of the i-th layer,
:math:`a_{i-1}` represents the input of the i-th layer, which is the activation of the previous layer,
:math:`\beta` represents momentum, :math:`I` represents the identity matrix,
:math:`\overline A` represents the transpose of matrix A,
:math:`\lambda` represents 'damping', :math:`g_i` represents gradients of the i-th layer,
:math:`\otimes` represents Kronecker product, :math:`\alpha` represents 'learning rate'.
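To make the per-layer update concrete, here is a minimal NumPy sketch of the formulas above (illustrative shapes and made-up values, not the MindSpore implementation):

import numpy as np

in_dim, out_dim = 4, 3
a = np.random.randn(in_dim, 1)        # a_{i-1}: input activations of the layer
ds = np.random.randn(out_dim, 1)      # D_{s_i}: derivative of the loss w.r.t. the layer output
g = np.random.randn(out_dim, in_dim)  # g_i: first-order gradient of the weights
w = np.random.randn(out_dim, in_dim)  # w_i: weights
m = np.zeros_like(w)                  # m_i: momentum buffer
beta, alpha, damping = 0.9, 1e-3, 0.03

A = a @ a.T                           # A_i = a_i a_i^T
G = ds @ ds.T                         # G_i = D_{s_i} D_{s_i}^T
m = beta * m + np.linalg.inv(G + damping * np.eye(out_dim)) @ g @ np.linalg.inv(A + damping * np.eye(in_dim))
w = w - alpha * m                     # w_i = w_i - alpha * m_i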
Note:
When separating parameter groups, the weight decay in each group will be applied on the parameters if the
weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.
When separating parameter groups, if you want to centralize the gradient, set grad_centralization to True,
but gradient centralization can only be applied to the parameters of convolution layers.
If grad_centralization is set to True for the parameters of a non-convolution layer, an error will be reported.
To improve performance when separating parameter groups, a customized order of parameters is supported.
Args:
net (Cell): The training network.
learning_rate (Tensor): A value for the learning rate.
damping (Tensor): A value for the damping.
momentum (float): Hyper-parameter of type float, the momentum for the moving average. It must be at least 0.0.
weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0.0. Default: 0.0.
loss_scale (float): A value for the loss scale. It must be greater than 0.0. In general, use the
default value. Default: 1.0.
batch_size (int): The size of a batch. Default: 32
use_nesterov (bool): Enable Nesterov momentum. Default: False.
decay_filter (function): A function to determine to which layers the weight decay is applied. It only
works when `weight_decay` > 0. Default: lambda x: x.name not in [].
split_indices (list): Set the allreduce fusion strategy by A/G layer indices. It only works in distributed
mode. Taking ResNet50 as an example, there are 54 layers of A/G respectively; when split_indices is set
to [26, 53], A/G is divided into two groups for allreduce, one covering layers 0~26 and the other
covering layers 27~53. Default: None.
enable_clip_grad (bool): Whether to clip the gradients. Default: False.
frequency (int): The update interval of A/G and :math:`A^{-1}/G^{-1}`. When frequency equals N (N must be
greater than 1), A/G and :math:`A^{-1}/G^{-1}` will be updated every N steps, while the other steps reuse
the stale A/G and :math:`A^{-1}/G^{-1}` to update weights; see the sketch after this argument list. Default: 100.
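As a concrete illustration of `split_indices`, `enable_clip_grad` and `frequency`, a hedged call sketch (the network and layer indices are hypothetical):

>>> opt = thor(net, learning_rate=Tensor(1e-3), damping=Tensor(1e-3), momentum=0.9,
...            split_indices=[26, 53], enable_clip_grad=True, frequency=100)
>>> # Allreduce of A/G is fused into two groups (layers 0~26 and 27~53) and the
>>> # second-order statistics are refreshed every 100 steps.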
Inputs:
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
Outputs:
tuple[bool], all elements are True.
Raises:
TypeError: If `learning_rate` is not a Tensor.
TypeError: If `loss_scale` or `momentum` is not a float.
TypeError: If `weight_decay` is neither float nor int.
TypeError: If `use_nesterov` is not a bool.
TypeError: If `frequency` is not an int.
ValueError: If `loss_scale` is less than or equal to 0.
ValueError: If `weight_decay` or `momentum` is less than 0.
ValueError: If `frequency` is less than 2.
Supported Platforms:
``Ascend`` ``GPU``
Examples:
>>> net = Net()
>>> opt = thor(net, learning_rate=Tensor(1e-3), damping=Tensor(1e-3), momentum=0.9)
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> loss_scale = FixedLossScaleManager(128, drop_overflow_update=False)
>>> model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
...               amp_level="O2", keep_batchnorm_fp32=False)
>>> model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
...                                                   loss_scale_manager=loss_scale, metrics={'acc'},
...                                                   amp_level="O2", keep_batchnorm_fp32=False)
>>> model.train(config.epoch_size, dataset, callbacks=cb, sink_size=100, dataset_sink_mode=True)
"""
context.set_context(max_call_depth=10000)
ConvertNetUntils().convert_to_thor_net(net)
ConvertNetUtils().convert_to_thor_net(net)
if context.get_context("device_target") == "Ascend":
return ThorAscend(net, learning_rate, damping, momentum, weight_decay, loss_scale, batch_size, decay_filter,
split_indices=split_indices, enable_clip_grad=enable_clip_grad, frequency=frequency)
@ -212,9 +313,7 @@ class ThorGpu(Optimizer):
enable_clip_grad=False, frequency=100):
params = filter(lambda x: x.requires_grad, net.get_parameters())
super(ThorGpu, self).__init__(learning_rate, params, weight_decay, loss_scale)
Validator.check_value_type("momentum", momentum, [float], self.cls_name)
if isinstance(momentum, float) and momentum < 0.0:
raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
_check_param(momentum, frequency, learning_rate, self.__class__.__name__)
self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
self.params = self.parameters
self.use_nesterov = Validator.check_bool(use_nesterov)
@ -448,18 +547,14 @@ class ThorGpu(Optimizer):
matrix_a = matrix_a_allreduce[thor_layer_count]
matrix_g = matrix_g_allreduce[thor_layer_count]
g = self.update_gradient(matrix_g, g, matrix_a)
fake_a = self.assign(self.matrix_a[thor_layer_count], matrix_a)
fake_g = self.assign(self.matrix_g[thor_layer_count], matrix_g)
g = F.depend(g, fake_a)
g = F.depend(g, fake_g)
self.assign(self.matrix_a[thor_layer_count], matrix_a)
self.assign(self.matrix_g[thor_layer_count], matrix_g)
g = self._reshape_gradient(conv_layer_count, g, g_shape)
elif layer_type == Embedding:
matrix_a = matrix_a_allreduce[thor_layer_count]
matrix_g = matrix_g_allreduce[thor_layer_count]
fake_a = self.assign(self.matrix_a[thor_layer_count], matrix_a)
fake_g = self.assign(self.matrix_g[thor_layer_count], matrix_g)
g = F.depend(g, fake_a)
g = F.depend(g, fake_g)
self.assign(self.matrix_a[thor_layer_count], matrix_a)
self.assign(self.matrix_g[thor_layer_count], matrix_g)
temp_a = self.expand(matrix_a, 1)
g = self.mul(temp_a, g)
g = self.matmul(g, matrix_g)
@ -507,8 +602,7 @@ class ThorAscend(Optimizer):
decay_filter=lambda x: x.name not in [], split_indices=None, enable_clip_grad=False, frequency=100):
params = filter(lambda x: x.requires_grad, net.get_parameters())
super(ThorAscend, self).__init__(learning_rate, params, weight_decay, loss_scale)
if isinstance(momentum, float) and momentum < 0.0:
raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
_check_param(momentum, frequency, learning_rate, self.__class__.__name__)
self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
self.params = self.parameters
self.moments = self.params.clone(prefix="moments", init='zeros')
@ -845,10 +939,8 @@ class ThorAscend(Optimizer):
"""process thor graph fc layer"""
temp_a = matrix_a_allreduce[thor_layer_count]
temp_g = matrix_g_allreduce[thor_layer_count]
fake_a = self.assign(self.matrix_a_cov[thor_layer_count], temp_a)
fake_g = self.assign(self.matrix_g_cov[thor_layer_count], temp_g)
g = F.depend(g, fake_a)
g = F.depend(g, fake_g)
self.assign(self.matrix_a_cov[thor_layer_count], temp_a)
self.assign(self.matrix_g_cov[thor_layer_count], temp_g)
temp_a = self.cast(temp_a, mstype.float16)
temp_g = self.cast(temp_g, mstype.float16)
g = self.cast(g, mstype.float16)
@ -925,10 +1017,8 @@ class ThorAscend(Optimizer):
if layer_type == Embedding:
temp_a_ori = matrix_a_allreduce[thor_layer_count]
temp_g = matrix_g_allreduce[thor_layer_count]
fake_a = self.assign(self.matrix_a_cov[thor_layer_count], temp_a_ori)
fake_g = self.assign(self.matrix_g_cov[thor_layer_count], temp_g)
g = F.depend(g, fake_a)
g = F.depend(g, fake_g)
self.assign(self.matrix_a_cov[thor_layer_count], temp_a_ori)
self.assign(self.matrix_g_cov[thor_layer_count], temp_g)
temp_a = self.expand(temp_a_ori, 1)
g = self.mul(temp_a, g)
temp_g = self.cast(temp_g, mstype.float16)
@ -982,12 +1072,9 @@ class ThorAscend(Optimizer):
temp_a = self.cast(temp_a, mstype.float16)
temp_g = self.cast(temp_g, mstype.float16)
g, temp_max = self._get_second_grad_by_matmul(i, temp_a, temp_g, g, temp_max)
fake_a = self.assign(self.matrix_a[thor_layer_count], temp_a)
fake_g = self.assign(self.matrix_g[thor_layer_count], temp_g)
fake_max = self.assign(self.matrix_max_inv[thor_layer_count], temp_max)
g = F.depend(g, fake_a)
g = F.depend(g, fake_g)
g = F.depend(g, fake_max)
self.assign(self.matrix_a[thor_layer_count], temp_a)
self.assign(self.matrix_g[thor_layer_count], temp_g)
self.assign(self.matrix_max_inv[thor_layer_count], temp_max)
new_grads = new_grads + (g,)
gradients = new_grads
else:


@ -85,7 +85,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer):
Examples:
>>> input_x = Tensor(np.ones(shape=[256, 256]), mindspore.float32)
>>> cus_choleskytrsm = ops.CusCholeskyTrsm()
>>> output = matmul(input_x)
>>> output = cus_choleskytrsm(input_x)
"""
@prim_attr_register


@ -14,6 +14,6 @@
# ============================================================================
"""convert to second order related classes and functions."""
from .convert_utils import ConvertNetUntils, ConvertModelUtils
from .convert_utils import ConvertNetUtils, ConvertModelUtils
__all__ = ["ConvertNetUntils", "ConvertModelUtils"]
__all__ = ["ConvertNetUtils", "ConvertModelUtils"]


@ -13,29 +13,28 @@
# limitations under the License.
# ============================================================================
"""
convert utils for second order optimizer: thor
Conversion interfaces for the second-order optimizer thor
"""
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import context
class ConvertNetUntils():
class ConvertNetUtils():
"""
Convert net to thor layer net
"""
def __init__(self):
self._convert_method_map = {nn.Dense: ConvertNetUntils._convert_dense,
nn.Embedding: ConvertNetUntils._convert_embedding,
nn.Conv2d: ConvertNetUntils._convert_conv2d}
self._convert_method_map = {nn.Dense: ConvertNetUtils._convert_dense,
nn.Embedding: ConvertNetUtils._convert_embedding,
nn.Conv2d: ConvertNetUtils._convert_conv2d}
@staticmethod
def _convert_dense(subcell):
"""
convert dense cell to second_order cell
Convert dense cell to second-order cell
"""
weight = subcell.weight
act_name = None
if subcell.activation_flag:
@ -69,7 +68,7 @@ class ConvertNetUntils():
@staticmethod
def _convert_embedding(subcell):
"""
convert embedding cell to second_order cell
Convert embedding cell to second-order cell
"""
new_subcell = nn.EmbeddingThor(vocab_size=subcell.vocab_size,
embedding_size=subcell.embedding_size,
@ -81,7 +80,7 @@ class ConvertNetUntils():
@staticmethod
def _convert_conv2d(subcell):
"""
convert conv2d cell to second_order cell
Convert conv2d cell to second-order cell
"""
out_channel = subcell.out_channels
in_channel = subcell.in_channels
@ -99,7 +98,7 @@ class ConvertNetUntils():
def _convert_to_thor_net(self, net):
"""
convert net to thor net
Convert net to thor net
"""
cells = net.name_cells()
change = False
@ -131,25 +130,76 @@ class ConvertNetUntils():
def convert_to_thor_net(self, net):
"""
api for convert net to thor net
This interface is used to convert a network to a thor layer network, in order to calculate and store the
second-order information matrices.
Note:
This interface is automatically called by the second-order optimizer thor.
Args:
net (Cell): Network to be trained by the second-order optimizer thor.
Examples:
>>> ConvertNetUtils().convert_to_thor_net(net)
"""
net.update_cell_prefix()
self._convert_to_thor_net(net)
net.update_cell_type("second_order")
net.update_cell_type("second-order")
class ConvertModelUtils():
"""
convert model to thor model utils
Convert model to thor model.
"""
@staticmethod
def convert_to_thor_model(model, network, loss_fn=None, optimizer=None, metrics=None, amp_level="O0",
loss_scale_manager=None, keep_batchnorm_fp32=False):
"""
This interface is used to convert a model to a thor model.
Args:
model (Object): High-Level API for Training.
`Model` groups layers into an object with training features.
network (Cell): A training network.
loss_fn (Cell): Objective function. Default: None.
optimizer (Cell): Optimizer used to update the weights. Default: None.
metrics (Union[dict, set]): A dictionary or a set of metrics to be evaluated by the model during
training, e.g. {'accuracy', 'recall'}. Default: None.
amp_level (str): Level for mixed precision training. Supports ["O0", "O2", "O3", "auto"]. Default: "O0".
- O0: Do not change.
- O2: Cast network to float16, keep batchnorm run in float32, using dynamic loss scale.
- O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'.
- auto: Set the level to the recommended level for the current device: O2 is recommended on GPU and
O3 is recommended on Ascend. The recommended levels are based on expert experience and may not
generalize to every scenario; users should specify the level explicitly for special networks.
loss_scale_manager (Union[None, LossScaleManager]): If it is None, the loss will not be scaled.
Otherwise, the loss is scaled by the LossScaleManager and the optimizer cannot be None. It is a
keyword argument, e.g. `loss_scale_manager=FixedLossScaleManager(128, drop_overflow_update=False)`.
keep_batchnorm_fp32 (bool): Whether to keep BatchNorm running in `float32`. If True, the level setting
before will be overwritten. Default: False.
Returns:
model (Object): The converted model, a high-level API for training. `Model` groups layers into an object
with training features.
Examples:
>>> from mindspore import Tensor
>>> import mindspore.nn as nn
>>> from mindspore.nn.optim import thor
>>> from mindspore.train.model import Model
>>> from mindspore.train.loss_scale_manager import FixedLossScaleManager
>>>
>>> net = Net()
>>> lr = Tensor(1e-3)
>>> damping = Tensor(1e-3)
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> loss_manager = FixedLossScaleManager(128, drop_overflow_update=False)
>>> opt = thor(net, lr, damping, momentum=0.9, weight_decay=1e-4, loss_scale=128, batch_size=32,
...            frequency=100)
>>> model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_manager, metrics={"acc"},
...               amp_level="O2", keep_batchnorm_fp32=False)
>>> model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
...                                                   metrics={'acc'}, amp_level="O2",
...                                                   loss_scale_manager=loss_manager,
...                                                   keep_batchnorm_fp32=False)
"""
api for convert model to thor model
"""
optim_name = type(optimizer).__name__
if optim_name in ("ThorAscend", "ThorGpu"):
from .model_thor import ModelThor
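The hunk ends inside the dispatch branch. A hedged sketch of how the remainder presumably continues, consistent with the docstring above (ModelThor's exact constructor arguments are an assumption):

        # Wrap the training job in ModelThor when a THOR optimizer is detected;
        # otherwise the original model is returned unchanged.
        model = ModelThor(network=network, loss_fn=loss_fn, optimizer=optimizer, metrics=metrics,
                          amp_level=amp_level, loss_scale_manager=loss_scale_manager,
                          keep_batchnorm_fp32=keep_batchnorm_fp32)
    return model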


@ -53,6 +53,7 @@ class DatasetHelper:
If sink_size=-1, sink the complete dataset for each epoch.
If sink_size>0, sink sink_size data for each epoch. Default: -1.
epoch_num (int): Control the number of epoch data to send. Default: 1.
iter_first_order (int): Controls the number of steps of the first-order graph to execute. Default: 1.
Examples:
>>> dataset_helper = DatasetHelper(dataset)
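A hedged sketch using the sinking arguments documented above (dataset construction assumed):

>>> # Sink 100 steps per epoch instead of the complete dataset, sending one epoch of data.
>>> dataset_helper = DatasetHelper(dataset, sink_size=100, epoch_num=1)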


@ -26,7 +26,7 @@ from mindspore import context
from mindspore.context import ParallelMode
from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor
from mindspore.nn.wrap import DistributedGradReducer
from mindspore.train.train_thor.convert_utils import ConvertNetUntils
from mindspore.train.train_thor.convert_utils import ConvertNetUtils
from mindspore.parallel._auto_parallel_context import auto_parallel_context
# Enumerates types of Layer
@ -147,7 +147,7 @@ def get_layer_counter(layer_type, layer_counter, params, idx):
def THOR(net, learning_rate, damping, momentum, weight_decay=0.0, loss_scale=1.0, batch_size=32,
use_nesterov=False, decay_filter=lambda x: x.name not in [], split_indices=None):
context.set_context(max_call_depth=10000)
ConvertNetUntils().convert_to_thor_net(net)
ConvertNetUtils().convert_to_thor_net(net)
return THOR_Ascend(net, learning_rate, damping, momentum, weight_decay, loss_scale, batch_size, decay_filter,
split_indices=split_indices)
@ -486,12 +486,9 @@ class THOR_Ascend(Optimizer):
temp_max = self.mul(temp_max, self.batch_size / A_normalizer)
g = self.cube_matmul_left(temp_g, g)
g = self.cube_matmul_right_mul(g, temp_a, temp_max)
fake_A = self.assign(self.matrix_A[thor_layer_count], temp_a)
fake_G = self.assign(self.matrix_G[thor_layer_count], temp_g)
fake_max = self.assign(self.matrix_max_inv[thor_layer_count], temp_max)
g = F.depend(g, fake_A)
g = F.depend(g, fake_G)
g = F.depend(g, fake_max)
self.assign(self.matrix_A[thor_layer_count], temp_a)
self.assign(self.matrix_G[thor_layer_count], temp_g)
self.assign(self.matrix_max_inv[thor_layer_count], temp_max)
if i == 159:
new_grads = new_grads + (g, gradients[i + 1])
else: