update annotation of WarmUpLR, FTRL, LARS, etc. operators.

wangshuide2020 2021-01-18 19:17:18 +08:00
parent fa3638ad6b
commit 2e078eefb4
6 changed files with 113 additions and 96 deletions


@@ -481,28 +481,32 @@ class OneHot(Cell):
"""
Returns a one-hot tensor.
The locations represented by indices in argument `indices` take value on_value,
while all other locations take value off_value.
Note:
If the input indices has rank :math:`N`, the output will have rank :math:`N+1`. The new
axis is created at dimension `axis`.
If `indices` is a scalar, the output shape will be a vector of length `depth`.
If `indices` is a vector of length `features`, the output shape will be:

.. code-block::

    features * depth if axis == -1
    depth * features if axis == 0

If `indices` is a matrix with shape `[batch, features]`, the output shape will be:

.. code-block::

    batch * features * depth if axis == -1
    batch * depth * features if axis == 1
    depth * batch * features if axis == 0
Args:
axis (int): Features x depth if axis is -1, depth x features
@@ -519,7 +523,7 @@ class OneHot(Cell):
- **indices** (Tensor) - A tensor of indices of data type mindspore.int32 and arbitrary shape.
Outputs:
Tensor, the one-hot tensor of data type `dtype` with dimension at `axis` expanded to `depth` and filled with
on_value and off_value.
Supported Platforms:
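
The shape rules above can be checked with a minimal NumPy sketch. This is only an
illustration of the documented semantics, not the MindSpore kernel, and the helper
name `one_hot` is hypothetical:

.. code-block:: python

    import numpy as np

    def one_hot(indices, depth, on_value=1.0, off_value=0.0, axis=-1):
        """Illustrative one-hot: a new axis of size `depth` is created at `axis`."""
        indices = np.asarray(indices)
        # Build with the depth axis last, then move it to the requested position.
        out = np.where(indices[..., None] == np.arange(depth), on_value, off_value)
        return np.moveaxis(out, -1, axis)

    print(one_hot([1, 0, 2], depth=3).shape)          # (3, 3): features * depth, axis == -1
    print(one_hot([1, 0, 2], depth=3, axis=0).shape)  # (3, 3): depth * features, axis == 0
    print(one_hot([[0, 1], [2, 0]], depth=3).shape)   # (2, 2, 3): batch * features * depth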
@@ -563,7 +567,9 @@ class Pad(Cell):
will be extended behind the `D`-th dimension of the input tensor. The padded size of each dimension D of the
output is:

.. code-block::

    paddings[D, 0] + input_x.dim_size(D) + paddings[D, 1]
mode (str): Specifies padding mode. The optional values are "CONSTANT", "REFLECT", "SYMMETRIC".
Default: "CONSTANT".
@@ -723,9 +729,14 @@ class Unfold(Cell):
Outputs:
Tensor, a 4-D tensor whose data type is the same as `input_x`,
and the shape is [out_batch, out_depth, out_row, out_col] where `out_batch` is the same as the `in_batch`.

.. code-block::

    out_depth = ksize_row * ksize_col * in_depth
    out_row = (in_row - (ksize_row + (ksize_row - 1) * (rate_row - 1))) // stride_row + 1
    out_col = (in_col - (ksize_col + (ksize_col - 1) * (rate_col - 1))) // stride_col + 1
Supported Platforms:
``Ascend``
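
The output-size expressions are ordinary integer arithmetic and can be evaluated
directly. A small sketch, assuming a 32x32x3 input, 3x3 patches, stride 2, and
dilation rate 1 (the function name is hypothetical):

.. code-block:: python

    def unfold_out_shape(in_row, in_col, in_depth, ksize, stride, rate):
        """Output dimensions per the formulas above (square kernel/stride/rate)."""
        eff = ksize + (ksize - 1) * (rate - 1)  # dilated kernel extent
        out_row = (in_row - eff) // stride + 1
        out_col = (in_col - eff) // stride + 1
        out_depth = ksize * ksize * in_depth
        return out_depth, out_row, out_col

    print(unfold_out_shape(32, 32, 3, ksize=3, stride=2, rate=1))  # (27, 15, 15)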
@@ -867,13 +878,15 @@ def _get_matrix_diag_part_assist(x_shape, x_dtype):
class MatrixDiag(Cell):
r"""
Returns a batched diagonal tensor with given batched diagonal values.
Assume :math:`x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
:math:`k+1` with dimensions :math:`[I, J, K, ..., N, N]` where:

.. code-block::

    output[i, j, k, ..., m, n] = 1{m=n} * x[i, j, k, ..., n]
Inputs:
- **x** (Tensor) - The diagonal values. It can be one of the following data types:
@@ -911,10 +924,12 @@ class MatrixDiagPart(Cell):
r"""
Returns the batched diagonal part of a batched tensor.
Assume `x` has :math:`k` dimensions :math:`[I, J, K, ..., M, N]`, then the output is a tensor of rank
:math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N)]` where:

.. code-block::

    output[i, j, k, ..., n] = x[i, j, k, ..., n, n]
Inputs:
- **x** (Tensor) - The batched tensor. It can be one of the following data types:
@@ -953,13 +968,15 @@ class MatrixSetDiag(Cell):
r"""
Modifies the batched diagonal part of a batched tensor.
Assume `x` has :math:`k+1` dimensions :math:`[I, J, K, ..., M, N]` and `diagonal` has :math:`k`
dimensions :math:`[I, J, K, ..., min(M, N)]`. Then the output is a tensor of rank :math:`k+1` with dimensions
:math:`[I, J, K, ..., M, N]` where:

.. code-block::

    output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n] for m == n
    output[i, j, k, ..., m, n] = x[i, j, k, ..., m, n] for m != n
Inputs:
- **x** (Tensor) - The batched tensor. Rank k+1, where k >= 1. It can be one of the following data types:
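
The three diagonal formulas above map directly onto NumPy. A minimal sketch of all
three operations on a batch of 3x3 matrices (illustrative only, not the MindSpore
kernels):

.. code-block:: python

    import numpy as np

    x = np.arange(2 * 3 * 3, dtype=np.float32).reshape(2, 3, 3)  # batched matrices
    d = np.ones((2, 3), dtype=np.float32)                        # batched diagonals

    # MatrixDiag: output[..., m, n] = 1{m=n} * d[..., n]
    diag_full = np.eye(3, dtype=np.float32) * d[..., np.newaxis, :]

    # MatrixDiagPart: output[..., n] = x[..., n, n]
    diag_part = np.diagonal(x, axis1=-2, axis2=-1)

    # MatrixSetDiag: replace the diagonal of x with d, keep all other entries.
    idx = np.arange(3)
    set_diag = x.copy()
    set_diag[..., idx, idx] = d

    print(diag_full.shape, diag_part.shape, set_diag.shape)  # (2, 3, 3) (2, 3) (2, 3, 3)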


@@ -105,7 +105,7 @@ class Range(Cell):
r"""
Creates a sequence of numbers in range [start, limit) with step size delta.
The size of output is :math:`\left \lfloor \frac{limit-start}{delta} \right \rfloor + 1` and `delta` is the gap
between two values in the tensor.
.. math::
@@ -827,7 +827,7 @@ def matmul_op_select(x1_shape, x2_shape, transpose_x1, transpose_x2):
class MatMul(Cell):
r"""
Multiplies matrix `x1` by matrix `x2`.
- If both x1 and x2 are 1-dimensional, the dot product is returned.
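
NumPy's `matmul` follows the same 1-D convention, which makes the rule easy to
demonstrate (an illustration of the documented behavior, not the MindSpore
implementation):

.. code-block:: python

    import numpy as np

    a = np.array([1.0, 2.0, 3.0])
    b = np.array([4.0, 5.0, 6.0])
    # Two 1-D inputs collapse to a scalar dot product.
    print(np.matmul(a, b))  # 32.0 == 1*4 + 2*5 + 3*6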


@@ -212,26 +212,26 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
r"""
Quantization aware operation which provides the fake quantization observer function on data with min and max.
The running min/max :math:`x_{min}` and :math:`x_{max}` are computed as:
.. math::
\begin{array}{ll} \\
x_{min} =
\begin{cases}
\min(\min(X), 0)
& \text{ if } ema = \text{False} \\
\min((1 - c) \min(X) + \text{c } x_{min}, 0)
& \text{ if } \text{otherwise}
\end{cases}\\
x_{max} =
\begin{cases}
\max(\max(X), 0)
& \text{ if } ema = \text{False} \\
\max((1 - c) \max(X) + \text{c } x_{max}, 0)
& \text{ if } \text{otherwise}
\end{cases}
\end{array}
where X is the input tensor, and :math:`c` is the `ema_decay`.
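
A short sketch of the running-min/max update, following the formulas literally;
`c` is the `ema_decay` defined above, and the function name is hypothetical:

.. code-block:: python

    import numpy as np

    def update_min_max(X, x_min, x_max, ema=True, c=0.999):
        """Running min/max per the formulas above."""
        if not ema:
            return min(X.min(), 0.0), max(X.max(), 0.0)
        return (min((1 - c) * X.min() + c * x_min, 0.0),
                max((1 - c) * X.max() + c * x_max, 0.0))

    X = np.array([-0.8, 0.2, 1.5])
    print(update_min_max(X, x_min=-1.0, x_max=1.0))  # slowly tracks the batch range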
@@ -239,32 +239,32 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
.. math::
\begin{array}{ll} \\
s =
\begin{cases}
\frac{x_{max} - x_{min}}{Q_{max} - Q_{min}}
& \text{ if } symmetric = \text{False} \\
\frac{2\max(x_{max}, \left | x_{min} \right |) }{Q_{max} - Q_{min}}
& \text{ if } \text{otherwise}
\end{cases}\\
zp\_min = Q_{min} - \frac{x_{min}}{scale} \\
zp = \left \lfloor \min(Q_{max}, \max(Q_{min}, zp\_min)) + 0.5 \right \rfloor
\end{array}
where :math:`Q_{max}` and :math:`Q_{min}` are decided by quant_dtype; for example, if quant_dtype=INT8,
then :math:`Q_{max} = 127` and :math:`Q_{min} = -128`.
The fake quant output is computed as:
.. math::
\begin{array}{ll} \\
u_{min} = (Q_{min} - zp) * scale \\
u_{max} = (Q_{max} - zp) * scale \\
u_X = \left \lfloor \frac{\min(u_{max}, \max(u_{min}, X)) - u_{min}}{scale}
+ 0.5 \right \rfloor \\
output = u_X * scale + u_{min}
\end{array}
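
Putting the three stages together (scale, zero point, fake-quantized output), the
whole computation fits in a few lines of NumPy. This follows the asymmetric
formulas as written, with INT8 bounds assumed:

.. code-block:: python

    import numpy as np

    def fake_quant(X, x_min, x_max, q_min=-128, q_max=127):
        """Fake quantization per the formulas above (asymmetric case)."""
        scale = (x_max - x_min) / (q_max - q_min)
        zp = np.floor(np.clip(q_min - x_min / scale, q_min, q_max) + 0.5)
        u_min = (q_min - zp) * scale
        u_max = (q_max - zp) * scale
        u_X = np.floor((np.clip(X, u_min, u_max) - u_min) / scale + 0.5)
        return u_X * scale + u_min

    X = np.array([-1.0, -0.3, 0.0, 0.4, 1.0], dtype=np.float32)
    print(fake_quant(X, x_min=-1.0, x_max=1.0))  # values snapped to the INT8 grid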
Args:
@@ -393,7 +393,7 @@ class Conv2dBnFoldQuantOneConv(Cell):
2D convolution which uses the convolution layer statistics once to calculate the BatchNorm-folded construct.
This part is a more detailed overview of the Conv2d operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
in_channels (int): The number of input channel :math:`C_{in}`.
@@ -594,7 +594,7 @@ class Conv2dBnFoldQuant(Cell):
2D convolution with the BatchNorm operation folded into the construct.
This part is a more detailed overview of the Conv2d operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
in_channels (int): The number of input channel :math:`C_{in}`.
@@ -783,7 +783,7 @@ class Conv2dBnWithoutFoldQuant(Cell):
2D convolution and batchnorm without fold, with a fake-quantized construct.
This part is a more detailed overview of the Conv2d operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
in_channels (int): The number of input channel :math:`C_{in}`.
@@ -899,7 +899,7 @@ class Conv2dQuant(Cell):
2D convolution with a fake-quantized operation layer.
This part is a more detailed overview of the Conv2d operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
in_channels (int): The number of input channel :math:`C_{in}`.
@@ -1010,7 +1010,7 @@ class DenseQuant(Cell):
The fully connected layer with a fake-quantized operation.
This part is a more detailed overview of the Dense operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
in_channels (int): The dimension of the input space.
@@ -1127,7 +1127,7 @@ class ActQuant(_QuantActivation):
Adds the fake-quantized operation to the end of the activation operation, by which the output of the
activation operation will be truncated. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
activation (Cell): Activation cell.
@@ -1196,7 +1196,7 @@ class TensorAddQuant(Cell):
Adds a fake-quantized operation after the TensorAdd operation.
This part is a more detailed overview of the TensorAdd operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
@@ -1249,7 +1249,7 @@ class MulQuant(Cell):
Adds a fake-quantized operation after the `Mul` operation.
This part is a more detailed overview of the `Mul` operation. For more details about quantization,
please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.
Args:
ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.


@@ -79,7 +79,7 @@ class ExponentialDecayLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
@@ -137,7 +137,7 @@ class NaturalExpDecayLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
@@ -196,7 +196,7 @@ class InverseDecayLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
@@ -244,7 +244,7 @@ class CosineDecayLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
@@ -311,7 +311,7 @@ class PolynomialDecayLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
@@ -381,7 +381,7 @@ class WarmUpLR(LearningRateSchedule):
Inputs:
Tensor. The current step number.
Outputs:
Tensor. The learning rate value for the current step.
Examples:
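
All of these schedules share one contract: given the current step number, return the
learning rate for that step. A hedged sketch of the linear warm-up rule (the standard
formula; consult the WarmUpLR source for the exact expression):

.. code-block:: python

    def warmup_lr(learning_rate, warmup_steps, current_step):
        """Linear warm-up: ramp toward `learning_rate` over `warmup_steps`."""
        return learning_rate * min(current_step, warmup_steps) / warmup_steps

    for step in (1, 50, 100, 200):
        print(step, warmup_lr(0.1, warmup_steps=100, current_step=step))
    # 1 -> 0.001, 50 -> 0.05, 100 -> 0.1, 200 -> 0.1 (flat after warm-up)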


@@ -85,17 +85,17 @@ class FTRL(Optimizer):
.. math::
\begin{array}{ll} \\
m_{t+1} = m_{t} + g^2 \\
u_{t+1} = u_{t} + g - \frac{m_{t+1}^{-p} - m_{t}^{-p}}{\alpha } * \omega_{t} \\
\omega_{t+1} =
\begin{cases}
\frac{(sign(u_{t+1}) * l1 - u_{t+1})}{\frac{m_{t+1}^{-p}}{\alpha } + 2 * l2 }
& \text{ if } |u_{t+1}| > l1 \\
0.0
& \text{ otherwise }
\end{cases}\\
\end{array}
:math:`m` represents `accum`, :math:`g` represents `grads`, :math:`t` represents the updating step,
:math:`u` represents `linear`, :math:`p` represents `lr_power`, :math:`\alpha` represents `learning_rate`,
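
A minimal NumPy sketch of this update for a single parameter tensor, following the
equations literally (illustrative only; the real optimizer also handles weight decay,
sparse gradients, and parameter groups):

.. code-block:: python

    import numpy as np

    def ftrl_step(w, m, u, g, lr=0.1, l1=0.01, l2=0.0, p=0.5):
        """One FTRL update per the formulas above."""
        m_new = m + g ** 2
        u_new = u + g - (m_new ** -p - m ** -p) / lr * w
        w_new = np.where(
            np.abs(u_new) > l1,
            (np.sign(u_new) * l1 - u_new) / (m_new ** -p / lr + 2 * l2),
            0.0,
        )
        return w_new, m_new, u_new

    w = np.array([0.5, -0.5])
    m = np.ones(2)
    u = np.zeros(2)
    g = np.array([0.1, -0.2])
    print(ftrl_step(w, m, u, g))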


@@ -57,17 +57,17 @@ class LARS(Optimizer):
.. math::
\begin{array}{ll} \\
\lambda = \frac{\theta \text{ * } || \omega || }{|| g_{t} || \text{ + } \delta \text{ * } || \omega || } \\
\lambda =
\begin{cases}
\min(\frac{\lambda}{\alpha }, 1)
& \text{ if } clip = True \\
\lambda
& \text{ otherwise }
\end{cases}\\
g_{t+1} = \lambda * (g_{t} + \delta * \omega)
\end{array}
:math:`\theta` represents `coefficient`, :math:`\omega` represents `parameters`, :math:`g` represents `gradients`,
:math:`t` represents the updating step, :math:`\delta` represents `weight_decay`,
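
A hedged NumPy sketch of the LARS trust-ratio scaling for one parameter tensor,
following the equations above; `alpha` stands for the base learning rate as in the
formula, and the real optimizer wraps another optimizer and applies this per layer:

.. code-block:: python

    import numpy as np

    def lars_scale(w, g, coefficient=0.001, weight_decay=0.0, alpha=0.1, clip=True):
        """Scale the gradient by the LARS trust ratio, per the formulas above."""
        w_norm = np.linalg.norm(w)
        g_norm = np.linalg.norm(g)
        lam = coefficient * w_norm / (g_norm + weight_decay * w_norm)
        if clip:
            lam = min(lam / alpha, 1.0)
        return lam * (g + weight_decay * w)

    w = np.array([1.0, 2.0, 2.0])
    g = np.array([0.1, 0.1, 0.2])
    print(lars_scale(w, g))  # gradient rescaled by the layer-wise trust ratio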