!27052 fix new initializer and optimizer

Merge pull request !27052 from wanyiming/fix_init_opt
i-robot 2021-12-03 09:25:50 +00:00 committed by Gitee
commit 5bfb306f77
3 changed files with 20 additions and 18 deletions

View File

@@ -434,6 +434,7 @@ class Sparse(Initializer):
_assignment(arr, data)
@_register()
class Dirac(Initializer):
"""Initialize input tensor with the Dirac delta function. It tries to preserves the identity of
input for convolution layers. For group convolution, each group of channels will be preserved respectively.
@@ -442,8 +443,8 @@ class Dirac(Initializer):
groups (int): The number of group in convolution layer. Default: 1.
Raises:
- ValueError: If the value of group is not in [3, 4, 5] or the first dimension of the initialized
- tensor cannot be divisible by group.
+ ValueError: If the value of group is not in [3, 4, 5].
+ ValueError: The first dimension of the initialized tensor cannot be divisible by group.
Examples:
>>> import mindspore
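(Editor's note: a minimal usage sketch of the Dirac initializer described in this hunk, not part of the diff. It assumes Dirac is importable from mindspore.common.initializer, as the @_register() decorator above suggests, and that the groups argument matches the docstring.)

import mindspore
from mindspore.common.initializer import initializer, Dirac

# 4-D Conv2d weight laid out (out_channels, in_channels, kH, kW); a 4-D shape
# satisfies the "[3, 4, 5]" dimension constraint, and out_channels=6 is
# divisible by groups=2 as the second ValueError requires.
weight = initializer(Dirac(groups=2), [6, 4, 3, 3], mindspore.float32)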
@@ -545,7 +546,7 @@ class VarianceScaling(Initializer):
distribution(str): The type of distribution chose to sample values. Default: 'truncated_normal'.
Raises:
- ValueError: If scale is not greater than 0..
+ ValueError: If scale is not greater than 0.
ValueError: If mode is not fan_in, fan_out or fan_avg.
ValueError: If distribution is not uniform, truncated_normal or untruncated_normal.
@@ -553,7 +554,7 @@ class VarianceScaling(Initializer):
>>> import mindspore
>>> from mindspore.common.initializer import initializer, VarianceScaling
>>> tensor1 = initializer(VarianceScaling(scale=1.0, mode='fan_out',
- >>> distribution='untruncated_normal'), [2, 3], mindspore.float32)
+ ... distribution='untruncated_normal'), [2, 3], mindspore.float32)
>>> tensor2 = initializer('varianceScaling', [2, 3], mindspore.float32)
"""
def __init__(self, scale=1.0, mode='fan_in', distribution='truncated_normal'):
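(Editor's note: a rough NumPy sketch of the fan-based scaling that the scale, mode and distribution arguments control, under common conventions; it is not the library's implementation and it omits the truncation step of 'truncated_normal'.)

import numpy as np

def variance_scaling_sketch(shape, scale=1.0, mode='fan_in', distribution='truncated_normal'):
    # For a 2-D weight laid out (out, in): fan_in = in, fan_out = out.
    fan_in, fan_out = shape[1], shape[0]
    fan = {'fan_in': fan_in, 'fan_out': fan_out, 'fan_avg': (fan_in + fan_out) / 2.0}[mode]
    scale = scale / max(1.0, fan)
    if distribution == 'uniform':
        limit = np.sqrt(3.0 * scale)   # variance of U(-limit, limit) equals the scaled value
        return np.random.uniform(-limit, limit, shape)
    stddev = np.sqrt(scale)            # normal variants draw with this standard deviation
    return np.random.normal(0.0, stddev, shape)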

View File

@@ -52,8 +52,8 @@ class ASGD(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
- `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
- "order_params" are the keys can be parsed.
+ `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
+ "order_params" are the keys can be parsed.
- params: Required. Parameters in current group. The value must be a list of `Parameter`.
@@ -90,7 +90,7 @@ class ASGD(Optimizer):
lambd (float): The decay term. Default: 1e-4.
alpha (float): The power for eta update. Default: 0.75.
t0 (float): The point of starting averaging. Default: 1e6.
- weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+ weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
Inputs:
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -168,6 +168,7 @@ class ASGD(Optimizer):
for index, (grad, param, mu, eta, ax) in enumerate(zip(gradients, self.parameters, self.mu, self.eta, self.ax)):
lr = lrs[index] if self.is_group_lr else lrs
lr = self.squeeze(lr)
if self.step == 1.:
self.assign(eta, lr)
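(Editor's note: a minimal construction sketch for the optimizer documented above, not part of the diff. It assumes ASGD is exposed as mindspore.nn.ASGD with the defaults listed in the docstring.)

import mindspore.nn as nn

net = nn.Dense(16, 4)  # hypothetical network standing in for any Cell
opt = nn.ASGD(net.trainable_params(), learning_rate=0.1,
              lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0.0)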

View File

@@ -34,13 +34,13 @@ class Rprop(Optimizer):
.. math::
\begin{gather*}
- &\hspace{0mm} \textbf{if} \: g_{t-1} g_t > 0 \\
- &\hspace{5mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
+ &\hspace{-10mm} \textbf{if} \: g_{t-1} g_t > 0 \\
+ &\hspace{25mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
&\hspace{0mm} \textbf{else if} \: g_{t-1} g_t < 0 \\
- &\hspace{5mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
- &\hspace{mm} \textbf{else} \: \\
- &\hspace{5mm} \Delta_t \leftarrow \Delta_{t-1} \\
- &\hspace{0mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
+ &\hspace{25mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
+ &\hspace{-25mm} \textbf{else} \: \\
+ &\hspace{-5mm} \Delta_t \leftarrow \Delta_{t-1} \\
+ &\hspace{15mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
\end{gather*}
:math:`\Delta_{min/max}` represents the min/max step size, :math:`\eta_{+/-}` represents the factors of
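(Editor's note: the update rule above, restated as a per-element NumPy sketch; this is an illustration of the formula, not the optimizer's actual implementation.)

import numpy as np

def rprop_step(w, grad, prev_grad, delta, etas=(0.5, 1.2), step_sizes=(1e-6, 50.)):
    eta_minus, eta_plus = etas
    delta_min, delta_max = step_sizes
    same_sign = grad * prev_grad
    # Grow the step size where the gradient kept its sign, shrink it where the sign
    # flipped, and leave it unchanged where the product is zero.
    delta = np.where(same_sign > 0, np.minimum(delta * eta_plus, delta_max), delta)
    delta = np.where(same_sign < 0, np.maximum(delta * eta_minus, delta_min), delta)
    # Step against the gradient sign by the per-element step size.
    return w - delta * np.sign(grad), delta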
@@ -53,8 +53,8 @@ class Rprop(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
- `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
- "order_params" are the keys can be parsed.
+ `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
+ "order_params" are the keys can be parsed.
- params: Required. Parameters in current group. The value must be a list of `Parameter`.
@@ -91,7 +91,7 @@ class Rprop(Optimizer):
etas (tuple[float, float]): The factor of multiplicative increasing or
descreasing(etaminus, etaplus).
step_sizes(tuple[float, float]): The allowed minimal and maximal step size(min_step_sizes, max_step_size).
- weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+ weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
Inputs:
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -104,7 +104,7 @@ class Rprop(Optimizer):
TypeError: If element of `parameters` is neither Parameter nor dict.
TypeError: If `step_sizes` or `etas` is not a tuple.
ValueError: If maximal step size is less than minimal step size.
- ValueError: If the length of `step_sizes` or `ets` is not equal to 2.
+ ValueError: If the length of `step_sizes` or `etas` is not equal to 2.
TypeError: If the element in `etas` or `step_sizes` is not a float.
ValueError: If `etaminus` is not in the range of (0, 1) or `etaplus` is not greater than 1.
TypeError: If `weight_decay` is neither float nor int.
@@ -136,7 +136,7 @@ class Rprop(Optimizer):
"""
@opt_init_args_register
- def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.1):
+ def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.):
super(Rprop, self).__init__(learning_rate, params, weight_decay)
if not isinstance(etas, tuple):
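(Editor's note: a minimal construction sketch reflecting the corrected default above, not part of the diff. It assumes Rprop is exposed as mindspore.nn.Rprop with the defaults shown in the fixed __init__.)

import mindspore.nn as nn

net = nn.Dense(16, 4)  # hypothetical network
# weight_decay now defaults to 0. rather than 0.1, per the fix above.
opt = nn.Rprop(net.trainable_params(), learning_rate=0.1,
               etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.)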