forked from mindspore-Ecosystem/mindspore
!27052 fix new initializer and optimizer
Merge pull request !27052 from wanyiming/fix_init_opt
Commit 5bfb306f77
@@ -434,6 +434,7 @@ class Sparse(Initializer):
         _assignment(arr, data)


+@_register()
 class Dirac(Initializer):
     """Initialize input tensor with the Dirac delta function. It tries to preserve the identity of
     input for convolution layers. For group convolution, each group of channels will be preserved respectively.
@@ -442,8 +443,8 @@ class Dirac(Initializer):
         groups (int): The number of group in convolution layer. Default: 1.

     Raises:
-        ValueError: If the value of group is not in [3, 4, 5] or the first dimension of the initialized
-            tensor cannot be divisible by group.
+        ValueError: If the value of group is not in [3, 4, 5].
+        ValueError: The first dimension of the initialized tensor cannot be divisible by group.

     Examples:
         >>> import mindspore
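For orientation, the docstring's example section (truncated in the hunk above) amounts to a call like the sketch below. The string alias 'dirac' and the concrete shapes are my assumptions, not part of this diff:

    import mindspore
    from mindspore.common.initializer import initializer, Dirac

    # Dirac-delta init for a rank-3 (out_channels, in_channels, kernel) weight;
    # with groups=2, each half of the output channels preserves its own inputs.
    w1 = initializer(Dirac(groups=2), [6, 2, 3], mindspore.float32)
    # Assumed string alias, mirroring the 'varianceScaling' alias shown further down.
    w2 = initializer('dirac', [3, 3, 3], mindspore.float32)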
@@ -545,7 +546,7 @@ class VarianceScaling(Initializer):
         distribution(str): The type of distribution chosen to sample values. Default: 'truncated_normal'.

     Raises:
-        ValueError: If scale is not greater than 0..
+        ValueError: If scale is not greater than 0.
         ValueError: If mode is not fan_in, fan_out or fan_avg.
         ValueError: If distribution is not uniform, truncated_normal or untruncated_normal.

@@ -553,7 +554,7 @@ class VarianceScaling(Initializer):
         >>> import mindspore
         >>> from mindspore.common.initializer import initializer, VarianceScaling
         >>> tensor1 = initializer(VarianceScaling(scale=1.0, mode='fan_out',
-        >>> distribution='untruncated_normal'), [2, 3], mindspore.float32)
+        ... distribution='untruncated_normal'), [2, 3], mindspore.float32)
         >>> tensor2 = initializer('varianceScaling', [2, 3], mindspore.float32)
     """
     def __init__(self, scale=1.0, mode='fan_in', distribution='truncated_normal'):
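As background for the VarianceScaling text above, the scaling rule can be paraphrased in a few lines of NumPy. This is an illustrative reimplementation under my own assumptions about the fan computation, not MindSpore's code:

    import numpy as np

    def variance_scaling(shape, scale=1.0, mode='fan_in', distribution='truncated_normal'):
        # fan_in/fan_out: channel dims times receptive-field size (assumed convention).
        receptive = int(np.prod(shape[2:])) if len(shape) > 2 else 1
        fan_in, fan_out = shape[1] * receptive, shape[0] * receptive
        n = {'fan_in': fan_in, 'fan_out': fan_out, 'fan_avg': (fan_in + fan_out) / 2}[mode]
        if distribution == 'uniform':
            limit = np.sqrt(3.0 * scale / n)
            return np.random.uniform(-limit, limit, shape)
        std = np.sqrt(scale / n)
        if distribution == 'truncated_normal':
            # Proper two-sigma truncated sampling is simplified to a clip here.
            return np.clip(np.random.normal(0.0, std, shape), -2 * std, 2 * std)
        return np.random.normal(0.0, std, shape)  # 'untruncated_normal'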
@@ -52,8 +52,8 @@ class ASGD(Optimizer):

     Args:
         params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
-        `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
-        "order_params" are the keys can be parsed.
+            `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
+            "order_params" are the keys can be parsed.

         - params: Required. Parameters in current group. The value must be a list of `Parameter`.

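The re-indented paragraph above describes the grouped-parameter format. A minimal sketch of that list-of-dict layout follows; the network `net` and the 'conv' name filter are hypothetical:

    from mindspore import nn

    conv_params = [p for p in net.trainable_params() if 'conv' in p.name]
    other_params = [p for p in net.trainable_params() if 'conv' not in p.name]
    group_params = [
        {'params': conv_params, 'weight_decay': 0.01, 'grad_centralization': True},
        {'params': other_params, 'lr': 0.01},
        {'order_params': net.trainable_params()},  # fixes the parameter update order
    ]
    optim = nn.ASGD(group_params, learning_rate=0.1)  # group keys override the defaults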
@@ -90,7 +90,7 @@ class ASGD(Optimizer):
         lambd (float): The decay term. Default: 1e-4.
         alpha (float): The power for eta update. Default: 0.75.
         t0 (float): The point of starting averaging. Default: 1e6.
-        weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+        weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.

     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -168,6 +168,7 @@ class ASGD(Optimizer):

         for index, (grad, param, mu, eta, ax) in enumerate(zip(gradients, self.parameters, self.mu, self.eta, self.ax)):
+            lr = lrs[index] if self.is_group_lr else lrs
             lr = self.squeeze(lr)

             if self.step == 1.:
                 self.assign(eta, lr)
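The loop above is ASGD's per-parameter update; the added line picks the per-group learning rate when group lr is configured. For intuition, a rough NumPy sketch of the averaged-SGD recurrence behind eta, mu, and ax (an illustration, not MindSpore's kernel):

    import numpy as np

    def asgd_step(param, grad, state, lr, lambd=1e-4, alpha=0.75, t0=1e6):
        # Classic averaged-SGD recurrence (Polyak-Ruppert style averaging).
        eta, mu, ax, t = state['eta'], state['mu'], state['ax'], state['t'] + 1
        param = param * (1 - lambd * eta) - eta * grad  # decayed gradient step
        ax = param.copy() if mu == 1 else ax + mu * (param - ax)  # running average
        eta = lr / (1 + lambd * lr * t) ** alpha  # shrink the step size over time
        mu = 1 / max(1, t - t0)                   # start averaging after t0 steps
        state.update(eta=eta, mu=mu, ax=ax, t=t)
        return param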
@@ -34,13 +34,13 @@ class Rprop(Optimizer):

     .. math::
         \begin{gather*}
-            &\hspace{0mm} \textbf{if} \: g_{t-1} g_t > 0 \\
-            &\hspace{5mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
+            &\hspace{-10mm} \textbf{if} \: g_{t-1} g_t > 0 \\
+            &\hspace{25mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
             &\hspace{0mm} \textbf{else if} \: g_{t-1} g_t < 0 \\
-            &\hspace{5mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
-            &\hspace{mm} \textbf{else} \: \\
-            &\hspace{5mm} \Delta_t \leftarrow \Delta_{t-1} \\
-            &\hspace{0mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
+            &\hspace{25mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
+            &\hspace{-25mm} \textbf{else} \: \\
+            &\hspace{-5mm} \Delta_t \leftarrow \Delta_{t-1} \\
+            &\hspace{15mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
         \end{gather*}

     :math:`\Delta_{min/max}` represents the min/max step size, :math:`\eta_{+/-}` represents the factors of
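The corrected math above is the standard Rprop rule; in code form it reads roughly as the sketch below (illustrative NumPy, mirroring the docstring's symbols):

    import numpy as np

    def rprop_step(w, g, prev_g, delta, etas=(0.5, 1.2), step_sizes=(1e-6, 50.)):
        # Grow the step where the gradient sign is stable, shrink it where the
        # sign flips, then move against the current gradient's sign.
        etaminus, etaplus = etas
        step_min, step_max = step_sizes
        same_sign = g * prev_g
        delta = np.where(same_sign > 0, np.minimum(delta * etaplus, step_max), delta)
        delta = np.where(same_sign < 0, np.maximum(delta * etaminus, step_min), delta)
        w = w - delta * np.sign(g)
        return w, delta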
@@ -53,8 +53,8 @@ class Rprop(Optimizer):

     Args:
         params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
-        `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
-        "order_params" are the keys can be parsed.
+            `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
+            "order_params" are the keys can be parsed.

         - params: Required. Parameters in current group. The value must be a list of `Parameter`.

@@ -91,7 +91,7 @@ class Rprop(Optimizer):
         etas (tuple[float, float]): The factor of multiplicative increasing or
             decreasing(etaminus, etaplus).
         step_sizes(tuple[float, float]): The allowed minimal and maximal step size(min_step_size, max_step_size).
-        weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+        weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.

     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -104,7 +104,7 @@ class Rprop(Optimizer):
         TypeError: If element of `parameters` is neither Parameter nor dict.
         TypeError: If `step_sizes` or `etas` is not a tuple.
         ValueError: If maximal step size is less than minimal step size.
-        ValueError: If the length of `step_sizes` or `ets` is not equal to 2.
+        ValueError: If the length of `step_sizes` or `etas` is not equal to 2.
         TypeError: If the element in `etas` or `step_sizes` is not a float.
         ValueError: If `etaminus` is not in the range of (0, 1) or `etaplus` is not greater than 1.
         TypeError: If `weight_decay` is neither float nor int.
@@ -136,7 +136,7 @@ class Rprop(Optimizer):
     """

     @opt_init_args_register
-    def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.1):
+    def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.):

         super(Rprop, self).__init__(learning_rate, params, weight_decay)
         if not isinstance(etas, tuple):
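Note the behavioral change in the last hunk: weight_decay now defaults to 0. instead of 0.1, so weight decay is off unless requested. A hypothetical call site, assuming `net` is a constructed Cell and Rprop is exposed under mindspore.nn:

    from mindspore import nn

    optim = nn.Rprop(net.trainable_params(), learning_rate=0.1)  # weight_decay=0. after this fix
    optim_old = nn.Rprop(net.trainable_params(), learning_rate=0.1, weight_decay=0.1)  # old default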