forked from mindspore-Ecosystem/mindspore
!27052 fix new initializer and optimizer
Merge pull request !27052 from wanyiming/fix_init_opt
This commit is contained in:
commit 5bfb306f77
@@ -434,6 +434,7 @@ class Sparse(Initializer):
         _assignment(arr, data)
 
 
+@_register()
 class Dirac(Initializer):
     """Initialize input tensor with the Dirac delta function. It tries to preserves the identity of
     input for convolution layers. For group convolution, each group of channels will be preserved respectively.
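Note on the fix above: `_register()` is this module's decorator for adding an initializer class to the alias registry, so the string form of `initializer()` can find it. A minimal sketch of both call forms, assuming the decorator registers the lowercase class name as the alias:

import mindspore
from mindspore.common.initializer import initializer, Dirac

# By class instance (works regardless of registration).
w1 = initializer(Dirac(groups=1), [6, 2, 3, 3], mindspore.float32)
# By string alias, which the @_register() added above is assumed to enable.
w2 = initializer('dirac', [6, 2, 3, 3], mindspore.float32)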
@@ -442,8 +443,8 @@ class Dirac(Initializer):
         groups (int): The number of group in convolution layer. Default: 1.
 
     Raises:
-        ValueError: If the value of group is not in [3, 4, 5] or the first dimension of the initialized
-            tensor cannot be divisible by group.
+        ValueError: If the value of group is not in [3, 4, 5].
+        ValueError: The first dimension of the initialized tensor cannot be divisible by group.
 
     Examples:
         >>> import mindspore
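The reworded Raises entries describe two separate failure modes. A hedged illustration (the shapes here are made up for the example):

import mindspore
from mindspore.common.initializer import initializer, Dirac

# OK: 4-D shape, and the first dimension 6 is divisible by groups=2.
w = initializer(Dirac(groups=2), [6, 2, 3, 3], mindspore.float32)
# Expected ValueError: the first dimension 5 is not divisible by groups=2.
# w_bad = initializer(Dirac(groups=2), [5, 2, 3, 3], mindspore.float32)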
@@ -545,7 +546,7 @@ class VarianceScaling(Initializer):
         distribution(str): The type of distribution chose to sample values. Default: 'truncated_normal'.
 
     Raises:
-        ValueError: If scale is not greater than 0..
+        ValueError: If scale is not greater than 0.
         ValueError: If mode is not fan_in, fan_out or fan_avg.
         ValueError: If distribution is not uniform, truncated_normal or untruncated_normal.
 
@@ -553,7 +554,7 @@ class VarianceScaling(Initializer):
         >>> import mindspore
         >>> from mindspore.common.initializer import initializer, VarianceScaling
         >>> tensor1 = initializer(VarianceScaling(scale=1.0, mode='fan_out',
->>> distribution='untruncated_normal'), [2, 3], mindspore.float32)
+        ... distribution='untruncated_normal'), [2, 3], mindspore.float32)
         >>> tensor2 = initializer('varianceScaling', [2, 3], mindspore.float32)
         """
     def __init__(self, scale=1.0, mode='fan_in', distribution='truncated_normal'):
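The doctest fix above only swaps the second `>>>` prompt for the `...` continuation prompt, so the two-line call parses as a single statement. For background, variance scaling draws values whose spread is derived from the fan counts; a NumPy sketch under the usual rule (this mirrors the common Keras/TensorFlow definition and is an assumption about MindSpore's internals, with truncation omitted):

import numpy as np

def variance_scaling(shape, scale=1.0, mode='fan_in', distribution='untruncated_normal'):
    # Dense-layer convention: shape = (fan_out, fan_in).
    fan_in, fan_out = shape[1], shape[0]
    n = {'fan_in': fan_in, 'fan_out': fan_out, 'fan_avg': (fan_in + fan_out) / 2.0}[mode]
    std = np.sqrt(scale / n)
    if distribution == 'uniform':
        limit = np.sqrt(3.0) * std  # uniform bounds giving the same variance
        return np.random.uniform(-limit, limit, size=shape)
    return np.random.normal(0.0, std, size=shape)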
@@ -52,8 +52,8 @@ class ASGD(Optimizer):
 
     Args:
         params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
             `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
             "order_params" are the keys can be parsed.
 
             - params: Required. Parameters in current group. The value must be a list of `Parameter`.
 
@@ -90,7 +90,7 @@ class ASGD(Optimizer):
         lambd (float): The decay term. Default: 1e-4.
         alpha (float): The power for eta update. Default: 0.75.
         t0 (float): The point of starting averaging. Default: 1e6.
-        weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+        weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
 
     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
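The docstring change widens the documented type of `weight_decay` to match what the optimizer accepts. A usage sketch:

from mindspore import nn

net = nn.Dense(3, 2)  # any Cell with trainable parameters works here
# weight_decay is documented as int or float; 0 and 0.0 behave the same.
opt = nn.ASGD(net.trainable_params(), learning_rate=0.1,
              lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0)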
@@ -168,6 +168,7 @@ class ASGD(Optimizer):
 
         for index, (grad, param, mu, eta, ax) in enumerate(zip(gradients, self.parameters, self.mu, self.eta, self.ax)):
             lr = lrs[index] if self.is_group_lr else lrs
+            lr = self.squeeze(lr)
 
             if self.step == 1.:
                 self.assign(eta, lr)
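The added `lr = self.squeeze(lr)` suggests that with a dynamic learning rate, `lrs` holds a 1-element tensor rather than a scalar, which would make the later `self.assign(eta, lr)` shape-mismatched. A standalone sketch of the shape change, using the public ops API (the schedule-returns-shape-(1,) premise is an assumption):

import mindspore as ms
from mindspore import ops

lr = ms.Tensor([0.1], ms.float32)  # shape (1,), as a dynamic-lr getter may return
squeeze = ops.Squeeze()
scalar_lr = squeeze(lr)            # shape (), assignable to a scalar Parameter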
@@ -34,13 +34,13 @@ class Rprop(Optimizer):
 
     .. math::
         \begin{gather*}
-            &\hspace{0mm} \textbf{if} \: g_{t-1} g_t > 0 \\
-            &\hspace{5mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
+            &\hspace{-10mm} \textbf{if} \: g_{t-1} g_t > 0 \\
+            &\hspace{25mm} \Delta_t \leftarrow \mathrm{min}(\Delta_{t-1} \eta_{+}, \Delta_{max}) \\
             &\hspace{0mm} \textbf{else if} \: g_{t-1} g_t < 0 \\
-            &\hspace{5mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
-            &\hspace{mm} \textbf{else} \: \\
-            &\hspace{5mm} \Delta_t \leftarrow \Delta_{t-1} \\
-            &\hspace{0mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
+            &\hspace{25mm} \Delta_t \leftarrow \mathrm{max}(\Delta_{t-1} \eta_{-}, \Delta_{min}) \\
+            &\hspace{-25mm} \textbf{else} \: \\
+            &\hspace{-5mm} \Delta_t \leftarrow \Delta_{t-1} \\
+            &\hspace{15mm} w_{t} \leftarrow w_{t-1}- \Delta_{t} \mathrm{sign}(g_t) \\
         \end{gather*}
 
     :math:`\Delta_{min/max}` represents the min/max step size, :math:`\eta_{+/-}` represents the factors of
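A minimal NumPy sketch of the update rule displayed above (it follows only the formula as written; full Rprop implementations also zero the stored gradient after a sign flip, which the formula does not show):

import numpy as np

def rprop_step(w, g, g_prev, delta, etas=(0.5, 1.2), step_sizes=(1e-6, 50.)):
    eta_minus, eta_plus = etas
    delta_min, delta_max = step_sizes
    agree = g_prev * g
    # Gradient kept its sign: grow the per-weight step, capped at delta_max.
    delta = np.where(agree > 0, np.minimum(delta * eta_plus, delta_max), delta)
    # Gradient flipped sign: shrink the per-weight step, floored at delta_min.
    delta = np.where(agree < 0, np.maximum(delta * eta_minus, delta_min), delta)
    # Move each weight against its gradient by the adapted step size.
    w = w - delta * np.sign(g)
    return w, delta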
@@ -53,8 +53,8 @@ class Rprop(Optimizer):
 
     Args:
         params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
             `parameters` is a list of `dict`, the "params", "lr", "weight_decay", "grad_centralization" and
             "order_params" are the keys can be parsed.
 
             - params: Required. Parameters in current group. The value must be a list of `Parameter`.
 
@@ -91,7 +91,7 @@ class Rprop(Optimizer):
         etas (tuple[float, float]): The factor of multiplicative increasing or
             descreasing(etaminus, etaplus).
         step_sizes(tuple[float, float]): The allowed minimal and maximal step size(min_step_sizes, max_step_size).
-        weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
+        weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
 
     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -104,7 +104,7 @@ class Rprop(Optimizer):
         TypeError: If element of `parameters` is neither Parameter nor dict.
         TypeError: If `step_sizes` or `etas` is not a tuple.
         ValueError: If maximal step size is less than minimal step size.
-        ValueError: If the length of `step_sizes` or `ets` is not equal to 2.
+        ValueError: If the length of `step_sizes` or `etas` is not equal to 2.
         TypeError: If the element in `etas` or `step_sizes` is not a float.
         ValueError: If `etaminus` is not in the range of (0, 1) or `etaplus` is not greater than 1.
         TypeError: If `weight_decay` is neither float nor int.
@@ -136,7 +136,7 @@ class Rprop(Optimizer):
     """
 
     @opt_init_args_register
-    def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.1):
+    def __init__(self, params, learning_rate=0.1, etas=(0.5, 1.2), step_sizes=(1e-6, 50.), weight_decay=0.):
 
         super(Rprop, self).__init__(learning_rate, params, weight_decay)
         if not isinstance(etas, tuple):
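With the corrected default, constructing `Rprop` without `weight_decay` no longer applies a silent 0.1 L2 penalty. A usage sketch:

from mindspore import nn

net = nn.Dense(3, 2)  # any Cell with trainable parameters works here
# weight_decay now defaults to 0.: no L2 penalty unless requested explicitly.
opt = nn.Rprop(net.trainable_params(), learning_rate=0.1,
               etas=(0.5, 1.2), step_sizes=(1e-6, 50.))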