!19196 fix print format error in nn.layer

Merge pull request !19196 from wangnan39/master

commit 9b531e7877
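The conv-layer hunks below are all the same one-character fix: a Python string literal split across backslash-continued lines is concatenated with no separator, so each fragment needs its own trailing space or adjacent fields run together in the printed layer repr. A minimal plain-Python sketch of the symptom (the channel numbers are made up for illustration):

    # Without a trailing space on the first fragment, fields run together.
    before = 'input_channels={}, output_channels={}, kernel_size={},' \
             'stride={}'.format(120, 240, (4, 4), (1, 1))
    print(before)  # ...kernel_size=(4, 4),stride=(1, 1)

    # With the space restored, every field is separated as intended.
    after = 'input_channels={}, output_channels={}, kernel_size={}, ' \
            'stride={}'.format(120, 240, (4, 4), (1, 1))
    print(after)   # ...kernel_size=(4, 4), stride=(1, 1)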
@@ -264,8 +264,8 @@ class Conv2d(_Conv):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
             'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}, format={}'.format(
             self.in_channels,
@@ -456,9 +456,9 @@ class Conv1d(_Conv):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
-            'group={}, has_bias={},' \
+            'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}'.format(
             self.in_channels,
             self.out_channels,
@@ -639,9 +639,9 @@ class Conv3d(_Conv):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
-            'group={}, has_bias={}' \
+            'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}, format={}'.format(
             self.in_channels,
             self.out_channels,
@@ -816,9 +816,9 @@ class Conv3dTranspose(_Conv):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
-            'group={}, has_bias={},' \
+            'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}'.format(self.in_channels,
                                                   self.out_channels,
                                                   self.kernel_size,
@@ -1018,9 +1018,9 @@ class Conv2dTranspose(_Conv):
         return self.conv2d_transpose(x, self.weight, (n, self.out_channels, h_out, w_out))
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
-            'group={}, has_bias={},' \
+            'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}'.format(self.in_channels,
                                                   self.out_channels,
                                                   self.kernel_size,
@@ -1207,9 +1207,9 @@ class Conv1dTranspose(_Conv):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, ' \
             'stride={}, pad_mode={}, padding={}, dilation={}, ' \
-            'group={}, has_bias={},' \
+            'group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}'.format(self.in_channels,
                                                   self.out_channels,
                                                   self.kernel_size,
@@ -512,8 +512,8 @@ class Conv2dThor(_ConvThor):
         return output
 
     def extend_repr(self):
-        s = 'input_channels={}, output_channels={}, kernel_size={},' 'stride={}, ' \
-            'pad_mode={}, padding={}, dilation={}, ' 'group={}, has_bias={},' \
+        s = 'input_channels={}, output_channels={}, kernel_size={}, stride={}, ' \
+            'pad_mode={}, padding={}, dilation={}, group={}, has_bias={}, ' \
             'weight_init={}, bias_init={}'.format(self.in_channels, self.out_channels, self.kernel_size,
                                                   self.stride, self.pad_mode, self.padding, self.dilation,
                                                   self.group, self.has_bias, self.weight_init, self.bias_init)
@@ -198,13 +198,14 @@ class Adam(Optimizer):
             m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\
             v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\
             l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\
-            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \eps}
+            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon}
         \end{array}
 
     :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`,
     :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent
     `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent
-    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`.
+    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`,
+    :math:`\epsilon` represents `eps`.
 
     Note:
         When separating parameter groups, the weight decay in each group will be applied on the parameters if the
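For reference, a minimal NumPy sketch of the update rule the corrected docstring describes — an illustration of the math above, not MindSpore's implementation; the function name and defaults are invented for the example:

    import numpy as np

    def adam_step(w, m, v, g, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        """One Adam update; m and v are the moment estimates, t the 1-based step."""
        m = beta1 * m + (1 - beta1) * g                      # m_{t+1}
        v = beta2 * v + (1 - beta2) * g * g                  # v_{t+1}
        l = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # scaling factor l
        w = w - l * m / (np.sqrt(v) + eps)                   # eps is the corrected \epsilon
        return w, m, v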
@@ -380,9 +381,9 @@ class AdamWeightDecay(Optimizer):
             update = \frac{m_{t+1}}{\sqrt{v_{t+1}} + eps} \\
             update =
             \begin{cases}
-                update + \weight\_decay * w_{t}
-                    & \text{ if } \weight\_decay > 0 \\
-                \update
+                update + weight\_decay * w_{t}
+                    & \text{ if } weight\_decay > 0 \\
+                update
                     & \text{ otherwise }
             \end{cases} \\
             w_{t+1} = w_{t} - lr * update
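Similarly, a small sketch of the corrected AdamWeightDecay cases: the decoupled decay term is added to the normalized update before the learning-rate step. The helper below is hypothetical and assumes m and v have already been updated as in Adam:

    import numpy as np

    def adamw_update(w, m, v, lr, eps=1e-8, weight_decay=0.01):
        """Apply the update / weight_decay cases from the corrected docstring."""
        update = m / (np.sqrt(v) + eps)
        if weight_decay > 0:
            update = update + weight_decay * w  # the weight_decay > 0 branch
        return w - lr * update                  # w_{t+1} = w_{t} - lr * update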
@@ -515,13 +516,14 @@ class AdamOffload(Optimizer):
             m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\
             v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\
             l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\
-            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \eps}
+            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon}
         \end{array}
 
     :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`,
     :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent
     `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent
-    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`.
+    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`,
+    :math:`\epsilon` represents `eps`.
 
     Note:
         This optimizer only supports `GRAPH_MODE` currently.
@@ -117,13 +117,14 @@ class LazyAdam(Optimizer):
             m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\
             v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\
             l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\
-            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \eps}
+            w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon}
         \end{array}
 
     :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`,
     :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent
     `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent
-    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`.
+    `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`,
+    :math:`\epsilon` represents `eps`.
 
     Note:
         When separating parameter groups, the weight decay in each group will be applied on the parameters if the
@@ -108,7 +108,7 @@ class _ConvVariational(_Conv):
         return outputs
 
     def extend_repr(self):
         s = 'in_channels={}, out_channels={}, kernel_size={}, stride={}, pad_mode={}, ' \
             'padding={}, dilation={}, group={}, weight_mean={}, weight_std={}, has_bias={}' \
             .format(self.in_channels, self.out_channels, self.kernel_size, self.stride, self.pad_mode, self.padding,
                     self.dilation, self.group, self.weight_posterior.mean, self.weight_posterior.untransformed_std,
@@ -342,7 +342,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
             this function again to make modification, and sens needs to be of type Tensor.
 
         Inputs:
-            - **sens**(Tensor) - The new sense whose shape and type are the same with original `scale_sense`.
+            - **sens** (Tensor) - The new sense whose shape and type are the same with original `scale_sense`.
         """
         if self.scale_sense and isinstance(sens, Tensor):
             self.scale_sense.set_data(sens)
@@ -360,11 +360,11 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
 
         Inputs:
             - **pre_cond** (Tensor) - A precondition for starting overflow detection. It determines the executing order
               of overflow state clearing and prior processions. It makes sure that the function 'start_overflow'
               clears status after finishing the process of precondition.
             - **compute_input** (object) - The input of subsequent process. Overflow detection should be performed on a
               certain computation. Set `compute_input` as the input of the computation, to ensure overflow status is
               cleared before executing the computation.
 
         Outputs:
             Tuple[object, object], the first value is False for GPU backend, while it is a instance of
@@ -391,8 +391,8 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
         Inputs:
             - **status** (object) - A status instance used to detect the overflow.
             - **compute_output** - Overflow detection should be performed on a certain computation. Set `compute_output`
               as the output of the computation, to ensure overflow status is acquired before executing the
               computation.
 
         Outputs:
             bool, whether the overflow occurs or not.
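To make the overflow-detection flow described in these docstrings concrete, here is a conceptual plain-NumPy sketch of dynamic loss scaling — not the MindSpore cell itself; grad_fn, the learning rate, and the skip-on-overflow policy are placeholder assumptions:

    import numpy as np

    def scaled_train_step(w, grad_fn, scale, lr=0.01):
        """Skip the parameter update when the scaled gradients overflow."""
        grads = grad_fn(w) * scale                 # gradients of the loss scaled by `scale`
        overflow = not np.all(np.isfinite(grads))  # analogue of the overflow status check
        if not overflow:
            w = w - lr * grads / scale             # unscale, then apply the step
        return w, overflow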