forked from mindspore-Ecosystem/mindspore
!9973 update the doc string of some operations.
From: @wangshuide2020 Reviewed-by: @liangchenghui,@wuxuejian Signed-off-by: @liangchenghui
commit 9eb6fb01bc
@@ -34,7 +34,7 @@ abs_ = P.Abs()
 def mean(x, axis=(), keep_dims=False):
     """
-    Reduce a dimension of a tensor by averaging all elements in the dimension.
+    Reduces a dimension of a tensor by averaging all elements in the dimension.
 
     Args:
         axis (Union[None, int, tuple(int)]): Dimensions of reduction,
@@ -338,7 +338,7 @@ class Tensor(Tensor_):
     def mean(self, axis=(), keep_dims=False):
         """
-        Reduce a dimension of a tensor by averaging all elements in the dimension.
+        Reduces a dimension of a tensor by averaging all elements in the dimension.
 
         Args:
             axis (Union[None, int, tuple(int), list(int)]): Dimensions of reduction,
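For context, a minimal usage sketch of the `Tensor.mean` interface documented above (values are illustrative, not part of this commit):

import numpy as np
from mindspore import Tensor

x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
# Reduce along axis 0; keep_dims=False drops the reduced dimension.
print(x.mean(axis=0, keep_dims=False))  # [2. 3.]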
@@ -131,21 +131,21 @@ def matmul(inputs_x: Tensor, inputs_y: Tensor) -> Tensor:
 
 
 def maximum(inputs: Tensor, axis: _Axis = (), keep_dims: bool = False) -> Tensor:
-    """Reduce a dimension of a tensor by the maximum value in this dimension."""
+    """Reduces a dimension of a tensor by the maximum value in this dimension."""
     max_op = op.ReduceMax(keep_dims)
     outputs = max_op(inputs, axis)
     return outputs
 
 
 def minimum(inputs: Tensor, axis: _Axis = (), keep_dims: bool = False) -> Tensor:
-    """Reduce a dimension of a tensor by the minimum value in the dimension."""
+    """Reduces a dimension of a tensor by the minimum value in the dimension."""
     max_op = op.ReduceMin(keep_dims)
     outputs = max_op(inputs, axis)
     return outputs
 
 
 def mean(inputs: Tensor, axis: _Axis = (), keep_dims: bool = False) -> Tensor:
-    """Reduce a dimension of a tensor by averaging all elements in the dimension."""
+    """Reduces a dimension of a tensor by averaging all elements in the dimension."""
     mean_op = op.ReduceMean(keep_dims)
     outputs = mean_op(inputs, axis)
     return outputs
@@ -243,7 +243,7 @@ def softmax(axis: int = -1) -> Callable:
 
 
 def summation(inputs: Tensor, axis: _Axis = (), keep_dims: bool = False) -> Tensor:
-    """Reduce a dimension of a tensor by summing all elements in the dimension."""
+    """Reduces a dimension of a tensor by summing all elements in the dimension."""
     sum_op = op.ReduceSum(keep_dims)
     outputs = sum_op(inputs, axis)
     return outputs
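The four helpers above are thin wrappers around the reduce primitives. A minimal sketch of the same pattern with the public operations (illustrative, assuming the standard `mindspore.ops.operations` import path):

import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P

x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
# Construct the primitive with keep_dims, then call it with (input, axis),
# exactly as the wrappers above do.
print(P.ReduceMax(False)(x, 1))  # [2. 4.]
print(P.ReduceSum(False)(x, 1))  # [3. 7.]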
@@ -15,7 +15,7 @@
 """
 Neural Networks Cells.
 
-Pre-defined building blocks or computing units to construct Neural Networks.
+Pre-defined building blocks or computing units to construct neural networks.
 """
 from . import layer, loss, optim, metrics, wrap, probability, sparse, dynamic_lr
 from .learning_rate_schedule import *
@@ -913,7 +913,7 @@ class MatrixDiagPart(Cell):
 
 class MatrixSetDiag(Cell):
     r"""
-    Modify the batched diagonal part of a batched tensor.
+    Modifies the batched diagonal part of a batched tensor.
 
     Inputs:
         - **x** (Tensor) - The batched tensor. Rank k+1, where k >= 1. It can be one of the following data types:
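A hedged usage sketch of `MatrixSetDiag`, assuming the two-input call (x, diagonal) described in its Inputs section; shapes and values are illustrative:

import numpy as np
from mindspore import Tensor, nn

x = Tensor(np.ones((2, 3, 3), dtype=np.float32))  # rank k+1 with k >= 1
diagonal = Tensor(np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32))
matrix_set_diag = nn.MatrixSetDiag()
output = matrix_set_diag(x, diagonal)  # x with its batched diagonals replaced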
@@ -30,9 +30,9 @@ __all__ = [
 
 class Conv2dBnAct(Cell):
     r"""
-    A combination of convolution, Batchnorm, activation layer.
+    A combination of convolution, Batchnorm, and activation layer.
 
-    This part is a more detailed overview of Conv2d op.
+    This part is a more detailed overview of Conv2d operation.
 
     Args:
         in_channels (int): The number of input channel :math:`C_{in}`.
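A minimal sketch of the fused block described above (channel sizes are illustrative):

import numpy as np
from mindspore import Tensor, nn

# Conv2d -> BatchNorm -> ReLU in one cell.
net = nn.Conv2dBnAct(3, 12, 4, has_bn=True, activation='relu')
x = Tensor(np.ones([1, 3, 16, 16]).astype(np.float32))
print(net(x).shape)  # (1, 12, 16, 16) with the default 'same' pad mode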
@@ -186,7 +186,7 @@ def _compute_multi_channel_loss(c1, c2, img1, img2, conv, concat, mean):
 
 class SSIM(Cell):
     r"""
-    Returns SSIM index between img1 and img2.
+    Returns SSIM index between two images.
 
     Its implementation is based on Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). `Image quality
     assessment: from error visibility to structural similarity <https://ieeexplore.ieee.org/document/1284395>`_.
@@ -266,7 +266,7 @@ def _downsample(img1, img2, op):
 
 class MSSSIM(Cell):
     r"""
-    Returns MS-SSIM index between img1 and img2.
+    Returns MS-SSIM index between two images.
 
     Its implementation is based on Wang, Zhou, Eero P. Simoncelli, and Alan C. Bovik. `Multiscale structural similarity
     for image quality assessment <https://ieeexplore.ieee.org/document/1292216>`_.
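A usage sketch for these image-quality cells (random NCHW inputs for illustration; MSSSIM needs images large enough to survive its multi-scale downsampling, so only SSIM is shown):

import numpy as np
from mindspore import Tensor, nn

net = nn.SSIM()
img1 = Tensor(np.random.rand(1, 3, 32, 32).astype(np.float32))
img2 = Tensor(np.random.rand(1, 3, 32, 32).astype(np.float32))
ssim = net(img1, img2)  # one SSIM index per image in the batch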
@@ -43,7 +43,7 @@ def _check_input_dtype(input_dtype, param_name, allow_dtypes, cls_name):
 
 class LSTM(Cell):
     r"""
-    LSTM (Long Short-Term Memory) layer.
+    Stacked LSTM (Long Short-Term Memory) layers.
 
     Apply LSTM layer to the input.
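A minimal sketch of the stacked-LSTM call convention (sizes are illustrative):

import numpy as np
from mindspore import Tensor, nn

net = nn.LSTM(input_size=10, hidden_size=16, num_layers=2, batch_first=True)
x = Tensor(np.ones([3, 5, 10]).astype(np.float32))    # (batch, seq_len, input_size)
h0 = Tensor(np.zeros([2, 3, 16]).astype(np.float32))  # (num_layers * num_directions, batch, hidden_size)
c0 = Tensor(np.zeros([2, 3, 16]).astype(np.float32))
output, (hn, cn) = net(x, (h0, c0))
print(output.shape)  # (3, 5, 16)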
@@ -45,7 +45,7 @@ def _check_input_dtype(param_name, input_dtype, allow_dtypes, cls_name):
 
 class ReduceLogSumExp(Cell):
     r"""
-    Reduce a dimension of a tensor by calculating exponential for all elements in the dimension,
+    Reduces a dimension of a tensor by calculating exponential for all elements in the dimension,
     then calculate logarithm of the sum.
 
     The dtype of the tensor to be reduced is number.
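A sketch of the reduction described above, assuming an (axis, keep_dims) constructor:

import numpy as np
from mindspore import Tensor, nn

x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
# Computes log(sum(exp(x), axis)) in a numerically stable way.
reduce_lse = nn.ReduceLogSumExp(axis=1, keep_dims=False)
print(reduce_lse(x))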
@@ -158,7 +158,7 @@ class Range(Cell):
 
 class LGamma(Cell):
     r"""
-    Calculate LGamma using Lanczos' approximation refering to "A Precision Approximationof the Gamma Function".
+    Calculates LGamma using Lanczos' approximation refering to "A Precision Approximationof the Gamma Function".
     The algorithm is:
 
     .. math::
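A quick check of the cell at integer points, where log(Gamma(n)) = log((n-1)!):

import numpy as np
from mindspore import Tensor, nn

x = Tensor(np.array([2.0, 3.0, 4.0], dtype=np.float32))
lgamma = nn.LGamma()
print(lgamma(x))  # ~[0.0, 0.6931, 1.7918] = log([1, 2, 6])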
@@ -886,7 +886,7 @@ class MatMul(Cell):
 
 class Moments(Cell):
     """
-    Calculate the mean and variance of `x`.
+    Calculates the mean and variance of `x`.
 
     Args:
         axis (Union[int, tuple(int)]): Calculates the mean and variance along the specified axis. Default: ().
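A minimal sketch of `Moments` (illustrative input):

import numpy as np
from mindspore import Tensor, nn

net = nn.Moments(axis=0, keep_dims=True)
x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
mean, variance = net(x)  # mean [[2. 3.]], population variance [[1. 1.]]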
@@ -62,7 +62,7 @@ def _shape_check(in_shape):
 
 class MaxPool2d(_PoolNd):
     r"""
-    Max pooling operation for temporal data.
+    2D max pooling operation for temporal data.
 
     Applies a 2D max pooling over an input Tensor which can be regarded as a composition of 2D planes.
@@ -139,7 +139,7 @@ class MaxPool2d(_PoolNd):
 
 class MaxPool1d(_PoolNd):
     r"""
-    Max pooling operation for temporal data.
+    1D max pooling operation for temporal data.
 
     Applies a 1D max pooling over an input Tensor which can be regarded as a composition of 1D planes.
@@ -220,7 +220,7 @@ class MaxPool1d(_PoolNd):
 
 class AvgPool2d(_PoolNd):
     r"""
-    Average pooling for temporal data.
+    2D average pooling for temporal data.
 
     Applies a 2D average pooling over an input Tensor which can be regarded as a composition of 2D input planes.
@@ -294,7 +294,7 @@ class AvgPool2d(_PoolNd):
 
 class AvgPool1d(_PoolNd):
     r"""
-    Average pooling for temporal data.
+    1D average pooling for temporal data.
 
     Applies a 1D average pooling over an input Tensor which can be regarded as a composition of 1D input planes.
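The four pooling cells renamed above share one call convention; a sketch with the 2d variants (the 1d variants take (N, C, L) inputs instead):

import numpy as np
from mindspore import Tensor, nn

x = Tensor(np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4))
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
print(max_pool(x).shape)  # (1, 1, 2, 2)
print(avg_pool(x).shape)  # (1, 1, 2, 2)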
@@ -210,7 +210,7 @@ class UniformQuantObserver(_Observer):
 
 class FakeQuantWithMinMaxObserver(UniformQuantObserver):
     r"""
-    Quantization aware op. This OP provides the fake quantization observer function on data with min and max.
+    Quantization aware operation which provides the fake quantization observer function on data with min and max.
 
     Args:
         min_init (int, float): The initialized min value. Default: -6.
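A hedged sketch of the observer on its own (normally it is wired into the quantized layers below rather than used directly):

import numpy as np
from mindspore import Tensor, nn

# Simulates quantization: values are clamped into [min_init, max_init]
# and snapped to the quantization grid.
fake_quant = nn.FakeQuantWithMinMaxObserver(min_init=-1.0, max_init=1.0)
x = Tensor(np.array([[-2.0, -0.5], [0.5, 2.0]], dtype=np.float32))
output = fake_quant(x)  # entries outside [-1, 1] are saturated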
@@ -273,7 +273,7 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
         self.narrow_range = narrow_range
         self.is_ascend = context.get_context('device_target') == "Ascend"
 
-        # init tensor min and max for fake quant op
+        # init tensor min and max for fake quantized operation
         if self.per_channel:
             min_array = np.array([self.min_init] * self.num_channels).astype(np.float32)
             max_array = np.array([self.max_init] * self.num_channels).astype(np.float32)
@@ -335,9 +335,9 @@ quant_config_default = QuantConfig(weight=FakeQuantWithMinMaxObserver, activatio
 
 class Conv2dBnFoldQuantOneConv(Cell):
     r"""
-    2D convolution with BatchNormal op folded construct.
+    2D convolution which use the convolution layer statistics once to calculate BatchNormal operation folded construct.
 
-    This part is a more detailed overview of Conv2d op.
+    This part is a more detailed overview of Conv2d operation.
 
     Args:
         in_channels (int): The number of input channel :math:`C_{in}`.
@@ -546,9 +546,9 @@ class Conv2dBnFoldQuantOneConv(Cell):
 
 class Conv2dBnFoldQuant(Cell):
     r"""
-    2D convolution with BatchNormal op folded construct.
+    2D convolution with BatchNormal operation folded construct.
 
-    This part is a more detailed overview of Conv2d op.
+    This part is a more detailed overview of Conv2d operation.
 
     Args:
         in_channels (int): The number of input channel :math:`C_{in}`.
@@ -730,9 +730,9 @@ class Conv2dBnFoldQuant(Cell):
 
 class Conv2dBnWithoutFoldQuant(Cell):
     r"""
-    2D convolution + batchnorm without fold with fake quant construct.
+    2D convolution and batchnorm without fold with fake quantized construct.
 
-    This part is a more detailed overview of Conv2d op.
+    This part is a more detailed overview of Conv2d operation.
 
     Args:
         in_channels (int): The number of input channel :math:`C_{in}`.
@@ -844,9 +844,9 @@ class Conv2dBnWithoutFoldQuant(Cell):
 
 class Conv2dQuant(Cell):
     r"""
-    2D convolution with fake quant op layer.
+    2D convolution with fake quantized operation layer.
 
-    This part is a more detailed overview of Conv2d op.
+    This part is a more detailed overview of Conv2d operation.
 
     Args:
         in_channels (int): The number of input channel :math:`C_{in}`.
@@ -953,9 +953,9 @@ class Conv2dQuant(Cell):
 
 class DenseQuant(Cell):
     r"""
-    The fully connected layer with fake quant op.
+    The fully connected layer with fake quantized operation.
 
-    This part is a more detailed overview of Dense op.
+    This part is a more detailed overview of Dense operation.
 
     Args:
         in_channels (int): The dimension of the input space.
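The quantized layers above follow the plain-layer constructors plus a `quant_config`; a hedged sketch with `Conv2dQuant` under the default config (sizes illustrative):

import numpy as np
from mindspore import Tensor, nn

conv_quant = nn.Conv2dQuant(1, 6, kernel_size=3, stride=1, pad_mode='valid')
x = Tensor(np.random.rand(1, 1, 8, 8).astype(np.float32))
print(conv_quant(x).shape)  # (1, 6, 6, 6) with 'valid' padding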
@@ -1057,7 +1057,8 @@ class DenseQuant(Cell):
 
 class _QuantActivation(Cell):
     r"""
-    Base class for quantization aware training activation function. Add Fake Quant OP after activation OP.
+    Base class for quantization aware training activation function. Add fake quantized operation
+    after activation operation.
     """
 
     def get_origin(self):
@@ -1068,14 +1069,14 @@ class ActQuant(_QuantActivation):
     r"""
     Quantization aware training activation function.
 
-    Add the fake quant op to the end of activation op, by which the output of activation op will be truncated.
-    Please check `FakeQuantWithMinMaxObserver` or other observer for more details.
+    Add the fake quantized operation to the end of activation operation, by which the output of activation operation
+    will be truncated. Please check `FakeQuantWithMinMaxObserver` or other observer for more details.
 
     Args:
         activation (Cell): Activation cell.
         ema (bool): The exponential Moving Average algorithm updates min and max. Default: False.
         ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
-        fake_before (bool): Whether add fake quant operation before activation. Default: False.
+        fake_before (bool): Whether add fake quantized operation before activation. Default: False.
         quant_config (QuantConfig): Configs the oberser types and quant configs of weight and activation. Default:
             both set to default FakeQuantWithMinMaxObserver.
         quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@@ -1134,9 +1135,9 @@ class ActQuant(_QuantActivation):
 
 class TensorAddQuant(Cell):
     r"""
-    Add Fake Quant OP after TensorAdd OP.
+    Add fake quantized operation after TensorAdd operation.
 
-    This part is a more detailed overview of TensorAdd op.
+    This part is a more detailed overview of TensorAdd operation.
 
     Args:
         ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
@@ -1185,9 +1186,9 @@ class TensorAddQuant(Cell):
 
 class MulQuant(Cell):
     r"""
-    Add Fake Quant OP after Mul OP.
+    Add fake quantized operation after `Mul` operation.
 
-    This part is a more detailed overview of Mul op.
+    This part is a more detailed overview of `Mul` operation.
 
     Args:
         ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
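A sketch of wrapping an activation with a fake-quant observer, assuming `activation` is the first constructor argument as documented above:

import numpy as np
from mindspore import Tensor, nn

act_quant = nn.ActQuant(activation=nn.ReLU())
x = Tensor(np.array([[-1.0, 2.0]], dtype=np.float32))
output = act_quant(x)  # ReLU output passed through the fake-quant observer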
@@ -66,7 +66,7 @@ class _Loss(Cell):
 
 class L1Loss(_Loss):
     r"""
-    L1Loss creates a criterion to measure the mean absolute error (MAE) between :math:`x` and :math:`y` by element,
+    L1Loss creates a criterion to measure the mean absolute error (MAE) between :math:`x` and :math:`y` element-wise,
     where :math:`x` is the input Tensor and :math:`y` is the target Tensor.
 
     For simplicity, let :math:`x` and :math:`y` be 1-dimensional Tensor with length :math:`N`,
@@ -114,7 +114,7 @@ class L1Loss(_Loss):
 class MSELoss(_Loss):
     r"""
     MSELoss creates a criterion to measure the mean squared error (squared L2-norm) between :math:`x` and :math:`y`
-    by element, where :math:`x` is the input and :math:`y` is the target.
+    element-wise, where :math:`x` is the input and :math:`y` is the target.
 
     For simplicity, let :math:`x` and :math:`y` be 1-dimensional Tensor with length :math:`N`,
     the unreduced loss (i.e. with argument reduction set to 'none') of :math:`x` and :math:`y` is given as:
@@ -490,7 +490,7 @@ class SampledSoftmaxLoss(_Loss):
 
 class BCELoss(_Loss):
     r"""
-    BCELoss creates a criterion to measure the Binary Cross Entropy between the true labels and predicted labels.
+    BCELoss creates a criterion to measure the binary cross entropy between the true labels and predicted labels.
 
     Note:
         Set the predicted labels as :math:`x`, true labels as :math:`y`, the output loss as :math:`\ell(x, y)`.
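A minimal sketch of the loss cells' shared call convention (values illustrative):

import numpy as np
from mindspore import Tensor, nn

loss_fn = nn.L1Loss()  # 'mean' reduction by default
logits = Tensor(np.array([1.0, 2.0, 3.0], dtype=np.float32))
labels = Tensor(np.array([1.0, 2.0, 2.0], dtype=np.float32))
print(loss_fn(logits, labels))  # mean(|x - y|) = 0.33333334

bce = nn.BCELoss(reduction='mean')  # inputs are probabilities in [0, 1]
probs = Tensor(np.array([0.2, 0.7], dtype=np.float32))
target = Tensor(np.array([0.0, 1.0], dtype=np.float32))
print(bce(probs, target))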
@@ -465,9 +465,9 @@ class AdamWeightDecay(Optimizer):
 
 class AdamOffload(Optimizer):
     r"""
-    Updates gradients by the Adaptive Moment Estimation (Adam) algorithm. This optimizer will offload Adam optimizer to
-    host CPU and keep parameters being updated on the device, to minimize the memory cost. Although that would bring
-    about an increase of performance overhead, the optimizer could be used to run a larger model.
+    This optimizer will offload Adam optimizer to host CPU and keep parameters being updated on the device,
+    to minimize the memory cost. Although that would bring about an increase of performance overhead,
+    the optimizer could be used to run a larger model.
 
     The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_.
@@ -51,7 +51,7 @@ def _check_param_value(accum, l1, l2, use_locking, prim_name=None):
 
 class ProximalAdagrad(Optimizer):
     """
-    Implement the ProximalAdagrad algorithm with ApplyProximalAdagrad Operator.
+    Implements the ProximalAdagrad algorithm with ApplyProximalAdagrad Operator.
 
     ProximalAdagrad is an online Learning and Stochastic Optimization.
     Refer to paper `Efficient Learning using Forward-Backward Splitting
@@ -33,7 +33,7 @@ def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, accum, s
 
 class SGD(Optimizer):
     r"""
-    Implements stochastic gradient descent (optionally with momentum).
+    Implements stochastic gradient descent. Momentum is optional.
 
     Introduction to SGD can be found at https://en.wikipedia.org/wiki/Stochastic_gradient_descent.
     Nesterov momentum is based on the formula from paper `On the importance of initialization and
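The optimizers touched here plug into training the same way; a sketch with `SGD` (network and rates are illustrative):

from mindspore import nn

net = nn.Dense(3, 4)
opt = nn.SGD(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
loss_fn = nn.MSELoss()
# Standard wiring: loss wrapper + single-step training cell.
train_step = nn.TrainOneStepCell(nn.WithLossCell(net, loss_fn), opt)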
@@ -4306,7 +4306,7 @@ class KLDivLoss(PrimitiveWithInfer):
 
 class BinaryCrossEntropy(PrimitiveWithInfer):
     r"""
-    Computes the Binary Cross Entropy between the target and the output.
+    Computes the binary cross entropy between the target and the output.
 
     Note:
         Sets input as :math:`x`, input label as :math:`y`, output as :math:`\ell(x, y)`.
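A sketch of the primitive's three-input call (probabilities, targets, per-element weights; values illustrative):

import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P

bce = P.BinaryCrossEntropy(reduction='mean')
logits = Tensor(np.array([0.2, 0.7, 0.4], dtype=np.float32))
labels = Tensor(np.array([0.0, 1.0, 0.0], dtype=np.float32))
weight = Tensor(np.ones(3, dtype=np.float32))
output = bce(logits, labels, weight)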