From 86afb1725d19ba81bcd2c1c0c439539f9087b8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=8D=97?= Date: Thu, 2 Dec 2021 18:27:55 +0800 Subject: [PATCH] chinese api comments --- .../mindspore.DynamicLossScaleManager.rst | 2 +- .../mindspore.FixedLossScaleManager.rst | 2 +- .../mindspore/mindspore.LossScaleManager.rst | 5 +- .../api_python/nn/mindspore.nn.Adagrad.rst | 87 ----------- docs/api/api_python/nn/mindspore.nn.Adam.rst | 101 ------------- .../nn/mindspore.nn.AdamOffload.txt | 93 ++++++++++++ .../nn/mindspore.nn.AdamWeightDecay.txt | 87 +++++++++++ ...indspore.nn.DynamicLossScaleUpdateCell.txt | 58 +++++++ docs/api/api_python/nn/mindspore.nn.FTRL.txt | 103 +++++++++++++ .../mindspore.nn.FixedLossScaleUpdateCell.txt | 51 +++++++ docs/api/api_python/nn/mindspore.nn.LARS.txt | 50 ++++++ docs/api/api_python/nn/mindspore.nn.Lamb.txt | 89 +++++++++++ .../api_python/nn/mindspore.nn.LazyAdam.txt | 93 ++++++++++++ .../api/api_python/nn/mindspore.nn.Metric.rst | 76 ---------- .../api_python/nn/mindspore.nn.Momentum.rst | 91 ----------- .../api_python/nn/mindspore.nn.Optimizer.rst | 143 ------------------ .../nn/mindspore.nn.ProximalAdagrad.txt | 82 ++++++++++ .../api_python/nn/mindspore.nn.RMSProp.txt | 101 +++++++++++++ docs/api/api_python/nn/mindspore.nn.SGD.txt | 88 +++++++++++ .../nn/mindspore.nn.TrainOneStepCell.txt | 50 ++++++ ...spore.nn.TrainOneStepWithLossScaleCell.txt | 116 ++++++++++++++ .../nn/mindspore.nn.WithEvalCell.txt | 29 ++++ .../nn/mindspore.nn.WithLossCell.txt | 42 +++++ .../nn/mindspore.nn.optim_arg_dynamic_lr.rst | 5 + .../nn/mindspore.nn.optim_arg_loss_scale.rst | 1 + .../nn/mindspore.nn.optim_group_gc.rst | 1 + .../nn/mindspore.nn.optim_group_lr.rst | 1 + .../nn/mindspore.nn.optim_group_order.rst | 1 + .../nn/mindspore.nn.optim_group_param.rst | 1 + .../mindspore.nn.optim_group_weight_decay.rst | 1 + .../nn/mindspore.nn.optim_note_loss_scale.rst | 1 + .../nn/mindspore.nn.optim_note_sparse.rst | 2 + .../mindspore.nn.optim_note_weight_decay.rst | 2 + ...pore.nn.optim_target_unique_for_sparse.rst | 9 ++ mindspore/nn/optim/adam.py | 12 +- mindspore/nn/optim/ftrl.py | 11 +- mindspore/nn/optim/lamb.py | 2 +- mindspore/nn/optim/lars.py | 8 +- mindspore/nn/optim/lazyadam.py | 10 +- mindspore/nn/optim/proximal_ada_grad.py | 4 +- mindspore/nn/optim/rmsprop.py | 13 +- mindspore/nn/optim/sgd.py | 4 +- mindspore/nn/wrap/cell_wrapper.py | 23 ++- mindspore/nn/wrap/loss_scale.py | 54 ++++--- mindspore/train/loss_scale_manager.py | 7 +- 45 files changed, 1236 insertions(+), 576 deletions(-) delete mode 100644 docs/api/api_python/nn/mindspore.nn.Adagrad.rst delete mode 100644 docs/api/api_python/nn/mindspore.nn.Adam.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.AdamOffload.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.AdamWeightDecay.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.DynamicLossScaleUpdateCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.FTRL.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.FixedLossScaleUpdateCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.LARS.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.Lamb.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.LazyAdam.txt delete mode 100644 docs/api/api_python/nn/mindspore.nn.Metric.rst delete mode 100644 docs/api/api_python/nn/mindspore.nn.Momentum.rst delete mode 100644 docs/api/api_python/nn/mindspore.nn.Optimizer.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.ProximalAdagrad.txt create mode 100644 
docs/api/api_python/nn/mindspore.nn.RMSProp.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.SGD.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.TrainOneStepCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.TrainOneStepWithLossScaleCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.WithEvalCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.WithLossCell.txt create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_arg_dynamic_lr.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_arg_loss_scale.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_group_gc.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_group_lr.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_group_order.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_group_param.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_group_weight_decay.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_note_loss_scale.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_note_sparse.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_note_weight_decay.rst create mode 100644 docs/api/api_python/nn/mindspore.nn.optim_target_unique_for_sparse.rst diff --git a/docs/api/api_python/mindspore/mindspore.DynamicLossScaleManager.rst b/docs/api/api_python/mindspore/mindspore.DynamicLossScaleManager.rst index 16536d3fbd2..079356d5757 100644 --- a/docs/api/api_python/mindspore/mindspore.DynamicLossScaleManager.rst +++ b/docs/api/api_python/mindspore/mindspore.DynamicLossScaleManager.rst @@ -38,7 +38,7 @@ mindspore.DynamicLossScaleManager .. py:method:: get_update_cell() - 返回用于在 :class:`mindspore.TrainOneStepWithLossScaleCell` 中更新梯度放大系数的 `Cell` 实例。 + 返回用于更新梯度放大系数的 `Cell` 实例,:class:`mindspore.TrainOneStepWithLossScaleCell` 会调用该实例。 **返回:** diff --git a/docs/api/api_python/mindspore/mindspore.FixedLossScaleManager.rst b/docs/api/api_python/mindspore/mindspore.FixedLossScaleManager.rst index 5d3e2a22410..750c6a98368 100644 --- a/docs/api/api_python/mindspore/mindspore.FixedLossScaleManager.rst +++ b/docs/api/api_python/mindspore/mindspore.FixedLossScaleManager.rst @@ -44,7 +44,7 @@ mindspore.FixedLossScaleManager .. py:method:: get_update_cell() - 返回用于更新 `loss_scale` 值的 `Cell` 实例,该实例将在 :class:`mindspore.TrainOneStepWithLossScaleCell` 中执行。 + 返回用于更新 `loss_scale` 值的 `Cell` 实例,:class:`mindspore.TrainOneStepWithLossScaleCell`会调用该实例。该类使用固定的梯度放大系数,因此该实例不执行任何操作。 **返回:** diff --git a/docs/api/api_python/mindspore/mindspore.LossScaleManager.rst b/docs/api/api_python/mindspore/mindspore.LossScaleManager.rst index 886088cf877..95f1aad1062 100644 --- a/docs/api/api_python/mindspore/mindspore.LossScaleManager.rst +++ b/docs/api/api_python/mindspore/mindspore.LossScaleManager.rst @@ -5,7 +5,8 @@ mindspore.LossScaleManager 混合精度梯度放大系数(loss scale)管理器的抽象类。 - 派生类需要该类的所有方法。 `get_loss_scale` 用于获取当前的梯度放大系数。`update_loss_scale` 用于更新梯度放大系数,该方法将在训练过程中被调用。`get_update_cell` 用于获取更新梯度放大系数的 `Cell` 实例,该实例在将训练过程中被调用。下沉模式下仅 `get_update_cell` 方式生效,非下沉模式下两种更新梯度放大系数的方式均生效。 + 派生类需要实现该类的所有方法。 `get_loss_scale` 用于获取当前的梯度放大系数。 `update_loss_scale` 用于更新梯度放大系数,该方法将在训练过程中被调用。 `get_update_cell` 用于获取更新梯度放大系数的 `Cell` 实例,该实例将在训练过程中被调用。当前多使用`get_update_cell` 方式。 + 例如::class:`mindspore.FixedLossScaleManager` 和 :class:`mindspore.DynamicLossScaleManager` 。 .. py:method:: get_loss_scale() @@ -14,7 +15,7 @@ mindspore.LossScaleManager .. 
py:method:: get_update_cell() - 获取用于更新梯度放大系数的 :class:`mindspore.nn.Cell` 实例。 + 获取用于更新梯度放大系数的Cell实例。 .. py:method:: update_loss_scale(overflow) diff --git a/docs/api/api_python/nn/mindspore.nn.Adagrad.rst b/docs/api/api_python/nn/mindspore.nn.Adagrad.rst deleted file mode 100644 index ba3a28d382a..00000000000 --- a/docs/api/api_python/nn/mindspore.nn.Adagrad.rst +++ /dev/null @@ -1,87 +0,0 @@ -mindspore.nn.Adagrad -===================== - -.. py:class:: mindspore.nn.Adagrad(*args, **kwargs) - - 使用ApplyAdagrad算子实现Adagrad算法。 - - Adagrad用于在线学习和随机优化。 - 请参阅论文 `Efficient Learning using Forward-Backward Splitting `_。 - 公式如下: - - .. math:: - \begin{array}{ll} \\ - h_{t+1} = h_{t} + g\\ - w_{t+1} = w_{t} - lr*\frac{1}{\sqrt{h_{t+1}}}*g - \end{array} - - :math:`h` 表示梯度平方的累积和, :math:`g` 表示 `grads` 。 - :math:`lr` 代表 `learning_rate`, :math:`w` 代表 `params` 。 - - .. note:: - 在参数未分组时,优化器配置的 `weight_decay` 应用于名称含有"beta"或"gamma"的网络参数,通过网络参数分组可调整权重衰减策略。分组时,每组网络参数均可配置 `weight_decay` ,若未配置,则该组网络参数使用优化器中配置的 `weight_decay` 。 - - **参数:** - - - **params** (Union[list[Parameter], list[dict]]) - 必须是 `Parameter` 组成的列表或字典组成的列表。当列表元素是字典时,字典的键可以是"params"、"lr"、"weight_decay"、"grad_centralization"和"order_params": - - - **params** - 必填。当前组别的权重,该值必须是 `Parameter` 列表。 - - **lr** - 可选。如果键中存在"lr",则使用对应的值作为学习率。如果没有,则使用优化器中配置的 `learning_rate` 作为学习率。 - - **weight_decay** - 可选。如果键中存在"weight_decay",则使用对应的值作为权重衰减值。如果没有,则使用优化器中配置的 `weight_decay` 作为权重衰减值。 - - **grad_centralization** - 可选。如果键中存在"grad_centralization",则使用对应的值,该值必须为布尔类型。如果没有,则认为 `grad_centralization` 为False。该参数仅适用于卷积层。 - - **order_params** - 可选。对应值是预期的参数更新顺序。当使用参数分组功能时,通常使用该配置项保持 `parameters` 的顺序以提升性能。如果键中存在"order_params",则会忽略该组配置中的其他键。"order_params"中的参数必须在某一组 `params` 参数中。 - - - **accum** (float) - 累加器 :math:`h` 的初始值,必须大于等于零。默认值:0.1。 - - **learning_rate** (Union[float, Tensor, Iterable, LearningRateSchedule]) - 默认值:0.001。 - - - **float** - 固定的学习率。必须大于等于零。 - - **int** - 固定的学习率。必须大于等于零。整数类型会被转换为浮点数。 - - **Tensor** - 可以是标量或一维向量。标量是固定的学习率。一维向量是动态的学习率,第i步将取向量中第i个值作为学习率。 - - **Iterable** - 动态的学习率。第i步将取迭代器第i个值作为学习率。 - - **LearningRateSchedule** - 动态的学习率。在训练过程中,优化器将使用步数(step)作为输入,调用 `LearningRateSchedule` 实例来计算当前学习率。 - - - **update_slots** (bool) - 如果为True,则更新累加器 :math:`h` 。默认值:True。 - - **loss_scale** (float) - 梯度缩放系数,必须大于0。如果 `loss_scale` 是整数,它将被转换为浮点数。通常使用默认值,仅当训练时使用了 `FixedLossScaleManager` ,且 `FixedLossScaleManager` 的 `drop_overflow_update` 属性配置为False时,此值需要与 `FixedLossScaleManager` 中的 `loss_scale` 相同。有关更多详细信息,请参阅 :class:`mindspore.FixedLossScaleManager` 。默认值:1.0。 - - **weight_decay** (Union[float, int]) - 要乘以权重的权重衰减值,必须大于等于0.0。默认值:0.0。 - - **输入:** - - **grads** (tuple[Tensor]) - 优化器中 `params` 的梯度,形状(shape)与 `params` 相同。 - - **输出:** - - Tensor[bool],值为True。 - - **异常:** - - - **TypeError** - `learning_rate` 不是int、float、Tensor、Iterable或 `LearningRateSchedule` 。 - - **TypeError** - `parameters` 的元素是 `Parameter` 或字典。 - - **TypeError** - `accum` 或 `loss_scale` 不是float。 - - **TypeError** - `update_slots` 不是bool。 - - **TypeError** - `weight_decay` 不是float或int。 - - **ValueError** - `loss_scale` 小于或等于0。 - - **ValueError** - `accum` 或 `weight_decay` 小于0。 - - **支持平台:** - - ``Ascend`` ``CPU`` ``GPU`` - - **样例:** - - >>> net = Net() - >>> #1) 所有参数使用相同的学习率和权重衰减 - >>> optim = nn.Adagrad(params=net.trainable_params()) - >>> - >>> #2) 使用参数组并设置不同的值 - >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 
'grad_centralization':True}, - ... {'params': no_conv_params, 'lr': 0.01}, - ... {'order_params': net.trainable_params()}] - >>> optim = nn.Adagrad(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # conv_params参数组将使用优化器中的学习率0.1、该组的权重衰减0.01、该组的梯度中心化配置True。 - >>> # no_conv_params参数组将使用该组的学习率0.01、优化器中的权重衰减0.0、梯度中心化使用默认值False。 - >>> # 优化器按照"order_params"配置的参数顺序更新参数。 - >>> - >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> model = Model(net, loss_fn=loss, optimizer=optim) \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.Adam.rst b/docs/api/api_python/nn/mindspore.nn.Adam.rst deleted file mode 100644 index c20ce158ddd..00000000000 --- a/docs/api/api_python/nn/mindspore.nn.Adam.rst +++ /dev/null @@ -1,101 +0,0 @@ -mindspore.nn.Adam -================== - -.. py:class:: mindspore.nn.Adam(*args, **kwargs) - - 通过Adaptive Moment Estimation (Adam)算法更新梯度。 - - 请参阅论文 `Adam: A Method for Stochastic Optimization `_。 - - 公式如下: - - .. math:: - \begin{array}{ll} \\ - m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\ - v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\ - l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ - w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon} - \end{array} - - :math:`m` 代表第一个动量矩阵 `moment1` ,:math:`v` 代表第二个动量矩阵 `moment2` ,:math:`g` 代表 `gradients` ,:math:`l` 代表缩放因子,:math:`\beta_1,\beta_2` 代表 `beta1` 和 `beta2` ,:math:`t` 代表更新步骤,:math:`beta_1^t` 和 :math:`beta_2^t` 代表 `beta1_power` 和 `beta2_power` ,:math:`\alpha` 代表 `learning_rate` , :math:`w` 代表 `params` , :math:`\epsilon` 代表 `eps` 。 - - .. note:: - 如果前向网络使用了SparseGatherV2等算子,优化器会执行稀疏运算,通过设置 `target` 为CPU,可在主机(host)上进行稀疏运算。 - 稀疏特性在持续开发中。 - - 在参数未分组时,优化器配置的 `weight_decay` 应用于名称含有"beta"或"gamma"的网络参数,通过网络参数分组可调整权重衰减策略。分组时,每组网络参数均可配置 `weight_decay` ,若未配置,则该组网络参数使用优化器中配置的 `weight_decay` 。 - - **参数:** - - - **params** (Union[list[Parameter], list[dict]]) - 必须是 `Parameter` 组成的列表或字典组成的列表。当列表元素是字典时,字典的键可以是"params"、"lr"、"weight_decay"、"grad_centralization"和"order_params": - - - **params** - 必填。当前组别的权重,该值必须是 `Parameter` 列表。 - - **lr** - 可选。如果键中存在"lr",则使用对应的值作为学习率。如果没有,则使用优化器中配置的 `learning_rate` 作为学习率。 - - **weight_decay** - 可选。如果键中存在"weight_decay”,则使用对应的值作为权重衰减值。如果没有,则使用优化器中配置的 `weight_decay` 作为权重衰减值。 - - **grad_centralization** - 可选。如果键中存在"grad_centralization",则使用对应的值,该值必须为布尔类型。如果没有,则认为 `grad_centralization` 为False。该参数仅适用于卷积层。 - - **order_params** - 可选。对应值是预期的参数更新顺序。当使用参数分组功能时,通常使用该配置项保持 `parameters` 的顺序以提升性能。如果键中存在"order_params",则会忽略该组配置中的其他键。"order_params"中的参数必须在某一组 `params` 参数中。 - - - **learning_rate** (Union[float, Tensor, Iterable, LearningRateSchedule]): 默认值:1e-3。 - - - **float** - 固定的学习率。必须大于等于零。 - - **int** - 固定的学习率。必须大于等于零。整数类型会被转换为浮点数。 - - **Tensor** - 可以是标量或一维向量。标量是固定的学习率。一维向量是动态的学习率,第i步将取向量中第i个值作为学习率。 - - **Iterable** - 动态的学习率。第i步将取迭代器第i个值作为学习率。 - - **LearningRateSchedule** - 动态的学习率。在训练过程中,优化器将使用步数(step)作为输入,调用 `LearningRateSchedule` 实例来计算当前学习率。 - - - **beta1** (float) - `moment1` 的指数衰减率。参数范围(0.0,1.0)。默认值:0.9。 - - **beta2** (float) - `moment2` 的指数衰减率。参数范围(0.0,1.0)。默认值:0.999。 - - **eps** (float) - 将添加到分母中,以提高数值稳定性。必须大于0。默认值:1e-8。 - - **use_locking** (bool) - 是否对参数更新加锁保护。如果为True,则 `w` 、`m` 和 `v` 的tensor更新将受到锁的保护。如果为False,则结果不可预测。默认值:False。 - - **use_nesterov** (bool) - 是否使用Nesterov Accelerated Gradient (NAG)算法更新梯度。如果为True,使用NAG更新梯度。如果为False,则在不使用NAG的情况下更新梯度。默认值:False。 - - **weight_decay** (float) - 权重衰减(L2 penalty)。必须大于等于0。默认值:0.0。 - - **loss_scale** (float) - 梯度缩放系数,必须大于0。如果 `loss_scale` 是整数,它将被转换为浮点数。通常使用默认值,仅当训练时使用了 `FixedLossScaleManager` ,且 `FixedLossScaleManager` 的 
`drop_overflow_update` 属性配置为False时,此值需要与 `FixedLossScaleManager` 中的 `loss_scale` 相同。有关更多详细信息,请参阅 :class:`mindspore.FixedLossScaleManager` 。默认值:1.0。 - - **输入:** - - **gradients** (tuple[Tensor]) - `params` 的梯度,形状(shape)与 `params` 相同。 - - **输出:** - - Tensor[bool],值为True。 - - **异常:** - - - **TypeError** - `learning_rate` 不是int、float、Tensor、Iterable或LearningRateSchedule。 - - **TypeError** - `parameters` 的元素不是Parameter或字典。 - - **TypeError** - `beta1` 、`beta2` 、 `eps` 或 `loss_scale` 不是float。 - - **TypeError** - `weight_decay` 不是float或int。 - - **TypeError** - `use_locking` 或 `use_nesterov` 不是bool。 - - **ValueError** - `loss_scale` 或 `eps` 小于或等于0。 - - **ValueError** - `beta1` 、`beta2` 不在(0.0,1.0)范围内。 - - **ValueError** - `weight_decay` 小于0。 - - **支持平台:** - - ``Ascend`` ``GPU`` ``CPU`` - - **样例:** - - >>> net = Net() - >>> #1) 所有参数使用相同的学习率和权重衰减 - >>> optim = nn.Adam(params=net.trainable_params()) - >>> - >>> #2) 使用参数组并设置不同的值 - >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, - ... {'params': no_conv_params, 'lr': 0.01}, - ... {'order_params': net.trainable_params()}] - >>> optim = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # conv_params参数组将使用优化器中的学习率0.1、该组的权重衰减0.01、该组的梯度中心化配置True。 - >>> # no_conv_params参数组将使用该组的学习率0.01、优化器中的权重衰减0.0、梯度中心化使用默认值False。 - >>> # 优化器按照"order_params"配置的参数顺序更新参数。 - >>> - >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> model = Model(net, loss_fn=loss, optimizer=optim) - - - .. py:method:: target - :property: - - 该属性用于指定在主机(host)上还是设备(device)上更新参数。输入类型为str,只能是'CPU','Ascend'或'GPU'。 \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.AdamOffload.txt b/docs/api/api_python/nn/mindspore.nn.AdamOffload.txt new file mode 100644 index 00000000000..a35aa1aaca5 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.AdamOffload.txt @@ -0,0 +1,93 @@ +Class mindspore.nn.AdamOffload(params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-08, use_locking=False, use_nesterov=False, weight_decay=0.0, loss_scale=1.0) + + 姝や紭鍖栧櫒鍦ㄤ富鏈篊PU涓婅繍琛孉dam浼樺寲绠楁硶锛岃澶囦笂浠呮墽琛岀綉缁滃弬鏁扮殑鏇存柊锛屾渶澶ч檺搴﹀湴闄嶄綆鍐呭瓨鎴愭湰銆 + 铏界劧浼氬鍔犳ц兘寮閿锛屼絾浼樺寲鍣ㄥ彲浠ヨ繍琛屾洿澶х殑妯″瀷銆 + + + Adam绠楁硶鍙傝`Adam: A Method for Stochastic Optimization `_銆 + + 鏇存柊鍏紡濡備笅锛 + + .. math:: + \begin{array}{ll} \\ + m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\ + v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\ + l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ + w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon} + \end{array} + + :math:`m`浠h〃绗竴涓煩鍚戦噺`moment1`锛:math:`v`浠h〃绗簩涓煩鍚戦噺`moment2`锛:math:`g`浠h〃`gradients`锛:math:`l`浠h〃缂╂斁鍥犲瓙锛:math:`\beta_1,\beta_2`浠h〃`beta1`鍜宍beta2`锛:math:`t`浠h〃褰撳墠step锛:math:`beta_1^t`鍜:math:`beta_2^t`浠h〃`beta1_power`鍜宍beta2_power`锛:math:`\alpha`浠h〃`learning_rate`锛:math:`w`浠h〃`params`锛:math:`\epsilon`浠h〃`eps`銆 + + .. note:: + 姝や紭鍖栧櫒鐩墠浠呮敮鎸佸浘妯″紡銆 + + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + - **params** (Union[list[Parameter], list[dict]]) - 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆佸拰"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + + .. include:: mindspore.nn.optim_group_lr.rst + + .. include:: mindspore.nn.optim_group_weight_decay.rst + + .. 
include:: mindspore.nn.optim_group_order.rst + + + - **learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): 榛樿鍊硷細1e-3銆 + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + + - **beta1** (float) - `moment1` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.9銆 + + - **beta2** (float) - `moment2` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.999銆 + + - **eps** (float) - 灏嗘坊鍔犲埌鍒嗘瘝涓紝浠ユ彁楂樻暟鍊肩ǔ瀹氭с傚繀椤诲ぇ浜0銆傞粯璁ゅ硷細1e-8銆 + + - **use_locking** (bool) - 鏄惁瀵瑰弬鏁版洿鏂板姞閿佷繚鎶ゃ傚鏋滀负True锛屽垯 `w` 銆乣m` 鍜 `v` 鐨勬洿鏂板皢鍙楀埌閿佷繚鎶ゃ傚鏋滀负False锛屽垯缁撴灉涓嶅彲棰勬祴銆傞粯璁ゅ硷細False銆 + + - **use_nesterov** (bool) - 鏄惁浣跨敤Nesterov Accelerated Gradient (NAG)绠楁硶鏇存柊姊害銆傚鏋滀负True锛屼娇鐢∟AG鏇存柊姊害銆傚鏋滀负False锛屽垯鍦ㄤ笉浣跨敤NAG鐨勬儏鍐典笅鏇存柊姊害銆傞粯璁ゅ硷細False銆 + + - **weight_decay** (float) - 鏉冮噸琛板噺锛圠2 penalty锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + + .. include:: mindspore.nn.optim_arg_loss_scale.rst + + + 杈撳叆锛 + - **gradients** (tuple[Tensor])锛歚params`鐨勬搴︼紝shape涓巂params`鐩稿悓銆 + + 杈撳嚭锛 + Tensor[bool]锛屽间负True銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴栧瓧鍏搞 + TypeError锛歚beta1`銆乣beta2`銆乣eps`鎴朻loss_scale`涓嶆槸float銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + TypeError锛歚use_locking`鎴朻use_nesterov`涓嶆槸bool銆 + ValueError锛歚loss_scale`鎴朻eps`涓嶅ぇ浜0銆 + ValueError锛歚beta1`銆乣beta2`涓嶅湪锛0.0,1.0锛夎寖鍥村唴銆 + ValueError锛歚weight_decay`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.AdamOffload(params=net.trainable_params()) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.AdamOffload(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勫涔犵巼0.01銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.AdamWeightDecay.txt b/docs/api/api_python/nn/mindspore.nn.AdamWeightDecay.txt new file mode 100644 index 00000000000..7c64aa499e6 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.AdamWeightDecay.txt @@ -0,0 +1,87 @@ +Class mindspore.nn.AdamWeightDecay(params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-06, weight_decay=0.0) + + 瀹炵幇鏉冮噸琛板噺Adam绠楁硶銆 + + .. math:: + \begin{array}{ll} \\ + m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\ + v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\ + update = \frac{m_{t+1}}{\sqrt{v_{t+1}} + eps} \\ + update = + \begin{cases} + update + weight\_decay * w_{t} + & \text{ if } weight\_decay > 0 \\ + update + & \text{ otherwise } + \end{cases} \\ + w_{t+1} = w_{t} - lr * update + \end{array} + + :math:`m`琛ㄧず绗1鐭╁悜閲廯moment1`,:math:`v`琛ㄧず绗2鐭╁悜閲廯moment2`锛:math:`g`琛ㄧず`gradients`锛:math:`lr`琛ㄧず`learning_rate`锛:math:`\beta_1, \beta_2`琛ㄧず`beta1`鍜宍beta2`,:math:`t`琛ㄧず褰撳墠step锛:math:`w`琛ㄧず`params`銆 + + + .. note:: + .. include:: mindspore.nn.optim_note_loss_scale.rst + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + params (Union[list[Parameter], list[dict]]) - 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆佸拰"order_params"锛 + + .. 
include:: mindspore.nn.optim_group_param.rst + + .. include:: mindspore.nn.optim_group_lr.rst + + .. include:: mindspore.nn.optim_group_weight_decay.rst + + .. include:: mindspore.nn.optim_group_order.rst + + + learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): 榛樿鍊硷細1e-3銆 + + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + + beta1 (float)锛歚moment1` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.9銆 + + beta2 (float)锛歚moment2` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.999銆 + + eps (float)锛氬皢娣诲姞鍒板垎姣嶄腑锛屼互鎻愰珮鏁板肩ǔ瀹氭с傚繀椤诲ぇ浜0銆傞粯璁ゅ硷細1e-6銆 + + weight_decay (float)锛氭潈閲嶈“鍑忥紙L2 penalty锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + + 杈撳叆锛 + - **gradients** (tuple[Tensor])锛歚params`鐨勬搴︼紝shape涓巂params`鐩稿悓銆 + + 杈撳嚭锛 + tuple[bool]锛屾墍鏈夊厓绱犻兘涓篢rue銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴栧瓧鍏搞 + TypeError锛歚beta1`銆乣beta2`鎴朻eps`涓嶆槸float銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + ValueError锛歚eps`灏忎簬绛変簬0銆 + ValueError锛歚beta1`銆乣beta2`涓嶅湪锛0.0,1.0锛夎寖鍥村唴銆 + ValueError锛歚weight_decay`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.AdamWeightDecay(params=net.trainable_params()) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.AdamWeightDecay(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勫涔犵巼0.01銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.DynamicLossScaleUpdateCell.txt b/docs/api/api_python/nn/mindspore.nn.DynamicLossScaleUpdateCell.txt new file mode 100644 index 00000000000..3e3801d3601 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.DynamicLossScaleUpdateCell.txt @@ -0,0 +1,58 @@ +Class mindspore.nn.DynamicLossScaleUpdateCell(loss_scale_value, scale_factor, scale_window) + + 鐢ㄤ簬鍔ㄦ佸湴鏇存柊姊害鏀惧ぇ绯绘暟(loss scale)鐨勭缁忓厓銆 + + 浣跨敤姊害鏀惧ぇ鍔熻兘杩涜璁粌鏃讹紝鍒濆姊害鏀惧ぇ绯绘暟鍊间负`loss_scale_value`銆 + 鍦ㄦ瘡涓缁冩楠や腑锛屽綋鍑虹幇婧㈠嚭鏃讹紝閫氳繃璁$畻鍏紡`loss_scale`/`scale_factor`鍑忓皬姊害鏀惧ぇ绯绘暟銆 + 濡傛灉杩炵画`scale_window`姝ワ紙step锛夋湭婧㈠嚭锛屽垯灏嗛氳繃`loss_scale` * `scale_factor`澧炲ぇ姊害鏀惧ぇ绯绘暟銆 + + 璇ョ被鏄:class:`mindspore.nn.DynamicLossScaleManager`鐨刞get_update_cell`鏂规硶鐨勮繑鍥炲笺 + 璁粌杩囩▼涓紝绫:class:`mindspore.TrainOneStepWithLossScaleCell`浼氳皟鐢ㄨCell鏉ユ洿鏂版搴︽斁澶х郴鏁般 + + 鍙傛暟锛 + loss_scale_value (float)锛氬垵濮嬬殑姊害鏀惧ぇ绯绘暟銆 + scale_factor (int)锛氬鍑忕郴鏁般 + scale_window (int)锛氭湭婧㈠嚭鏃讹紝澧炲ぇ姊害鏀惧ぇ绯绘暟鐨勬渶澶ц繛缁缁冩鏁般 + + 杈撳叆锛 + - **loss_scale** (Tensor)锛氳缁冩湡闂寸殑姊害鏀惧ぇ绯绘暟锛宻hape涓:math:`()`銆 + - **overflow** (bool)锛氭槸鍚﹀彂鐢熸孩鍑恒 + + 杈撳嚭锛 + Bool锛屽嵆杈撳叆`overflow`銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` + + 绀轰緥锛 + >>> import numpy as np + >>> from mindspore import Tensor, Parameter, nn + >>> import mindspore.ops as ops + >>> + >>> class Net(nn.Cell): + ... def __init__(self, in_features, out_features)锛 + ... super(Net, self).__init__() + ... self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)), + ... name='weight') + ... self.matmul = ops.MatMul() + ... + ... def construct(self, x)锛 + ... 
output = self.matmul(x, self.weight) + ... return output + ... + >>> in_features, out_features = 16, 10 + >>> net = Net(in_features, out_features) + >>> loss = nn.MSELoss() + >>> optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> net_with_loss = nn.WithLossCell(net, loss) + >>> manager = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**12, scale_factor=2, scale_window=1000) + >>> train_network = nn.TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_sense=manager) + >>> input = Tensor(np.ones([out_features, in_features]), mindspore.float32) + >>> labels = Tensor(np.ones([out_features,]), mindspore.float32) + >>> output = train_network(input, labels) + + +get_loss_scale() + + 鑾峰彇褰撳墠姊害鏀惧ぇ绯绘暟銆 + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.FTRL.txt b/docs/api/api_python/nn/mindspore.nn.FTRL.txt new file mode 100644 index 00000000000..ffc803b8124 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.FTRL.txt @@ -0,0 +1,103 @@ +Class mindspore.nn.FTRL(*args, **kwargs) + + 浣跨敤ApplyFtrl绠楀瓙瀹炵幇FTRL绠楁硶銆 + + FTRL鏄竴绉嶅湪绾垮嚫浼樺寲绠楁硶锛屾牴鎹崯澶卞嚱鏁拌嚜閫傚簲鍦伴夋嫨姝e垯鍖栧嚱鏁般 + 璇﹁璁烘枃`Adaptive Bound Optimization for Online Convex Optimization `_銆 + 宸ョ▼鏂囨。鍙傞槄`Ad Click Prediction: a View from the Trenches `_銆 + + + 鏇存柊鍏紡濡備笅锛 + + .. math:: + + \begin{array}{ll} \\ + m_{t+1} = m_{t} + g^2 \\ + u_{t+1} = u_{t} + g - \frac{m_{t+1}^\text{-p} - m_{t}^\text{-p}}{\alpha } * \omega_{t} \\ + \omega_{t+1} = + \begin{cases} + \frac{(sign(u_{t+1}) * l1 - u_{t+1})}{\frac{m_{t+1}^\text{-p}}{\alpha } + 2 * l2 } + & \text{ if } |u_{t+1}| > l1 \\ + 0.0 + & \text{ otherwise } + \end{cases}\\ + \end{array} + + :math:`m`琛ㄧず绱姞鍣紝:math:`g`琛ㄧず`grads`锛:math:`t`琛ㄧず褰撳墠step锛:math:`u`琛ㄧず闇瑕佹洿鏂扮殑绾挎х郴鏁帮紝:math:`p`琛ㄧず`lr_power`锛:math:`\alpha`琛ㄧず`learning_rate`锛:math:`\omega`琛ㄧず`params`銆 + + .. note:: + .. include:: mindspore.nn.optim_note_sparse.rst + + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + params (Union[list[Parameter], list[dict]]) - 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + + - **lr** - 瀛︿範鐜囧綋鍓嶄笉鏀寔鍙傛暟鍒嗙粍銆 + + .. include:: mindspore.nn.optim_group_weight_decay.rst + + .. include:: mindspore.nn.optim_group_gc.rst + + .. include:: mindspore.nn.optim_group_order.rst + + + initial_accum (float)锛氱疮鍔犲櫒`m`鐨勫垵濮嬪硷紝蹇呴』澶т簬绛変簬闆躲傞粯璁ゅ硷細0.1銆 + + learning_rate (float)锛氬涔犻熺巼鍊煎繀椤讳负闆舵垨姝f暟锛屽綋鍓嶄笉鏀寔鍔ㄦ佸涔犵巼銆傞粯璁ゅ硷細0.001銆 + + lr_power (float)锛氬涔犵巼鐨勫箓鍊硷紝鎺у埗璁粌鏈熼棿瀛︿範鐜囩殑涓嬮檷鏂瑰紡锛屽繀椤诲皬浜庢垨绛変簬闆躲傚鏋渓r_power涓洪浂锛屽垯浣跨敤鍥哄畾鐨勫涔犵巼銆傞粯璁ゅ硷細-0.5銆 + + l1 (float)锛歭1姝e垯鍖栧己搴︼紝蹇呴』澶т簬绛変簬闆躲傞粯璁ゅ硷細0.0銆 + + l2 (float)锛歭2姝e垯鍖栧己搴︼紝蹇呴』澶т簬绛変簬闆躲傞粯璁ゅ硷細0.0銆 + + use_locking (bool)锛氬鏋滀负True锛屽垯鏇存柊鎿嶄綔浣跨敤閿佷繚鎶ゃ傞粯璁ゅ硷細False銆 + + .. 
include:: mindspore.nn.optim_arg_loss_scale.rst + + weight_decay (Union[float, int])锛氳涔樹互鏉冮噸鐨勬潈閲嶈“鍑忓硷紝蹇呴』涓洪浂鎴栨鍊笺傞粯璁ゅ硷細0.0銆 + + 杈撳叆锛 + - **grads** (tuple[Tensor])锛氫紭鍖栧櫒涓璥params`鐨勬搴︼紝shape涓庝紭鍖栧櫒涓殑`params`鐩稿悓銆 + + + 杈撳嚭锛 + tuple[Parameter]锛屾洿鏂扮殑鍙傛暟锛宻hape涓巂params`鐩稿悓銆 + + 寮傚父锛 + TypeError锛歚initial_accum`銆乣learning_rate`銆乣lr_power`銆乣l1`銆乣l2`鎴朻loss_scale`涓嶆槸float銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴杁ict銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + TypeError锛歚use_nesterov`涓嶆槸bool銆 + ValueError锛歚lr_power`澶т簬0銆 + ValueError锛歚loss_scale`灏忎簬绛変簬0銆 + ValueError锛歚initial_accum`銆乣l1`鎴朻l2`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.FTRL(params=net.trainable_params()) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, + ... {'params': no_conv_params}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.FTRL(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆佽缁勭殑姊害涓績鍖栭厤缃甌rue銆 + >>> # no_conv_params鍙傛暟缁勪娇鐢ㄤ紭鍖栧櫒涓殑瀛︿範鐜0.1銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆佹搴︿腑蹇冨寲浣跨敤榛樿鍊糉alse銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + + +.. include::mindspore.nn.optim_target_unique_for_sparse.rst \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.FixedLossScaleUpdateCell.txt b/docs/api/api_python/nn/mindspore.nn.FixedLossScaleUpdateCell.txt new file mode 100644 index 00000000000..a27be34ff09 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.FixedLossScaleUpdateCell.txt @@ -0,0 +1,51 @@ +Class mindspore.nn.FixedLossScaleUpdateCell(loss_scale_value) + + 鍥哄畾姊害鏀惧ぇ绯绘暟鐨勭缁忓厓銆 + + 璇ョ被鏄:class:`mindspore.nn.FixedLossScaleManager`鐨刞get_update_cell`鏂规硶鐨勮繑鍥炲笺 + 璁粌杩囩▼涓紝绫:class:`mindspore.TrainOneStepWithLossScaleCell`浼氳皟鐢ㄨCell銆 + + 鍙傛暟锛 + loss_scale_value (float)锛氬垵濮嬫搴︽斁澶х郴鏁般 + + 杈撳叆锛 + - **loss_scale** (Tensor)锛氳缁冩湡闂寸殑姊害鏀惧ぇ绯绘暟锛宻hape涓:math:`()`锛屽湪褰撳墠绫讳腑锛岃鍊艰蹇界暐銆 + - **overflow** (bool)锛氭槸鍚﹀彂鐢熸孩鍑恒 + + 杈撳嚭锛 + Bool锛屽嵆杈撳叆`overflow`銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` + + 绀轰緥锛 + >>> import numpy as np + >>> from mindspore import Tensor, Parameter, nn, ops + >>> + >>> class Net(nn.Cell): + ... def __init__(self, in_features, out_features)锛 + ... super(Net, self).__init__() + ... self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)), + ... name='weight') + ... self.matmul = ops.MatMul() + ... + ... def construct(self, x)锛 + ... output = self.matmul(x, self.weight) + ... return output + ... 
+        >>> in_features, out_features = 16, 10
+        >>> net = Net(in_features, out_features)
+        >>> loss = nn.MSELoss()
+        >>> optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
+        >>> net_with_loss = nn.WithLossCell(net, loss)
+        >>> manager = nn.FixedLossScaleUpdateCell(loss_scale_value=2**12)
+        >>> train_network = nn.TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_sense=manager)
+        >>> input = Tensor(np.ones([out_features, in_features]), mindspore.float32)
+        >>> labels = Tensor(np.ones([out_features,]), mindspore.float32)
+        >>> output = train_network(input, labels)
+
+
+get_loss_scale()
+
+    获取当前梯度放大系数。
+
\ No newline at end of file
diff --git a/docs/api/api_python/nn/mindspore.nn.LARS.txt b/docs/api/api_python/nn/mindspore.nn.LARS.txt
new file mode 100644
index 00000000000..3b5f46c37a2
--- /dev/null
+++ b/docs/api/api_python/nn/mindspore.nn.LARS.txt
@@ -0,0 +1,50 @@
+Class mindspore.nn.LARS(*args, **kwargs)
+
+    使用LARSUpdate算子实现LARS算法。
+
+    LARS算法采用大量的优化技术。详见论文 `LARGE BATCH TRAINING OF CONVOLUTIONAL NETWORKS `_ 。
+
+    更新公式如下:
+
+    .. math::
+
+        \begin{array}{ll} \\
+            \lambda = \frac{\theta \text{ * } || \omega || }{|| g_{t} || \text{ + } \delta \text{ * } || \omega || } \\
+            \lambda =
+            \begin{cases}
+                \min(\frac{\lambda}{\alpha }, 1)
+                    & \text{ if } clip = True \\
+                \lambda
+                    & \text{ otherwise }
+            \end{cases}\\
+            g_{t+1} = \lambda * (g_{t} + \delta * \omega)
+        \end{array}
+
+    :math:`\theta` 表示 `coefficient` ,:math:`\omega` 表示网络参数,:math:`g` 表示 `gradients` ,:math:`t` 表示当前step,:math:`\delta` 表示 `optimizer` 配置的 `weight_decay` ,:math:`\alpha` 表示 `optimizer` 配置的 `learning_rate` ,:math:`clip` 表示 `use_clip` 。
+
+    参数:
+        optimizer (Optimizer):待封装和修改梯度的MindSpore优化器。
+        epsilon (float):将添加到分母中,提高数值稳定性。默认值:1e-05。
+        coefficient (float):计算局部学习速率的信任系数。默认值:0.001。
+        use_clip (bool):计算局部学习速率时是否裁剪。默认值:False。
+        lars_filter (Function):用于指定使用LARS算法的网络参数。默认值:lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name。
+
+    输入:
+        - **gradients** (tuple[Tensor]):优化器中 `params` 的梯度,shape与优化器中的 `params` 相同。
+
+    输出:
+        Union[Tensor[bool], tuple[Parameter]],取决于 `optimizer` 的输出。
+
+    支持平台:
+        ``Ascend`` ``CPU``
+
+    示例:
+        >>> net = Net()
+        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> opt = nn.Momentum(net.trainable_params(), 0.1, 0.9)
+        >>> opt_lars = nn.LARS(opt, epsilon=1e-08, coefficient=0.02)
+        >>> model = Model(net, loss_fn=loss, optimizer=opt_lars, metrics=None)
\ No newline at end of file
diff --git a/docs/api/api_python/nn/mindspore.nn.Lamb.txt b/docs/api/api_python/nn/mindspore.nn.Lamb.txt
new file mode 100644
index 00000000000..5af0d19924e
--- /dev/null
+++ b/docs/api/api_python/nn/mindspore.nn.Lamb.txt
@@ -0,0 +1,89 @@
+Class mindspore.nn.Lamb(*args, **kwargs)
+
+    LAMB(Layer-wise Adaptive Moments optimizer for Batching training,用于批训练的分层自适应矩优化器)算法优化器。
+
+    LAMB是一种采用分层自适应批优化技术的优化算法。详见论文 `LARGE BATCH OPTIMIZATION FOR DEEP LEARNING: TRAINING BERT IN 76 MINUTES `_ 。
+
+    LAMB优化器旨在不降低精度的情况下增大训练batch size,支持自适应逐元素更新和精确的分层校正。
+
+    参数更新如下:
+
+    .. 
math:: + \begin{gather*} + m_t = \beta_1 m_{t - 1}+ (1 - \beta_1)g_t\\ + v_t = \beta_2 v_{t - 1} + (1 - \beta_2)g_t^2\\ + m_t = \frac{m_t}{\beta_1^t}\\ + v_t = \frac{v_t}{\beta_2^t}\\ + r_t = \frac{m_t}{\sqrt{v_t}+\epsilon}\\ + w_t = w_{t-1} -\eta_t \frac{\| w_{t-1} \|}{\| r_t + \lambda w_{t-1} \|} (r_t + \lambda w_{t-1}) + \end{gather*} + + 鍏朵腑锛宮ath:`m`浠h〃绗竴涓煩鍚戦噺锛:math:`v`浠h〃绗簩涓煩鍚戦噺锛:math:`\eta`琛ㄧず瀛︿範鐜囷紝:math:`\lambda`琛ㄧずLAMB鏉冮噸琛板噺鐜囥 + + + .. note:: + .. include:: mindspore.nn.optim_note_weight_decay.rst + + .. include:: mindspore.nn.optim_note_loss_scale.rst + + 鍙傛暟锛 + params (Union[list[Parameter], list[dict]]): 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + .. include:: mindspore.nn.optim_group_lr.rst + .. include:: mindspore.nn.optim_group_weight_decay.rst + .. include:: mindspore.nn.optim_group_gc.rst + .. include:: mindspore.nn.optim_group_order.rst + + learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + + beta1 (float)锛氱涓鐭╃殑鎸囨暟琛板噺鐜囥傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.9銆 + + beta2 (float)锛氱浜岀煩鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.999銆 + + eps (float)锛氬皢娣诲姞鍒板垎姣嶄腑锛屼互鎻愰珮鏁板肩ǔ瀹氭с傚繀椤诲ぇ浜0銆傞粯璁ゅ硷細1e-6銆 + + weight_decay (float)锛氭潈閲嶈“鍑忥紙L2 penalty锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + + 杈撳叆锛 + - **gradients** (tuple[Tensor])锛歚params`鐨勬搴︼紝shape涓巂params`鐩稿悓銆 + + 杈撳嚭锛 + tuple[bool]锛屾墍鏈夊厓绱犻兘涓篢rue銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴杁ict銆 + TypeError锛歚beta1`銆乣beta2`鎴朻eps`涓嶆槸float銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + ValueError锛歚eps`灏忎簬绛変簬0銆 + ValueError锛歚beta1`銆乣beta2`涓嶅湪锛0.0,1.0锛夎寖鍥村唴銆 + ValueError锛歚weight_decay`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.Lamb(params=net.trainable_params(), learning_rate=0.1) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> poly_decay_lr = learning_rate_schedule.PolynomialDecayLR(learning_rate=0.1, end_learning_rate=0.01, + ... decay_steps=4, power = 0.5) + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, + ... {'params': no_conv_params, 'lr': poly_decay_lr}, + ... {'order_params': net.trainable_params(0.01)}] + >>> optim = nn.Lamb(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆佽缁勭殑姊害涓績鍖栭厤缃甌rue銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勮“鍑忓涔犵巼銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆佹搴︿腑蹇冨寲浣跨敤榛樿鍊糉alse銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.LazyAdam.txt b/docs/api/api_python/nn/mindspore.nn.LazyAdam.txt new file mode 100644 index 00000000000..46645e9b525 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.LazyAdam.txt @@ -0,0 +1,93 @@ +Class mindspore.nn.LazyAdam(*args, **kwargs) + + 閫氳繃Adaptive Moment Estimation (Adam)绠楁硶鏇存柊姊害銆傝鍙傞槄璁烘枃`Adam: A Method for Stochastic Optimization `_銆 + + 褰撴搴︾█鐤忔椂锛屾浼樺寲鍣ㄥ皢浣跨敤Lazy Adam绠楁硶銆 + + 鏇存柊鍏紡濡備笅锛 + + .. 
math:: + \begin{array}{ll} \\ + m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\ + v_{t+1} = \beta_2 * v_{t} + (1 - \beta_2) * g * g \\ + l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ + w_{t+1} = w_{t} - l * \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon} + \end{array} + + :math:`m`浠h〃绗竴涓煩鍚戦噺`moment1`锛:math:`v`浠h〃绗簩涓煩鍚戦噺`moment2`锛:math:`g`浠h〃`gradients`锛:math:`l`浠h〃缂╂斁鍥犲瓙锛:math:`\beta_1,\beta_2`浠h〃`beta1`鍜宍beta2`锛:math:`t`浠h〃褰撳墠step锛:math:`beta_1^t`鍜:math:`beta_2^t`浠h〃`beta1_power`鍜宍beta2_power`锛:math:`\alpha`浠h〃``learning_rate`锛:math:`w`浠h〃`params`锛:math:`\epsilon`浠h〃`eps`銆 + + + .. note:: + .. include:: mindspore.nn.optim_note_sparse.rst + 闇瑕佹敞鎰忕殑鏄紝姊害绋鐤忔椂璇ヤ紭鍖栧櫒鍙洿鏂扮綉缁滃弬鏁扮殑褰撳墠鐨勭储寮曚綅缃紝绋鐤忚涓轰笉绛夊悓浜嶢dam绠楁硶銆 + + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + param (Union[list[Parameter], list[dict]]) - 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + .. include:: mindspore.nn.optim_group_lr.rst + .. include:: mindspore.nn.optim_group_weight_decay.rst + .. include:: mindspore.nn.optim_group_gc.rst + .. include:: mindspore.nn.optim_group_order.rst + + learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): 榛樿鍊硷細1e-3銆 + .. include:: mindspore.nn.optim_dynamic_lr.rst + + beta1 (float)锛歚moment1` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.9銆 + + beta2 (float)锛歮oment2` 鐨勬寚鏁拌“鍑忕巼銆傚弬鏁拌寖鍥达紙0.0,1.0锛夈傞粯璁ゅ硷細0.999銆 + + eps (float)锛氬皢娣诲姞鍒板垎姣嶄腑锛屼互鎻愰珮鏁板肩ǔ瀹氭с傚繀椤诲ぇ浜0銆傞粯璁ゅ硷細1e-8銆 + + use_locking (bool)锛氭槸鍚﹀鍙傛暟鏇存柊鍔犻攣淇濇姢銆傚鏋滀负True锛屽垯 `w` 銆乣m` 鍜 `v` 鐨凾ensor鏇存柊灏嗗彈鍒伴攣鐨勪繚鎶ゃ傚鏋滀负False锛屽垯缁撴灉涓嶅彲棰勬祴銆傞粯璁ゅ硷細False銆 + + use_nesterov (bool)锛氭槸鍚︿娇鐢∟esterov Accelerated Gradient (NAG)绠楁硶鏇存柊姊害銆傚鏋滀负True锛屼娇鐢∟AG鏇存柊姊害銆傚鏋滀负False锛屽垯鍦ㄤ笉浣跨敤NAG鐨勬儏鍐典笅鏇存柊姊害銆傞粯璁ゅ硷細False銆 + + weight_decay (Union[float, int])锛氭潈閲嶈“鍑忥紙L2 penalty锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + + .. include:: mindspore.nn.optim_arg_loss_scale.rst + + 杈撳叆锛 + - **gradients** (tuple[Tensor])锛歚params`鐨勬搴︼紝shape涓巂params`鐩稿悓銆 + + 杈撳嚭锛 + Tensor[bool]锛屽间负True銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴栧瓧鍏搞 + TypeError锛歚beta1`銆乣beta2`銆乣eps`鎴朻loss_scale`涓嶆槸float銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + TypeError锛歚use_locking`鎴朻use_nesterov`涓嶆槸bool銆 + ValueError锛歚loss_scale`鎴朻eps`灏忎簬鎴栫瓑浜0銆 + ValueError锛歚beta1`銆乣beta2`涓嶅湪锛0.0,1.0锛夎寖鍥村唴銆 + ValueError锛歚weight_decay`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.LazyAdam(params=net.trainable_params()) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆佽缁勭殑姊害涓績鍖栭厤缃甌rue銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勫涔犵巼0.01銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆佹搴︿腑蹇冨寲浣跨敤榛樿鍊糉alse銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + + +.. 
include:: mindspore.nn.optim_target_unique_for_sparse.rst + + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.Metric.rst b/docs/api/api_python/nn/mindspore.nn.Metric.rst deleted file mode 100644 index b177cda2d90..00000000000 --- a/docs/api/api_python/nn/mindspore.nn.Metric.rst +++ /dev/null @@ -1,76 +0,0 @@ -mindspore.nn.Metric -==================== - -.. py:class:: mindspore.nn.Metric - - 用于计算评估指标的基类。 - - 在计算评估指标时需要调用 `clear` 、 `update` 和 `eval` 三个方法,在继承该类自定义评估指标时,也需要实现这三个方法。其中,`update` 用于计算中间过程的内部结果,`eval` 用于计算最终评估结果,`clear` 用于重置中间结果。 - 请勿直接使用该类,需使用子类如 :class:`mindspore.nn.MAE` 、 :class:`mindspore.nn.Recall` 等。 - - .. py:method:: clear() - :abstract: - - 描述了清除内部评估结果的行为。 - - .. note:: - 所有子类都必须重写此接口。 - - .. py:method:: eval() - :abstract: - - 描述了计算最终评估结果的行为。 - - .. note:: - 所有子类都必须重写此接口。 - - .. py:method:: indexes - :property: - - 获取当前的 `indexes` 值。默认为None,调用 `set_indexes` 可修改 `indexes` 值。 - - .. py:method:: set_indexes(indexes) - - 该接口用于重排 `update` 的输入。 - - 给定(label0, label1, logits)作为 `update` 的输入,将 `indexes` 设置为[2, 1],则最终使用(logits, label1)作为 `update` 的真实输入。 - - .. note:: - 在继承该类自定义评估函数时,需要用装饰器 `mindspore.nn.rearrange_inputs` 修饰 `update` 方法,否则配置的 `indexes` 值不生效。 - - - **参数:** - - **indexes** (List(int)) - logits和标签的目标顺序。 - - **输出:** - - :class:`Metric` ,类实例本身。 - - **样例:** - - >>> import numpy as np - >>> from mindspore import nn, Tensor - >>> - >>> x = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]])) - >>> y = Tensor(np.array([1, 0, 1])) - >>> y2 = Tensor(np.array([0, 0, 1])) - >>> metric = nn.Accuracy('classification').set_indexes([0, 2]) - >>> metric.clear() - >>> # indexes为[0, 2],使用x作为预测值,y2作为真实标签 - >>> metric.update(x, y, y2) - >>> accuracy = metric.eval() - >>> print(accuracy) - 0.3333333333333333 - - .. py:method:: update(*inputs) - :abstract: - - 描述了更新内部评估结果的行为。 - - .. note:: - 所有子类都必须重写此接口。 - - **参数:** - - **inputs** - 可变长度输入参数列表。通常是预测值和对应的真实标签。 \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.Momentum.rst b/docs/api/api_python/nn/mindspore.nn.Momentum.rst deleted file mode 100644 index b9e63f2b16a..00000000000 --- a/docs/api/api_python/nn/mindspore.nn.Momentum.rst +++ /dev/null @@ -1,91 +0,0 @@ -mindspore.nn.Momentum -====================== - -.. py:class:: mindspore.nn.Momentum(*args, **kwargs) - - Momentum算法优化器。 - - 有关更多详细信息,请参阅论文 `On the importance of initialization and momentum in deep learning `_。 - - .. math:: - v_{t+1} = v_{t} \ast u + grad - - 如果 `use_nesterov` 为True: - - .. math:: - p_{t+1} = p_{t} - (grad \ast lr + v_{t+1} \ast u \ast lr) - - 如果 `use_nesterov` 为False: - - .. math:: - p_{t+1} = p_{t} - lr \ast v_{t+1} - - 其中,:math:`grad` 、:math:`lr` 、:math:`p` 、:math:`v` 和 :math:`u` 分别表示梯度、学习率、参数、矩(Moment)和动量(Momentum)。 - - .. 
note:: - 在参数未分组时,优化器配置的 `weight_decay` 应用于名称含有"beta"或"gamma"的网络参数,通过网络参数分组可调整权重衰减策略。分组时,每组网络参数均可配置 `weight_decay` ,若未配置,则该组网络参数使用优化器中配置的 `weight_decay` 。 - - **参数:** - - - **params** (Union[list[Parameter], list[dict]]): 必须是 `Parameter` 组成的列表或字典组成的列表。当列表元素是字典时,字典的键可以是"params"、"lr"、"weight_decay"、"grad_centralization"和"order_params": - - - ** params** - 必填。当前组别的权重,该值必须是 `Parameter` 列表。 - - ** lr** - 可选。如果键中存在"lr",则使用对应的值作为学习率。如果没有,则使用优化器中配置的 `learning_rate` 作为学习率。 - - ** weight_decay** - 可选。如果键中存在"weight_decay”,则使用对应的值作为权重衰减值。如果没有,则使用优化器中配置的 `weight_decay` 作为权重衰减值。 - - ** grad_centralization** - 可选。如果键中存在"grad_centralization",则使用对应的值,该值必须为布尔类型。如果没有,则认为 `grad_centralization` 为False。该参数仅适用于卷积层。 - - ** order_params** - 可选。对应值是预期的参数更新顺序。当使用参数分组功能时,通常使用该配置项保持 `parameters` 的顺序以提升性能。如果键中存在"order_params",则会忽略该组配置中的其他键。"order_params"中的参数必须在某一组 `params` 参数中。 - - - **learning_rate** (Union[float, int, Tensor, Iterable, LearningRateSchedule]): - - - **float** - 固定的学习率。必须大于等于零。 - - **int** - 固定的学习率。必须大于等于零。整数类型会被转换为浮点数。 - - **Tensor** - 可以是标量或一维向量。标量是固定的学习率。一维向量是动态的学习率,第i步将取向量中第i个值作为学习率。 - - **Iterable** - 动态的学习率。第i步将取迭代器第i个值作为学习率。 - - **LearningRateSchedule** - 动态的学习率。在训练过程中,优化器将使用步数(step)作为输入,调用 `LearningRateSchedule` 实例来计算当前学习率。 - - - **momentum** (float) - 浮点数类型的超参,表示移动平均的动量。必须等于或大于0.0。 - - **weight_decay** (int, float) - 权重衰减(L2 penalty)值。必须大于等于0.0。默认值:0.0。 - - **loss_scale** (float) - 梯度缩放系数,必须大于0。如果 `loss_scale` 是整数,它将被转换为浮点数。通常使用默认值,仅当训练时使用了 `FixedLossScaleManager`,且 `FixedLossScaleManager` 的 `drop_overflow_update` 属性配置为False时,此值需要与 `FixedLossScaleManager` 中的 `loss_scale` 相同。有关更多详细信息,请参阅 :class:`mindspore.FixedLossScaleManager` 。默认值:1.0。 - - **use_nesterov** (bool) - 是否使用Nesterov Accelerated Gradient (NAG)算法更新梯度。默认值:False。 - - **输入:** - - **gradients** (tuple[Tensor]) - `params` 的梯度,形状(shape)与 `params` 相同。 - - **输出:** - - tuple[bool],所有元素都为True。 - - **异常:** - - - **TypeError** - `learning_rate` 不是int、float、Tensor、Iterable或LearningRateSchedule。 - - **TypeError** - `parameters` 的元素不是 `Parameter` 或字典。 - - **TypeError** - `loss_scale` 或 `momentum` 不是float。 - - **TypeError** - `weight_decay` 不是float或int。 - - **TypeError** - `use_nesterov` 不是bool。 - - **ValueError** - `loss_scale` 小于或等于0。 - - **ValueError** - `weight_decay` 或 `momentum` 小于0。 - - **支持平台:** - - ``Ascend`` ``GPU`` ``CPU`` - - **样例:** - - >>> net = Net() - >>> #1) 所有参数使用相同的学习率和权重衰减 - >>> optim = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) - >>> - >>> #2) 使用参数分组并设置不同的值 - >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, - ... {'params': no_conv_params, 'lr': 0.01}, - ... 
{'order_params': net.trainable_params()}] - >>> optim = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0) - >>> # conv_params参数组将使用优化器中的学习率0.1、该组的权重衰减0.01、该组的梯度中心化配置True。 - >>> # no_conv_params参数组将使用该组的学习率0.01、优化器中的权重衰减0.0、梯度中心化使用默认值False。 - >>> # 优化器按照"order_params"配置的参数顺序更新参数。 - >>> - >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.Optimizer.rst b/docs/api/api_python/nn/mindspore.nn.Optimizer.rst deleted file mode 100644 index dd0f6b5fcd7..00000000000 --- a/docs/api/api_python/nn/mindspore.nn.Optimizer.rst +++ /dev/null @@ -1,143 +0,0 @@ -mindspore.nn.Optimizer -====================== - -.. py:class:: mindspore.nn.Optimizer(learning_rate, parameters, weight_decay=0.0, loss_scale=1.0) - - 用于参数更新的优化器基类。不要直接使用这个类,请实例化它的一个子类。 - - 优化器支持参数分组。当参数分组时,每组参数均可配置不同的学习率(`lr` )、权重衰减(`weight_decay`)和梯度中心化(`grad_centralization`)策略。 - - .. note:: - 在参数未分组时,优化器配置的 `weight_decay` 应用于名称含有"beta"或"gamma"的网络参数,通过网络参数分组可调整权重衰减策略。分组时,每组网络参数均可配置 `weight_decay` ,若未配置,则该组网络参数使用优化器中配置的 `weight_decay`。 - - **参数:** - - - **learning_rate** (Union[float, int, Tensor, Iterable, LearningRateSchedule]): - - - **float** - 固定的学习率。必须大于等于零。 - - **int** - 固定的学习率。必须大于等于零。整数类型会被转换为浮点数。 - - **Tensor** - 可以是标量或一维向量。标量是固定的学习率。一维向量是动态的学习率,第i步将取向量中第i个值作为学习率。 - - **Iterable** - 动态的学习率。第i步将取迭代器第i个值作为学习率。 - - **LearningRateSchedule** - 动态的学习率。在训练过程中,优化器将使用步数(step)作为输入,调用 `LearningRateSchedule` 实例来计算当前学习率。 - - - **parameters (Union[list[Parameter], list[dict]])** - 必须是 `Parameter` 组成的列表或字典组成的列表。当列表元素是字典时,字典的键可以是"params"、"lr"、"weight_decay"、"grad_centralization"和"order_params": - - - **params** - 必填。当前组别的权重,该值必须是 `Parameter` 列表。 - - **lr** - 可选。如果键中存在"lr",则使用对应的值作为学习率。如果没有,则使用优化器中配置的 `learning_rate` 作为学习率。 - - **weight_decay** - 可选。如果键中存在"weight_decay”,则使用对应的值作为权重衰减值。如果没有,则使用优化器中配置的 `weight_decay` 作为权重衰减值。 - - **grad_centralization** - 可选。如果键中存在"grad_centralization",则使用对应的值,该值必须为布尔类型。如果没有,则认为 `grad_centralization` 为False。该参数仅适用于卷积层。 - - **order_params** - 可选。对应值是预期的参数更新顺序。当使用参数分组功能时,通常使用该配置项保持 `parameters` 的顺序以提升性能。如果键中存在"order_params",则会忽略该组配置中的其他键。"order_params"中的参数必须在某一组 `params` 参数中。 - - - **weight_decay** (Union[float, int]) - 权重衰减的整数或浮点值。必须等于或大于0。如果 `weight_decay` 是整数,它将被转换为浮点数。默认值:0.0。 - - **loss_scale** (float) - 梯度缩放系数,必须大于0。如果 `loss_scale` 是整数,它将被转换为浮点数。通常使用默认值,仅当训练时使用了 `FixedLossScaleManager` ,且 `FixedLossScaleManager` 的 `drop_overflow_update` 属性配置为False时,此值需要与 `FixedLossScaleManager` 中的 `loss_scale` 相同。有关更多详细信息,请参阅 :class:`mindspore.FixedLossScaleManager`。默认值:1.0。 - - **异常:** - - - **TypeError** - `learning_rate` 不是int、float、Tensor、Iterable或LearningRateSchedule。 - - **TypeError** - `parameters` 的元素不是Parameter或字典。 - - **TypeError** - `loss_scale` 不是float。 - - **TypeError** - `weight_decay` 不是float或int。 - - **ValueError** - `loss_scale` 小于或等于0。 - - **ValueError** - `weight_decay` 小于0。 - - **ValueError** - `learning_rate` 是一个Tensor,但是Tensor的维度大于1。 - - **支持平台:** - - ``Ascend`` ``GPU`` ``CPU`` - - .. py:method:: broadcast_params(optim_result) - - 按参数组的顺序进行参数广播。 - - **参数:** - - **optim_result** (bool) - 参数更新结果。该输入用来保证参数更新完成后才执行参数广播。 - - **返回:** - - bool,状态标志。 - - .. py:method:: decay_weight(gradients) - - 衰减权重。 - - 一种减少深度学习神经网络模型过拟合的方法。继承 :class:`mindspore.nn.Optimizer` 自定义优化器时,可调用该接口进行权重衰减。 - - **参数:** - - **gradients** (tuple[Tensor]) - 网络参数的梯度,形状(shape)与网络参数相同。 - - **返回:** - - tuple[Tensor],衰减权重后的梯度。 - - .. 
py:method:: get_lr() - - 优化器调用该接口获取当前步骤(step)的学习率。继承 :class:`mindspore.nn.Optimizer` 自定义优化器时,可在参数更新前调用该接口获取学习率。 - - **返回:** - - float,当前步骤的学习率。 - - .. py:method:: get_lr_parameter(param) - - 用于在使用网络参数分组功能,且为不同组别配置不同的学习率时,获取指定参数的学习率。 - - **参数:** - - **param** (Union[Parameter, list[Parameter]]) - `Parameter` 或 `Parameter` 列表。 - - **返回:** - - Parameter,单个 `Parameter` 或 `Parameter` 列表。如果使用了动态学习率,返回用于计算学习率的 `LearningRateSchedule` 或 `LearningRateSchedule` 列表。 - - **样例:** - - >>> from mindspore import nn - >>> net = Net() - >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'lr': 0.05}, - ... {'params': no_conv_params, 'lr': 0.01}] - >>> optim = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0) - >>> conv_lr = optim.get_lr_parameter(conv_params) - >>> print(conv_lr[0].asnumpy()) - 0.05 - - .. py:method:: gradients_centralization(gradients) - - 梯度中心化。 - - 一种优化卷积层参数以提高深度学习神经网络模型训练速度的方法。继承 :class:`mindspore.nn.Optimizer` 自定义优化器时,可调用该接口进行梯度中心化。 - - **参数:** - - **gradients** (tuple[Tensor]) - 网络参数的梯度,形状(shape)与网络参数相同。 - - **返回:** - - tuple[Tensor],梯度中心化后的梯度。 - - .. py:method:: scale_grad(gradients) - - 用于在混合精度场景还原梯度。 - - 继承 :class:`mindspore.nn.Optimizer` 自定义优化器时,可调用该接口还原梯度。 - - **参数:** - - **gradients** (tuple[Tensor]) - 网络参数的梯度,形状(shape)与网络参数相同。 - - **返回:** - - tuple[Tensor],还原后的梯度。 - - .. py:method:: target - :property: - - 该属性用于指定在主机(host)上还是设备(device)上更新参数。输入类型为str,只能是'CPU','Ascend'或'GPU'。 - - .. py:method:: unique - :property: - - 该属性表示是否在优化器中进行梯度去重,通常用于稀疏网络。如果梯度是稀疏的则设置为True。如果前向稀疏网络已对权重去重,即梯度是稠密的,则设置为False。未设置时默认值为True。 \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.ProximalAdagrad.txt b/docs/api/api_python/nn/mindspore.nn.ProximalAdagrad.txt new file mode 100644 index 00000000000..cfa47abae42 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.ProximalAdagrad.txt @@ -0,0 +1,82 @@ +Class mindspore.nn.ProximalAdagrad(*args, **kwargs) + + 浣跨敤ApplyProximalAdagrad绠楀瓙瀹炵幇ProximalAdagrad绠楁硶銆 + + ProximalAdagrad鐢ㄤ簬鍦ㄧ嚎瀛︿範鍜岄殢鏈轰紭鍖栥 + 璇峰弬闃呰鏂嘸Efficient Learning using Forward-Backward Splitting `_銆 + + .. math:: + accum_{t+1} = accum_{t} + grad * grad + + .. math:: + \text{prox_v} = var_{t} - lr * grad * \frac{1}{\sqrt{accum_{t+1}}} + + .. math:: + var_{t+1} = \frac{sign(\text{prox_v})}{1 + lr * l2} * \max(\left| \text{prox_v} \right| - lr * l1, 0) + + 鍏朵腑锛実rad銆乴r銆乿ar銆乤ccum鍜宼鍒嗗埆琛ㄧず`grads`, `learning_rate`, `params`銆佺疮鍔犲櫒鍜屽綋鍓峴tep銆 + + .. note:: + .. include:: mindspore.nn.optim_note_sparse.rst + + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + param (Union[list[Parameter], list[dict]]) - 蹇呴』鏄 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + .. include:: mindspore.nn.optim_group_lr.rst + .. include:: mindspore.nn.optim_group_weight_decay.rst + .. include:: mindspore.nn.optim_group_gc.rst + .. include:: mindspore.nn.optim_group_order.rst + + accum (float)锛氱疮鍔犲櫒`accum`鐨勫垵濮嬪硷紝璧峰鍊煎繀椤讳负闆舵垨姝e笺傞粯璁ゅ硷細0.1銆 + + learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): 榛樿鍊硷細1e-3銆 + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + + l1 (float):l1姝e垯鍖栧己搴︼紝蹇呴』澶т簬鎴栫瓑浜庨浂銆傞粯璁ゅ硷細0.0銆 + l2 (float):l2姝e垯鍖栧己搴︼紝蹇呴』澶т簬鎴栫瓑浜庨浂銆傞粯璁ゅ硷細0.0銆 + use_locking (bool)锛氬鏋滀负True锛屽垯鏇存柊鎿嶄綔浣跨敤閿佷繚鎶ゃ傞粯璁ゅ硷細False銆 + .. 
include:: mindspore.nn.optim_arg_loss_scale.rst + weight_decay (Union[float, int])锛氳涔樹互鏉冮噸鐨勬潈閲嶈“鍑忓硷紝蹇呴』涓洪浂鎴栨鍊笺傞粯璁ゅ硷細0.0銆 + + 杈撳叆锛 + - **grads** (tuple[Tensor]) - 浼樺寲鍣ㄤ腑`params`鐨勬搴︼紝shape涓庝紭鍖栧櫒涓殑`params`鐩稿悓銆 + + 杈撳嚭锛 + Tensor[bool]锛屽间负True銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴栧瓧鍏搞 + TypeError锛歚accum`銆乣l1`銆乣l2`鎴朻loss_scale`涓嶆槸float銆 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + ValueError锛歚loss_scale`灏忎簬鎴栫瓑浜0銆 + ValueError锛歚accum`銆乣l1`銆乣l2`鎴朻weight_decay`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.ProximalAdagrad(params=net.trainable_params()) + >>> + >>> #2) 浣跨敤鍙傛暟缁勫苟璁剧疆涓嶅悓鐨勫 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.ProximalAdagrad(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆佽缁勭殑姊害涓績鍖栭厤缃甌rue銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勫涔犵巼0.01銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆佹搴︿腑蹇冨寲浣跨敤榛樿鍊糉alse銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + + +.. include:: mindspore.nn.optim_target_unique_for_sparse.rst + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.RMSProp.txt b/docs/api/api_python/nn/mindspore.nn.RMSProp.txt new file mode 100644 index 00000000000..5955f4e86d7 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.RMSProp.txt @@ -0,0 +1,101 @@ +Class mindspore.nn.RMSProp(*args, **kwargs) + + 瀹炵幇鍧囨柟鏍逛紶鎾紙RMSProp锛夌畻娉曘 + + 鏍规嵁RMSProp绠楁硶鏇存柊`params`锛岀畻娉曡瑙乕http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf]绗29椤点 + + 鍏紡濡備笅锛 + + .. math:: + s_{t+1} = \rho s_{t} + (1 - \rho)(\nabla Q_{i}(w))^2 + + .. math:: + m_{t+1} = \beta m_{t} + \frac{\eta} {\sqrt{s_{t+1} + \epsilon}} \nabla Q_{i}(w) + + .. math:: + w = w - m_{t+1} + + 绗竴涓柟绋嬭绠楁瘡涓潈閲嶇殑骞虫柟姊害鐨勭Щ鍔ㄥ钩鍧囥傜劧鍚庡皢姊害闄や互:math:`\sqrt{ms_{t+1} + \epsilon}`銆 + + 濡傛灉centered涓篢rue锛 + + .. math:: + g_{t+1} = \rho g_{t} + (1 - \rho)\nabla Q_{i}(w) + + .. math:: + s_{t+1} = \rho s_{t} + (1 - \rho)(\nabla Q_{i}(w))^2 + + .. math:: + m_{t+1} = \beta m_{t} + \frac{\eta} {\sqrt{s_{t+1} - g_{t+1}^2 + \epsilon}} \nabla Q_{i}(w) + + .. math:: + w = w - m_{t+1} + + 鍏朵腑:math:`w`浠h〃寰呮洿鏂扮殑缃戠粶鍙傛暟`params`銆 + :math:`g_{t+1}`鏄钩鍧囨搴︺ + :math:`s_{t+1}`鏄潎鏂规搴︺ + :math:`m_{t+1}`鏄痬oment锛宍w`鐨刣elta銆 + :math:`\rho`浠h〃`decay`銆:math:`\beta`鏄姩閲忛」锛岃〃绀篳momentum`銆 + :math:`\epsilon`鏄钩婊戦」锛屽彲浠ラ伩鍏嶉櫎浠ラ浂锛岃〃绀篳epsilon`銆 + :math:`\eta`鏄涔犵巼锛岃〃绀篳learning_rate`銆:math:`\nabla Q_{i}(w)`鏄搴︼紝琛ㄧず`gradients`銆 + :math:`t`琛ㄧず褰撳墠step銆 + + .. note:: + .. include:: mindspore.nn.optim_note_weight_decay.rst + + 鍙傛暟锛 + params (Union[list[Parameter], list[dict]])锛氬繀椤绘槸 `Parameter` 缁勬垚鐨勫垪琛ㄦ垨瀛楀吀缁勬垚鐨勫垪琛ㄣ傚綋鍒楄〃鍏冪礌鏄瓧鍏告椂锛屽瓧鍏哥殑閿彲浠ユ槸"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"锛 + + .. include:: mindspore.nn.optim_group_param.rst + .. include:: mindspore.nn.optim_group_lr.rst + .. include:: mindspore.nn.optim_group_weight_decay.rst + .. include:: mindspore.nn.optim_group_gc.rst + .. 
include:: mindspore.nn.optim_group_order.rst + + learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule])锛氶粯璁ゅ硷細0.1銆 + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + decay (float)锛氳“鍑忕巼銆傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.9銆 + momentum (float)锛欶loat绫诲瀷鐨勮秴鍙傛暟锛岃〃绀虹Щ鍔ㄥ钩鍧囩殑鍔ㄩ噺锛坢omentum锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + epsilon (float)锛氬皢娣诲姞鍒板垎姣嶄腑锛屼互鎻愰珮鏁板肩ǔ瀹氭с傚彇鍊煎ぇ浜0銆傞粯璁ゅ硷細1e-10銆 + use_locking (bool)锛氭槸鍚﹀鍙傛暟鏇存柊鍔犻攣淇濇姢銆傞粯璁ゅ硷細False銆 + centered (bool)锛氬鏋滀负True锛屽垯姊害灏嗛氳繃姊害鐨勪及璁℃柟宸繘琛屽綊涓銆傞粯璁ゅ硷細False銆 + .. include:: mindspore.nn.optim_arg_loss_scale.rst + weight_decay (Union[float, int])锛氭潈閲嶈“鍑忥紙L2 penalty锛夈傚繀椤诲ぇ浜庣瓑浜0銆傞粯璁ゅ硷細0.0銆 + + 杈撳叆锛 + - **gradients** 锛坱uple[Tensor]锛 - `params`鐨勬搴︼紝shape涓巂params`鐩稿悓銆 + + 杈撳嚭锛 + Tensor[bool]锛屽间负True銆 + + 寮傚父锛 + TypeError锛歚learning_rate`涓嶆槸int銆乫loat銆乀ensor銆両terable鎴朙earningRateSchedule銆 + TypeError锛歚decay`銆乣momentum`銆乣epsilon`鎴朻loss_scale`涓嶆槸float銆 + TypeError锛歚parameters`鐨勫厓绱犱笉鏄疨arameter鎴栧瓧鍏搞 + TypeError锛歚weight_decay`涓嶆槸float鎴杋nt銆 + TypeError锛歚use_locking`鎴朻centered`涓嶆槸bool銆 + ValueError锛歚epsilon`灏忎簬鎴栫瓑浜0銆 + ValueError锛歚decay`鎴朻momentum`灏忎簬0銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> #1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.RMSProp(params=net.trainable_params(), learning_rate=0.1) + >>> + >>> #2) 浣跨敤鍙傛暟鍒嗙粍骞惰缃笉鍚岀殑鍊 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'grad_centralization':True}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # conv_params鍙傛暟缁勫皢浣跨敤浼樺寲鍣ㄤ腑鐨勫涔犵巼0.1銆佽缁勭殑鏉冮噸琛板噺0.01銆佽缁勭殑姊害涓績鍖栭厤缃甌rue銆 + >>> # no_conv_params鍙傛暟缁勫皢浣跨敤璇ョ粍鐨勫涔犵巼0.01銆佷紭鍖栧櫒涓殑鏉冮噸琛板噺0.0銆佹搴︿腑蹇冨寲浣跨敤榛樿鍊糉alse銆 + >>> # 浼樺寲鍣ㄦ寜鐓"order_params"閰嶇疆鐨勫弬鏁伴『搴忔洿鏂板弬鏁般 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.SGD.txt b/docs/api/api_python/nn/mindspore.nn.SGD.txt new file mode 100644 index 00000000000..00ef719883f --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.SGD.txt @@ -0,0 +1,88 @@ +mindspore.nn.SGD +================ + +.. py:class:: mindspore.nn.SGD(*args, **kwargs) + + 瀹炵幇闅忔満姊害涓嬮檷銆傚姩閲忓彲閫夈 + + SGD鐩稿叧浠嬬粛鍙傝 `SGD `_ 銆 + + Nesterov鍔ㄩ噺鍏紡鍙傝璁烘枃 `On the importance of initialization and momentum in deep learning `_ 銆 + + .. math:: + v_{t+1} = u \ast v_{t} + gradient \ast (1-dampening) + + 濡傛灉nesterov涓篢rue锛 + + .. math:: + p_{t+1} = p_{t} - lr \ast (gradient + u \ast v_{t+1}) + + 濡傛灉nesterov涓篎alse锛 + + .. math:: + p_{t+1} = p_{t} - lr \ast v_{t+1} + + 闇瑕佹敞鎰忕殑鏄紝瀵逛簬璁粌鐨勭涓姝 :math:`v_{t+1} = gradient`銆傚叾涓紝p銆乿鍜寀鍒嗗埆琛ㄧず `parameters`銆乣accum` 鍜 `momentum`銆 + + .. note:: + + .. include:: mindspore.nn.optim_note_weight_decay.rst + + **鍙傛暟锛** + + - **params** (Union[list[Parameter], list[dict]]): 褰 `params` 涓轰細鏇存柊鐨 `Parameter` 鍒楄〃鏃讹紝`params` 涓殑鍏冪礌蹇呴』涓虹被 `Parameter`銆傚綋 `params` 涓 `dict` 鍒楄〃鏃讹紝"params"銆"lr"銆"weight_decay"銆"grad_centralization"鍜"order_params"涓哄彲浠ヨВ鏋愮殑閿 + .. include:: mindspore.nn.optim_group_param.rst + .. include:: mindspore.nn.optim_group_lr.rst + .. include:: mindspore.nn.optim_group_weight_decay.rst + .. include:: mindspore.nn.optim_group_gc.rst + .. 
include:: mindspore.nn.optim_group_order.rst + + - **learning_rate** (Union[float, Tensor, Iterable, LearningRateSchedule]): 榛樿鍊硷細0.1銆 + .. include:: mindspore.nn.optim_arg_dynamic_lr.rst + + - **momentum** (float): 娴偣鍔ㄩ噺锛屽繀椤诲ぇ浜庣瓑浜0.0銆傞粯璁ゅ硷細0.0銆 + - **dampening** (float): 娴偣鍔ㄩ噺闃诲凹鍊硷紝蹇呴』澶т簬绛変簬0.0銆傞粯璁ゅ硷細0.0銆 + - **weight_decay** (float): 鏉冮噸琛板噺锛圠2 penalty锛夛紝蹇呴』澶т簬绛変簬0銆傞粯璁ゅ硷細0.0銆 + - **nesterov** (bool): 鍚敤Nesterov鍔ㄩ噺銆傚鏋滀娇鐢∟esterov锛屽姩閲忓繀椤讳负姝o紝闃诲凹蹇呴』绛変簬0.0銆傞粯璁ゅ硷細False銆 + .. include:: mindspore.nn.optim_arg_loss_scale.rst + + **杈撳叆锛** + + **gradients** (tuple[Tensor])锛歚params` 鐨勬搴︼紝shape涓 `params` 鐩稿悓銆 + + **杈撳嚭锛** + + Tensor[bool]锛屽间负True銆 + + **寮傚父锛** + + **ValueError锛** 鍔ㄩ噺銆侀樆灏兼垨閲嶉噺琛板噺鍊煎皬浜0.0銆 + + **鏀寔骞冲彴锛** + + ``Ascend`` ``GPU`` ``CPU`` + + **鏍蜂緥锛** + + .. code-block:: + + >>> net = Net() + >>> # 1) 鎵鏈夊弬鏁颁娇鐢ㄧ浉鍚岀殑瀛︿範鐜囧拰鏉冮噸琛板噺 + >>> optim = nn.SGD(params=net.trainable_params()) + >>> + >>> # 2) 浣跨敤鍙傛暟缁勫苟璁剧疆涓嶅悓鐨勫 + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params,'grad_centralization':True}, + ... {'params': no_conv_params, 'lr': 0.01}, + ... {'order_params': net.trainable_params()}] + >>> optim = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # con_params鐨勫弬鏁板皢浣跨敤榛樿瀛︿範鐜0.1銆侀粯璁ゆ潈閲嶈“鍑0.0銆佹搴﹂泦涓害涓篢rue銆 + >>> # + >>> # no_con_params鐨勫弬鏁板皢浣跨敤瀛︿範鐜0.01銆侀粯璁ゆ潈閲嶈“鍑0.0銆佹搴﹂泦涓害涓篎alse銆 + >>> # + >>> # 浼樺寲鍣ㄧ殑鏈缁堝弬鏁伴『搴忛噰鐢'order_params'鐨勫笺 + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.TrainOneStepCell.txt b/docs/api/api_python/nn/mindspore.nn.TrainOneStepCell.txt new file mode 100644 index 00000000000..de9ec56668f --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.TrainOneStepCell.txt @@ -0,0 +1,50 @@ +Class mindspore.nn.TrainOneStepCell(network, optimizer, sens=1.0) + + 璁粌缃戠粶灏佽绫汇 + + 灏佽`network`鍜宍optimizer`锛屾瀯寤轰竴涓緭鍏'\*inputs'鐨勭敤浜庤缁冪殑Cell銆 + 鎵ц鍑芥暟`construct`涓細鏋勫缓鍙嶅悜鍥句互鏇存柊缃戠粶鍙傛暟銆傛敮鎸佷笉鍚岀殑骞惰璁粌妯″紡銆 + + 鍙傛暟锛 + network (Cell)锛氳缁冪綉缁溿傚彧鏀寔鍗曡緭鍑虹綉缁溿 + optimizer (Union[Cell])锛氱敤浜庢洿鏂扮綉缁滃弬鏁扮殑浼樺寲鍣ㄣ + sens (numbers.Number)锛氬弽鍚戜紶鎾殑杈撳叆锛岀缉鏀剧郴鏁般傞粯璁ゅ间负1.0銆 + + 杈撳叆锛 + - **(\*inputs)** (Tuple(Tensor)) - shape涓:math:`(N, \ldots)`鐨凾ensor缁勬垚鐨勫厓缁勩 + + 杈撳嚭锛 + Tensor锛屾崯澶卞嚱鏁板硷紝鍏秙hape閫氬父涓:math:`()`銆 + + 寮傚父锛 + TypeError锛歚sens`涓嶆槸numbers.Number銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits() + >>> optim = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> # 1锛変娇鐢∕indSpore鎻愪緵鐨刉ithLossCell + >>> loss_net = nn.WithLossCell(net, loss_fn) + >>> train_net = nn.TrainOneStepCell(loss_net, optim) + >>> + >>> # 2锛夌敤鎴疯嚜瀹氫箟鐨刉ithLossCell + >>> class MyWithLossCell(Cell): + ... def __init__(self, backbone, loss_fn): + ... super(MyWithLossCell, self).__init__(auto_prefix=False) + ... self._backbone = backbone + ... self._loss_fn = loss_fn + ... + ... def construct(self, x, y, label): + ... out = self._backbone(x, y) + ... return self._loss_fn(out, label) + ... + ... @property + ... def backbone_network(self): + ... return self._backbone + ... 
+ >>> loss_net = MyWithLossCell(net, loss_fn) + >>> train_net = nn.TrainOneStepCell(loss_net, optim) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.TrainOneStepWithLossScaleCell.txt b/docs/api/api_python/nn/mindspore.nn.TrainOneStepWithLossScaleCell.txt new file mode 100644 index 00000000000..73c7145f632 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.TrainOneStepWithLossScaleCell.txt @@ -0,0 +1,116 @@ +Class mindspore.nn.TrainOneStepWithLossScaleCell(network, optimizer, scale_sense) + + 浣跨敤姊害鏀惧ぇ鍔熻兘锛坙oss scale锛夌殑璁粌缃戠粶銆 + + 瀹炵幇浜嗗寘鍚搴︽斁澶у姛鑳界殑鍗曟璁粌銆傚畠浣跨敤缃戠粶銆佷紭鍖栧櫒鍜岀敤浜庢洿鏂版搴︽斁澶х郴鏁扮殑Cell(鎴栦竴涓猅ensor)浣滀负鍙傛暟銆傚彲鍦╤ost渚ф垨device渚ф洿鏂版搴︽斁澶х郴鏁般 + 濡傛灉闇瑕佸湪host渚ф洿鏂帮紝浣跨敤Tensor浣滀负`scale_sense`锛屽惁鍒欙紝浣跨敤鍙洿鏂版搴︽斁澶х郴鏁扮殑Cell瀹炰緥浣滀负`scale_sense`銆 + + 鍙傛暟锛 + network (Cell)锛氳缁冪綉缁溿備粎鏀寔鍗曡緭鍑虹綉缁溿 + optimizer (Cell)锛氱敤浜庢洿鏂扮綉缁滃弬鏁扮殑浼樺寲鍣ㄣ + scale_sense (Union[Tensor, Cell])锛氬鏋滄鍊间负Cell绫诲瀷锛宍TrainOneStepWithLossScaleCell`浼氳皟鐢ㄥ畠鏉ユ洿鏂版搴︽斁澶х郴鏁般傚鏋滄鍊间负Tensor绫诲瀷锛屽彲璋冪敤`set_sense_scale`鏉ユ洿鏂版搴︽斁澶х郴鏁帮紝shape涓:math:`()`鎴:math:`(1,)`銆 + + 杈撳叆锛 + - **(*inputs)** (Tuple(Tensor))- shape涓:math:`(N, \ldots)`鐨凾ensor缁勬垚鐨勫厓缁勩 + + 杈撳嚭锛 + Tuple锛屽寘鍚笁涓猅ensor锛屽垎鍒负鎹熷け鍑芥暟鍊笺佹孩鍑虹姸鎬佸拰褰撳墠姊害鏀惧ぇ绯绘暟銆 + + - **loss** 锛圱ensor锛 - shape涓:math:`()`鐨凾ensor銆 + - **overflow** 锛圱ensor锛- shape涓:math:`()`鐨凾ensor锛岀被鍨嬩负bool銆 + - **loss scale** 锛圱ensor锛- shape涓:math:`()`鐨凾ensor銆 + + 寮傚父锛 + TypeError锛歚scale_sense`鏃笉鏄疌ell锛屼篃涓嶆槸Tensor銆 + ValueError锛歚scale_sense`鐨剆hape鏃笉鏄(1,)涔熶笉鏄()銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` + + 绀轰緥锛 + >>> import numpy as np + >>> from mindspore import Tensor, Parameter, nn, ops + >>> from mindspore import dtype as mstype + >>> + >>> class Net(nn.Cell): + ... def __init__(self, in_features, out_features): + ... super(Net, self).__init__() + ... self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)), + ... name='weight') + ... self.matmul = ops.MatMul() + ... + ... def construct(self, x): + ... output = self.matmul(x, self.weight) + ... return output + ... 
+ >>> size, in_features, out_features = 16, 16, 10 + >>> #1锛塻cale_sense绫诲瀷涓篊ell鏃讹細 + >>> net = Net(in_features, out_features) + >>> loss = nn.MSELoss() + >>> optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> net_with_loss = nn.WithLossCell(net, loss) + >>> manager = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**12, scale_factor=2, scale_window=1000) + >>> train_network = nn.TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_sense=manager) + >>> input = Tensor(np.ones([out_features, in_features]), mindspore.float32) + >>> labels = Tensor(np.ones([out_features,]), mindspore.float32) + >>> output = train_network(input, labels) + >>> + >>>> #2锛夊綋scale_sense绫诲瀷涓篢ensor鏃讹細 + >>> net = Net(in_features, out_features) + >>> loss = nn.MSELoss() + >>> optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> net_with_loss = nn.WithLossCell(net, loss) + >>> inputs = Tensor(np.ones([size, in_features]).astype(np.float32)) + >>> label = Tensor(np.zeros([size, out_features]).astype(np.float32)) + >>> scaling_sens = Tensor(np.full((1), np.finfo(np.float32).max), dtype=mstype.float32) + >>> train_network = nn.TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_sense=scaling_sens) + >>> output = train_network(inputs, label) + + +get_overflow_status(status, compute_output) + + 鑾峰彇娴偣婧㈠嚭鐘舵併 + + 婧㈠嚭妫娴嬬殑鐩爣杩囩▼鎵ц瀹屾垚鍚庯紝鑾峰彇婧㈠嚭缁撴灉銆傜户鎵胯绫昏嚜瀹氫箟璁粌缃戠粶鏃讹紝鍙鐢ㄨ鎺ュ彛銆 + + 杈撳叆锛 + - **status** (object) - 鐢ㄤ簬妫娴嬫孩鍑虹殑鐘舵佸疄渚嬨 + - **compute_output** - 瀵圭壒瀹氳绠楄繃绋嬭繘琛屾孩鍑烘娴嬫椂锛屽皢`compute_output`璁剧疆涓鸿璁$畻杩囩▼鐨勮緭鍑猴紝浠ョ‘淇濆湪鎵ц璁$畻涔嬪墠鑾峰彇浜哷status`銆 + + 杈撳嚭锛 + bool锛屾槸鍚﹀彂鐢熸孩鍑恒 + + +process_loss_scale(overflow) + + 鏍规嵁婧㈠嚭鐘舵佽绠楁搴︽斁澶х郴鏁般傜户鎵胯绫昏嚜瀹氫箟璁粌缃戠粶鏃讹紝鍙鐢ㄨ鎺ュ彛銆 + + 杈撳叆锛 + - **overflow** (bool) - 鏄惁鍙戠敓婧㈠嚭銆 + + 杈撳嚭锛 + bool锛屾孩鍑虹姸鎬侊紝鍗宠緭鍏ャ + + +set_sense_scale(sens) + + 濡傛灉浣跨敤浜員ensor绫诲瀷鐨刞scale_sense`锛屽彲璋冪敤姝ゅ嚱鏁颁慨鏀瑰畠鐨勫笺 + + 杈撳叆锛 + - **sens** 锛圱ensor锛- 鏂扮殑姊害鏀惧ぇ绯绘暟锛屽叾shape鍜岀被鍨嬮渶瑕佷笌鍘熷`scale_sense`鐩稿悓銆 + + +start_overflow_check(pre_cond, compute_input) + + 鍚姩娴偣婧㈠嚭妫娴嬨傚垱寤哄苟娓呴櫎婧㈠嚭妫娴嬬姸鎬併 + + 鎸囧畾鍙傛暟'pre_cond'鍜'compute_input'锛屼互纭繚鍦ㄦ纭殑鏃堕棿娓呴櫎婧㈠嚭鐘舵併 + 浠ュ綋鍓嶆帴鍙d负渚嬶紝鎴戜滑闇瑕佸湪鎹熷け鍑芥暟璁$畻鍚庤繘琛屾竻闄ょ姸鎬侊紝鍦ㄦ搴﹁绠楄繃绋嬩腑妫娴嬫孩鍑恒傚湪杩欑鎯呭喌涓嬶紝pre_cond搴斾负鎹熷け鍑芥暟鐨勮緭鍑猴紝鑰宑ompute_input搴斾负姊害璁$畻鍑芥暟鐨勮緭鍏ャ傜户鎵胯绫昏嚜瀹氫箟璁粌缃戠粶鏃讹紝鍙鐢ㄨ鎺ュ彛銆 + 杈撳叆锛 + - **pre_cond** (Tensor) -鍚姩婧㈠嚭妫娴嬬殑鍏堝喅鏉′欢銆傚畠鍐冲畾婧㈠嚭鐘舵佹竻闄ゅ拰鍏堝墠澶勭悊鐨勬墽琛岄『搴忋傚畠纭繚鍑芥暟'start_overflow'鍦ㄦ墽琛屽畬鍏堝喅鏉′欢鍚庢竻闄ょ姸鎬併 + - **compute_input** (object) - 鍚庣画杩愮畻鐨勮緭鍏ャ傞渶瑕佸鐗瑰畾鐨勮绠楄繃绋嬭繘琛屾孩鍑烘娴嬨傚皢`compute_input`璁剧疆杩欎竴璁$畻杩囩▼鐨勮緭鍏ワ紝浠ョ‘淇濆湪鎵ц璇ヨ绠椾箣鍓嶆竻闄や簡婧㈠嚭鐘舵併 + + 杈撳嚭锛 + Tuple[object, object]锛孏PU鍚庣鐨勭涓涓间负False锛岃屽叾浠栧悗绔殑绗竴涓兼槸NPUAllocFloatStatus鐨勫疄渚嬨傝鍊肩敤浜庡湪`get_overflow_status`鏈熼棿妫娴嬫孩鍑恒 + 绗簩涓间笌`compute_input`鐨勮緭鍏ョ浉鍚岋紝鐢ㄤ簬鎺у埗鎵ц搴忋 + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.WithEvalCell.txt b/docs/api/api_python/nn/mindspore.nn.WithEvalCell.txt new file mode 100644 index 00000000000..1bbf08d6c4a --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.WithEvalCell.txt @@ -0,0 +1,29 @@ +Class mindspore.nn.WithEvalCell(network, loss_fn, add_cast_fp32=False) + + 灏佽鍓嶅悜缃戠粶鍜屾崯澶卞嚱鏁帮紝杩斿洖鐢ㄤ簬璁$畻璇勪及鎸囨爣鐨勬崯澶卞嚱鏁板笺佸墠鍚戣緭鍑哄拰鏍囩銆 + + + 鍙傛暟锛 + network (Cell)锛氬墠鍚戠綉缁溿 + loss_fn (Cell)锛氭崯澶卞嚱鏁般 + add_cast_fp32 (bool)锛氭槸鍚﹀皢鏁版嵁绫诲瀷璋冩暣涓篺loat32銆傞粯璁ゅ硷細False銆 + + 杈撳叆锛 + - **data** 锛圱ensor锛 - shape涓:math:`(N, \ldots)`鐨凾ensor銆 + - **label** 锛圱ensor锛 - shape涓:math:`(N, \ldots)`鐨凾ensor銆 + + 杈撳嚭锛 + Tuple(Tensor)锛屽寘鎷爣閲忔崯澶卞嚱鏁般乻hape涓:math:`(N, \ldots)`鐨勭綉缁滆緭鍑哄拰shape涓:math:`(N, \ldots)`鐨勬爣绛俱 + + 寮傚父锛 + TypeError锛歚add_cast_fp32`涓嶆槸bool銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` 
``CPU`` + + 绀轰緥锛 + >>> # 鏈寘鍚崯澶卞嚱鏁扮殑鍓嶅悜缃戠粶 + >>> net = Net() + >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits() + >>> eval_net = nn.WithEvalCell(net, loss_fn) + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.WithLossCell.txt b/docs/api/api_python/nn/mindspore.nn.WithLossCell.txt new file mode 100644 index 00000000000..0f2da5b0b0c --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.WithLossCell.txt @@ -0,0 +1,42 @@ +Class mindspore.nn.WithLossCell(backbone, loss_fn) + + 鍖呭惈鎹熷け鍑芥暟鐨凜ell銆 + + 灏佽`backbone`鍜宍loss_fn`銆傛Cell鎺ュ彈鏁版嵁鍜屾爣绛句綔涓鸿緭鍏ワ紝骞跺皢杩斿洖鎹熷け鍑芥暟浣滀负璁$畻缁撴灉銆 + + 鍙傛暟锛 + backbone (Cell)锛氳灏佽鐨勭洰鏍囩綉缁溿 + loss_fn (Cell)锛氱敤浜庤绠楁崯澶卞嚱鏁般 + + 杈撳叆锛 + - **data** 锛圱ensor锛 - shape涓:math:`(N, \ldots)`鐨凾ensor銆 + - **label** 锛圱ensor锛 - shape涓:math:`(N, \ldots)`鐨凾ensor銆 + + 杈撳嚭锛 + Tensor锛宭oss鍊硷紝鍏秙hape閫氬父涓:math:`()`銆 + + 寮傚父锛 + TypeError锛歚data`鎴朻label`鐨勬暟鎹被鍨嬫棦涓嶆槸float16涔熶笉鏄痜loat32銆 + + 鏀寔骞冲彴锛 + ``Ascend`` ``GPU`` ``CPU`` + + 绀轰緥锛 + >>> net = Net() + >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=False) + >>> net_with_criterion = nn.WithLossCell(net, loss_fn) + >>> + >>> batch_size = 2 + >>> data = Tensor(np.ones([batch_size, 1, 32, 32]).astype(np.float32) * 0.01) + >>> label = Tensor(np.ones([batch_size, 10]).astype(np.float32)) + >>> + >>> output_data = net_with_criterion(data, label) + + +backbone_network + + 鑾峰彇楠ㄥ共缃戠粶銆 + + 杩斿洖锛 + Cell锛岄骞茬綉缁溿 + \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.optim_arg_dynamic_lr.rst b/docs/api/api_python/nn/mindspore.nn.optim_arg_dynamic_lr.rst new file mode 100644 index 00000000000..39b4ed0cdde --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_arg_dynamic_lr.rst @@ -0,0 +1,5 @@ +- **float** - 固定的学习率。必须大于等于零。 +- **int** - 固定的学习率。必须大于等于零。整数类型会被转换为浮点数。 +- **Tensor** - 可以是标量或一维向量。标量是固定的学习率。一维向量是动态的学习率,第i步将取向量中第i个值作为学习率。 +- **Iterable** - 动态的学习率。第i步将取迭代器第i个值作为学习率。 +- **LearningRateSchedule** - 动态的学习率。在训练过程中,优化器将使用步数(step)作为输入,调用 `LearningRateSchedule` 实例来计算当前学习率。 \ No newline at end of file diff --git a/docs/api/api_python/nn/mindspore.nn.optim_arg_loss_scale.rst b/docs/api/api_python/nn/mindspore.nn.optim_arg_loss_scale.rst new file mode 100644 index 00000000000..a0d3c0f7935 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_arg_loss_scale.rst @@ -0,0 +1 @@ +- **loss_scale** (float) - 姊害缂╂斁绯绘暟锛屽繀椤诲ぇ浜0銆傚鏋 `loss_scale` 鏄暣鏁帮紝瀹冨皢琚浆鎹负娴偣鏁般傞氬父浣跨敤榛樿鍊硷紝浠呭綋璁粌鏃朵娇鐢ㄤ簡 `FixedLossScaleManager`锛屼笖 `FixedLossScaleManager `鐨 `drop_overflow_update` 灞炴ч厤缃负False鏃讹紝姝ゅ奸渶瑕佷笌 `FixedLossScaleManager` 涓殑 `loss_scale` 鐩稿悓銆傛湁鍏虫洿澶氳缁嗕俊鎭紝璇峰弬闃卌lass锛歚mindspore.FixedLossScaleManager`銆傞粯璁ゅ硷細1.0銆 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_group_gc.rst b/docs/api/api_python/nn/mindspore.nn.optim_group_gc.rst new file mode 100644 index 00000000000..0fd1c721468 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_group_gc.rst @@ -0,0 +1 @@ +- **grad_centralization** - 鍙夈傚鏋滈敭涓瓨鍦"grad_centralization"锛屽垯浣跨敤瀵瑰簲鐨勫硷紝璇ュ煎繀椤讳负甯冨皵绫诲瀷銆傚鏋滄病鏈夛紝鍒欒涓 `grad_centralization` 涓篎alse銆傝鍙傛暟浠呴傜敤浜庡嵎绉眰銆 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_group_lr.rst b/docs/api/api_python/nn/mindspore.nn.optim_group_lr.rst new file mode 100644 index 00000000000..e62a33446df --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_group_lr.rst @@ -0,0 +1 @@ +- **lr** - 鍙夈傚鏋滈敭涓瓨鍦"lr"锛屽垯浣跨敤瀵瑰簲鐨勫间綔涓哄涔犵巼銆傚鏋滄病鏈夛紝鍒欎娇鐢ㄤ紭鍖栧櫒涓厤缃殑 `learning_rate` 浣滀负瀛︿範鐜囥 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_group_order.rst b/docs/api/api_python/nn/mindspore.nn.optim_group_order.rst new file mode 
100644 index 00000000000..c6744f4ced9 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_group_order.rst @@ -0,0 +1 @@ +- **order_params** - 鍙夈傚搴斿兼槸棰勬湡鐨勫弬鏁版洿鏂伴『搴忋傚綋浣跨敤鍙傛暟鍒嗙粍鍔熻兘鏃讹紝閫氬父浣跨敤璇ラ厤缃」淇濇寔 `parameters` 鐨勯『搴忎互鎻愬崌鎬ц兘銆傚鏋滈敭涓瓨鍦"order_params"锛屽垯浼氬拷鐣ヨ缁勯厤缃腑鐨勫叾浠栭敭銆"order_params"涓殑鍙傛暟蹇呴』鍦ㄦ煇涓缁 `params` 鍙傛暟涓 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_group_param.rst b/docs/api/api_python/nn/mindspore.nn.optim_group_param.rst new file mode 100644 index 00000000000..a11c9759f85 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_group_param.rst @@ -0,0 +1 @@ +- **params** - 蹇呭~銆傚綋鍓嶇粍鍒殑鏉冮噸锛岃鍊煎繀椤绘槸 `Parameter` 鍒楄〃銆 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_group_weight_decay.rst b/docs/api/api_python/nn/mindspore.nn.optim_group_weight_decay.rst new file mode 100644 index 00000000000..fe14cb3504f --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_group_weight_decay.rst @@ -0,0 +1 @@ +- **weight_decay** - 鍙夈傚鏋滈敭涓瓨鍦"weight_decay鈥濓紝鍒欎娇鐢ㄥ搴旂殑鍊间綔涓烘潈閲嶈“鍑忓笺傚鏋滄病鏈夛紝鍒欎娇鐢ㄤ紭鍖栧櫒涓厤缃殑 `weight_decay` 浣滀负鏉冮噸琛板噺鍊笺 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_note_loss_scale.rst b/docs/api/api_python/nn/mindspore.nn.optim_note_loss_scale.rst new file mode 100644 index 00000000000..cc5bc478eff --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_note_loss_scale.rst @@ -0,0 +1 @@ +浼樺寲鍣ㄥ拰娣峰悎绮惧害涔嬮棿閫氬父娌℃湁鑱旂郴銆備絾鏄紝褰撲娇鐢╜FixedLossScaleManager`涓擿FixedLossScaleManager`涓殑`drop_overflow_update`璁剧疆涓篎alse鏃讹紝浼樺寲鍣ㄩ渶瑕佽缃'loss_scale'銆傜敱浜庢浼樺寲鍣ㄦ病鏈塦loss_scale`鐨勫弬鏁帮紝鍥犳闇瑕侀氳繃鍏朵粬鏂瑰紡澶勭悊`loss_scale`锛屽浣曟纭鐞哷loss_scale`璇﹁`LossScale `銆 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_note_sparse.rst b/docs/api/api_python/nn/mindspore.nn.optim_note_sparse.rst new file mode 100644 index 00000000000..b4246f1091d --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_note_sparse.rst @@ -0,0 +1,2 @@ +濡傛灉鍓嶅悜缃戠粶浣跨敤浜哠parseGatherV2绛夌畻瀛愶紝浼樺寲鍣ㄤ細鎵ц绋鐤忚繍绠楋紝閫氳繃璁剧疆 `target` 涓篊PU锛屽彲鍦ㄤ富鏈猴紙host锛変笂杩涜绋鐤忚繍绠椼 +绋鐤忕壒鎬у湪鎸佺画寮鍙戜腑銆 diff --git a/docs/api/api_python/nn/mindspore.nn.optim_note_weight_decay.rst b/docs/api/api_python/nn/mindspore.nn.optim_note_weight_decay.rst new file mode 100644 index 00000000000..cd835e6e8d1 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_note_weight_decay.rst @@ -0,0 +1,2 @@ +鍦ㄥ弬鏁版湭鍒嗙粍鏃讹紝浼樺寲鍣ㄩ厤缃殑 `weight_decay` 搴旂敤浜庡悕绉板惈鏈"beta"鎴"gamma"鐨勭綉缁滃弬鏁帮紝閫氳繃缃戠粶鍙傛暟鍒嗙粍鍙皟鏁存潈閲嶈“鍑忕瓥鐣ャ傚垎缁勬椂锛屾瘡缁勭綉缁滃弬鏁板潎鍙厤缃 `weight_decay` 锛岃嫢鏈厤缃紝鍒欒缁勭綉缁滃弬鏁颁娇鐢ㄤ紭鍖栧櫒涓厤缃殑 `weight_decay` 銆 + diff --git a/docs/api/api_python/nn/mindspore.nn.optim_target_unique_for_sparse.rst b/docs/api/api_python/nn/mindspore.nn.optim_target_unique_for_sparse.rst new file mode 100644 index 00000000000..cf75e1637a7 --- /dev/null +++ b/docs/api/api_python/nn/mindspore.nn.optim_target_unique_for_sparse.rst @@ -0,0 +1,9 @@ + .. py:method:: target + :property: + + 璇ュ睘鎬х敤浜庢寚瀹氬湪涓绘満锛坔ost锛変笂杩樻槸璁惧锛坉evice锛変笂鏇存柊鍙傛暟銆傝緭鍏ョ被鍨嬩负str锛屽彧鑳芥槸'CPU'锛'Ascend'鎴'GPU'銆 + + .. 
py:method:: unique + :property: + + 璇ュ睘鎬ц〃绀烘槸鍚﹀湪浼樺寲鍣ㄤ腑杩涜姊害鍘婚噸锛岄氬父鐢ㄤ簬绋鐤忕綉缁溿傚鏋滄搴︽槸绋鐤忕殑鍒欒缃负True銆傚鏋滃墠鍚戠█鐤忕綉缁滃凡瀵规潈閲嶅幓閲嶏紝鍗虫搴︽槸绋犲瘑鐨勶紝鍒欒缃负False銆傛湭璁剧疆鏃堕粯璁ゅ间负True銆 diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index 6df63f9f7c9..a7d76e8c68e 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -206,7 +206,7 @@ class Adam(Optimizer): :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`, :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent - `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent + `beta1` and `beta2`, :math:`t` represents the current step while :math:`beta_1^t` and :math:`beta_2^t` represent `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, :math:`\epsilon` represents `eps`. @@ -263,7 +263,7 @@ class Adam(Optimizer): Default: 0.999. eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-8. - use_locking (bool): Whether to enable a lock to protect variable tensors from being updated. + use_locking (bool): Whether to enable a lock to protect the updating process of variable tensors. If true, updates of the `w`, `m`, and `v` tensors will be protected by a lock. If false, the result is unpredictable. Default: False. use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. @@ -380,7 +380,7 @@ class Adam(Optimizer): class AdamWeightDecay(Optimizer): r""" - Implements the Adam algorithm to fix the weight decay. + Implements the Adam algorithm with weight decay. .. math:: \begin{array}{ll} \\ @@ -399,7 +399,7 @@ class AdamWeightDecay(Optimizer): :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`, :math:`g` represents `gradients`, :math:`lr` represents `learning_rate`, - :math:`\beta_1, \beta_2` represent `beta1` and `beta2`, :math:`t` represents updating step while + :math:`\beta_1, \beta_2` represent `beta1` and `beta2`, :math:`t` represents the current step, :math:`w` represents `params`. Note: @@ -542,7 +542,7 @@ class AdamOffload(Optimizer): :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`, :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent - `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent + `beta1` and `beta2`, :math:`t` represents the current step while :math:`beta_1^t` and :math:`beta_2^t` represent `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, :math:`\epsilon` represents `eps`. @@ -593,7 +593,7 @@ class AdamOffload(Optimizer): Default: 0.999. eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-8. - use_locking (bool): Whether to enable a lock to protect variable tensors from being updated. + use_locking (bool): Whether to enable a lock to protect the updating process of variable tensors. If true, updates of the `w`, `m`, and `v` tensors will be protected by a lock. If false, the result is unpredictable. Default: False. use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. 
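The Adam-family docstrings revised above all rely on the same construction pattern (parameter groups, the weight-decay note, handing the optimizer to `Model`). As an illustration only, here is a minimal usage sketch that is not part of the patch; it assumes a MindSpore 1.x environment, and the `nn.Dense(16, 10)` network and the hyper-parameter values are placeholders chosen for the example.

.. code-block:: python

    # Illustrative sketch (not from the patch): AdamWeightDecay with parameter groups,
    # excluding bias parameters from weight decay, then handing the optimizer to Model.
    import mindspore.nn as nn
    from mindspore import Model

    net = nn.Dense(16, 10)  # placeholder network; any Cell with trainable parameters works
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # Apply weight decay to the weight matrix only, not to the bias.
    decay_params = [p for p in net.trainable_params() if 'bias' not in p.name]
    no_decay_params = [p for p in net.trainable_params() if 'bias' in p.name]
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': no_decay_params, 'weight_decay': 0.0},
                    {'order_params': net.trainable_params()}]

    optim = nn.AdamWeightDecay(group_params, learning_rate=1e-3, eps=1e-8)
    model = Model(net, loss_fn=loss, optimizer=optim)

A similar grouping pattern applies to the other optimizers documented in this patch; the exact keys each optimizer accepts are listed in its own parameter table.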
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index 91f83654c8b..ed576f1c33b 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -98,9 +98,9 @@ class FTRL(Optimizer): \end{cases}\\ \end{array} - :math:`m` represents `accum`, :math:`g` represents `grads`, :math:`t` represents updating step, - :math:`u` represents `linear`, :math:`p` represents `lr_power`, :math:`\alpha` represents `learning_rate`, - :math:`\omega` represents `params`. + :math:`m` represents accumulators, :math:`g` represents `grads`, :math:`t` represents the current step, + :math:`u` represents the linear coefficient to be updated, :math:`p` represents `lr_power`, :math:`\alpha` + represents `learning_rate`, :math:`\omega` represents `params`. Note: The sparse strategy is applied while the SparseGatherV2 operator is used for forward network. If the sparse @@ -134,7 +134,7 @@ class FTRL(Optimizer): If `order_params` in the keys, other keys will be ignored and the element of 'order_params' must be in one group of `params`. - initial_accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1. + initial_accum (float): The starting value for accumulators `m`, must be zero or positive values. Default: 0.1. learning_rate (float): The learning rate value, must be zero or positive, dynamic learning rate is currently not supported. Default: 0.001. lr_power (float): Learning rate power controls how the learning rate decreases during training, must be less @@ -183,7 +183,8 @@ class FTRL(Optimizer): >>> optim = nn.FTRL(group_params, learning_rate=0.1, weight_decay=0.0) >>> # The conv_params's parameters will use default learning rate of 0.1 and weight decay of 0.01 and grad >>> # centralization of True. - >>> # The no_conv_params's parameters will use default weight decay of 0.0 and grad centralization of False. + >>> # The no_conv_params's parameters will use default learning rate of 0.1, default weight decay + >>> # of 0.0 and grad centralization of False. >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. >>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index fb124395615..1295a3e324c 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -172,7 +172,7 @@ def _check_param_value(beta1, beta2, eps, prim_name): class Lamb(Optimizer): r""" - Lamb(Layer-wise Adaptive Moments optimizer for Batching training) Dynamic Learning Rate. + An optimizer that implements the Lamb (Layer-wise Adaptive Moments optimizer for Batching training) algorithm. LAMB is an optimization algorithm employing a layerwise adaptive large batch optimization technique. Refer to the paper `LARGE BATCH OPTIMIZATION FOR DEEP LEARNING: TRAINING BERT IN 76 diff --git a/mindspore/nn/optim/lars.py b/mindspore/nn/optim/lars.py index f8336daf185..af84e3e7351 100755 --- a/mindspore/nn/optim/lars.py +++ b/mindspore/nn/optim/lars.py @@ -71,16 +71,16 @@ class LARS(Optimizer): g_{t+1} = \lambda * (g_{t} + \delta * \omega) \end{array} - :math:`\theta` represents `coefficient`, :math:`\omega` represents `parameters`, :math:`g` represents `gradients`, - :math:`t` represents updating step, :math:`\delta` represents `weight_decay`, - :math:`\alpha` represents `learning_rate`, :math:`clip` represents `use_clip`. 
+ :math:`\theta` represents `coefficient`, :math:`\omega` represents the network parameters, :math:`g` represents + `gradients`, :math:`t` represents the current step, :math:`\delta` represents `weight_decay` in `optimizer`, + :math:`\alpha` represents `learning_rate` in `optimizer`, :math:`clip` represents `use_clip`. Args: optimizer (Optimizer): MindSpore optimizer for which to wrap and modify gradients. epsilon (float): Term added to the denominator to improve numerical stability. Default: 1e-05. coefficient (float): Trust coefficient for calculating the local learning rate. Default: 0.001. use_clip (bool): Whether to use clip operation for calculating the local learning rate. Default: False. - lars_filter (Function): A function to determine whether apply the LARS algorithm. Default: + lars_filter (Function): A function to determine which of the network parameters to use LARS algorithm. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. Inputs: diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py index 7101a479836..853efc4910c 100644 --- a/mindspore/nn/optim/lazyadam.py +++ b/mindspore/nn/optim/lazyadam.py @@ -106,10 +106,10 @@ def _check_param_value(beta1, beta2, eps, weight_decay, prim_name): class LazyAdam(Optimizer): r""" - This optimizer will apply a lazy adam algorithm when gradient is sparse. + Updates gradients by the Adaptive Moment Estimation (Adam) algorithm. The Adam algorithm is proposed + in `Adam: A Method for Stochastic Optimization `_. - The original adam algorithm is proposed in - `Adam: A Method for Stochastic Optimization `_. + This optimizer will apply a lazy adam algorithm when gradient is sparse. The updating formulas are as follows, @@ -123,7 +123,7 @@ class LazyAdam(Optimizer): :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`, :math:`g` represents `gradients`, :math:`l` represents scaling factor, :math:`\beta_1, \beta_2` represent - `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent + `beta1` and `beta2`, :math:`t` represents the current step while :math:`beta_1^t` and :math:`beta_2^t` represent `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, :math:`\epsilon` represents `eps`. @@ -182,7 +182,7 @@ class LazyAdam(Optimizer): Default: 0.999. eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-8. - use_locking (bool): Whether to enable a lock to protect variable tensors from being updated. + use_locking (bool): Whether to enable a lock to protect the updating process of variable tensors. If true, updates of the `w`, `m`, and `v` tensors will be protected by a lock. If false, the result is unpredictable. Default: False. use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. diff --git a/mindspore/nn/optim/proximal_ada_grad.py b/mindspore/nn/optim/proximal_ada_grad.py index 1a477fdf8eb..c1e3a571f42 100644 --- a/mindspore/nn/optim/proximal_ada_grad.py +++ b/mindspore/nn/optim/proximal_ada_grad.py @@ -69,7 +69,7 @@ class ProximalAdagrad(Optimizer): .. 
math:: var_{t+1} = \frac{sign(\text{prox_v})}{1 + lr * l2} * \max(\left| \text{prox_v} \right| - lr * l1, 0) - Here : where grad, lr, var, accum and t denote the gradients, learning_rate, params and accumulation and current + Here : where grad, lr, var, accum and t denote the `grads`, `learning_rate`, `params`, accumulation and current step respectively. Note: @@ -105,7 +105,7 @@ class ProximalAdagrad(Optimizer): If `order_params` in the keys, other keys will be ignored and the element of 'order_params' must be in one group of `params`. - accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1. + accum (float): The starting value for accumulators `accum`, must be zero or positive values. Default: 0.1. learning_rate (Union[float, int, Tensor, Iterable, LearningRateSchedule]): Default: 0.001. - float: The fixed learning rate value. Must be equal to or greater than 0. diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index 463827e03fb..51e57f6f418 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -75,13 +75,14 @@ class RMSProp(Optimizer): w = w - m_{t+1} where :math:`w` represents `params`, which will be updated. - :math:`g_{t+1}` is mean gradients, :math:`g_{t}` is the last moment of :math:`g_{t+1}`. - :math:`s_{t+1}` is the mean square gradients, :math:`s_{t}` is the last moment of :math:`s_{t+1}`, - :math:`m_{t+1}` is moment, the delta of `w`, :math:`m_{t}` is the last moment of :math:`m_{t+1}`. + :math:`g_{t+1}` is mean gradients. + :math:`s_{t+1}` is the mean square gradients. + :math:`m_{t+1}` is moment, the delta of `w`. :math:`\\rho` represents `decay`. :math:`\\beta` is the momentum term, represents `momentum`. :math:`\\epsilon` is a smoothing term to avoid division by zero, represents `epsilon`. :math:`\\eta` is learning rate, represents `learning_rate`. :math:`\\nabla Q_{i}(w)` is gradients, represents `gradients`. + :math:`t` represents the current step. Note: If parameters are not grouped, the `weight_decay` in optimizer will be applied on the network parameters without @@ -131,9 +132,9 @@ class RMSProp(Optimizer): greater than 0. Default: 0.0. epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-10. - use_locking (bool): Whether to enable a lock to protect the variable and accumulation tensors from being - updated. Default: False. - centered (bool): If true, gradients are normalized by the estimated variance of the gradient. Default: False. + use_locking (bool): Whether to enable a lock to protect the updating process of variable tensors. + Default: False. + centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False. loss_scale (float): A floating point value for the loss scale. Should be greater than 0. In general, use the default value. Only when `FixedLossScaleManager` is used for training and the `drop_overflow_update` in `FixedLossScaleManager` is set to False, then this value needs to be the same as the `loss_scale` in diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index 6f3f1bba2d7..9dbb690d580 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -128,8 +128,8 @@ class SGD(Optimizer): ... {'params': no_conv_params, 'lr': 0.01}, ... 
{'order_params': net.trainable_params()}] >>> optim = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # The conv_params's parameters will use default learning rate of 0.1 default weight decay of 0.0 and grad - >>> # centralization of True. + >>> # The conv_params's parameters will use default learning rate of 0.1 and default weight decay of 0.0 + >>> # and grad centralization of True. >>> # The no_conv_params's parameters will use learning rate of 0.01 and default weight decay of 0.0 and grad >>> # centralization of False. >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py index 22178566d56..9bacd44544f 100644 --- a/mindspore/nn/wrap/cell_wrapper.py +++ b/mindspore/nn/wrap/cell_wrapper.py @@ -287,13 +287,13 @@ class TrainOneStepCell(Cell): r""" Network training package class. - Wraps the network with an optimizer. The resulting Cell is trained with input '\*inputs'. + Wraps the `network` with the `optimizer`. The resulting Cell is trained with input '\*inputs'. The backward graph will be created in the construct function to update the parameter. Different parallel modes are available for training. Args: network (Cell): The training network. The network only supports single output. - optimizer (Union[Cell]): Optimizer for updating the weights. + optimizer (Union[Cell]): Optimizer for updating the network parameters. sens (numbers.Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0. Inputs: @@ -303,7 +303,7 @@ class TrainOneStepCell(Cell): Tensor, a tensor means the loss value, the shape of which is usually :math:`()`. Raises: - TypeError: If `sens` is not a number. + TypeError: If `sens` is not a numbers.Number. Supported Platforms: ``Ascend`` ``GPU`` ``CPU`` @@ -312,7 +312,7 @@ class TrainOneStepCell(Cell): >>> net = Net() >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits() >>> optim = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) - >>> #1) Using the WithLossCell existing provide + >>> #1) Using the WithLossCell provided by MindSpore >>> loss_net = nn.WithLossCell(net, loss_fn) >>> train_net = nn.TrainOneStepCell(loss_net, optim) >>> @@ -596,22 +596,21 @@ class VirtualDatasetCellTriple(Cell): class WithEvalCell(Cell): r""" - Cell that returns loss, output and label for evaluation. + Wraps the forward network with the loss function. - This Cell accepts a network and loss function as arguments and computes loss for model. - It returns loss, output and label to calculate the metrics. + It returns loss, forward output and label to calculate the metrics. Args: - network (Cell): The network Cell. - loss_fn (Cell): The loss Cell. - add_cast_fp32 (bool): Adjust the data type to float32. Default: False. + network (Cell): The forward network. + loss_fn (Cell): The loss function. + add_cast_fp32 (bool): Whether to adjust the data type to float32. Default: False. Inputs: - **data** (Tensor) - Tensor of shape :math:`(N, \ldots)`. - **label** (Tensor) - Tensor of shape :math:`(N, \ldots)`. Outputs: - Tuple, containing a scalar loss Tensor, a network output Tensor of shape :math:`(N, \ldots)` + Tuple(Tensor), containing a scalar loss Tensor, a network output Tensor of shape :math:`(N, \ldots)` and a label Tensor of shape :math:`(N, \ldots)`. 
Raises: @@ -621,7 +620,7 @@ class WithEvalCell(Cell): ``Ascend`` ``GPU`` ``CPU`` Examples: - >>> # For a defined network Net without loss function + >>> # Forward network without loss function >>> net = Net() >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits() >>> eval_net = nn.WithEvalCell(net, loss_fn) diff --git a/mindspore/nn/wrap/loss_scale.py b/mindspore/nn/wrap/loss_scale.py index 61210976bcf..a2034b737a2 100644 --- a/mindspore/nn/wrap/loss_scale.py +++ b/mindspore/nn/wrap/loss_scale.py @@ -59,16 +59,17 @@ class DynamicLossScaleUpdateCell(Cell): Dynamic Loss scale update cell. For loss scaling training, the initial loss scaling value will be set to be `loss_scale_value`. - In each training step, the loss scaling value will be updated by loss scaling value/`scale_factor` - when there is an overflow. And it will be increased by loss scaling value * `scale_factor` if there is no - overflow for a continuous `scale_window` steps. This cell is used for Graph mode training in which all - logic will be executed on device side(Another training mode is normal(non-sink) mode in which some logic will be - executed on host). + In each training step, the loss scaling value will be decreased by `loss_scale`/`scale_factor` + when there is an overflow. And it will be increased by `loss_scale` * `scale_factor` if there is no + overflow for a continuous `scale_window` steps. + + `get_update_cell` method of :class:`mindspore.nn.DynamicLossScaleManager` will return this class, it will be called + by :class:`mindspore.TrainOneStepWithLossScaleCell` during training to update loss scale. Args: loss_scale_value (float): Initializes loss scale. scale_factor (int): Coefficient of increase and decrease. - scale_window (int): Maximum continuous training steps that do not have overflow. + scale_window (int): Maximum continuous training steps that do not have overflow to increase the loss scale. Inputs: - **loss_scale** (Tensor) - The loss scale value during training with shape :math:`()`. @@ -77,9 +78,6 @@ class DynamicLossScaleUpdateCell(Cell): Outputs: bool, the input `overflow`. - Raises: - TypeError: If dtype of `inputs` or `label` is neither float16 nor float32. - Supported Platforms: ``Ascend`` ``GPU`` @@ -162,15 +160,17 @@ class DynamicLossScaleUpdateCell(Cell): class FixedLossScaleUpdateCell(Cell): """ - Static scale update cell, the loss scaling value will not be updated. + Update cell with fixed loss scaling value. - For usage, refer to `DynamicLossScaleUpdateCell`. + `get_update_cell` method of :class:`mindspore.nn.FixedLossScaleManager` will return this class, it will be called + by :class:`mindspore.TrainOneStepWithLossScaleCell` during trainning. Args: loss_scale_value (float): Initializes loss scale. Inputs: - - **loss_scale** (Tensor) - The loss scale value during training with shape :math:`()`, that will be ignored. + - **loss_scale** (Tensor) - The loss scale value during training with shape :math:`()`, it is ignored in this + class. - **overflow** (bool) - Whether the overflow occurs or not. Outputs: @@ -227,28 +227,27 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): r""" Network training with loss scaling. - This is a training step with loss scaling. It takes a network, an optimizer and possibly a scale update - Cell as args. The loss scale value can be updated in both host side or device side. The - TrainOneStepWithLossScaleCell will be compiled to be graph which takes `*inputs` as input data. - The Tensor type of `scale_sense` is acting as loss scaling value. 
If you want to update it on host side, - the value must be provided. If the Tensor type of `scale_sense` is not given, the loss scale update logic - must be provide by Cell type of `scale_sense`. + This is a training step with loss scaling. It takes a network, an optimizer and a scale update Cell(or a Tensor) as + args. The loss scale value can be updated in both host side or device side. If you want to update it on + host side, using a value of Tensor type as `scale_sense`, otherwise, using a Cell instance for updating loss + scale as `scale_sense`. Args: network (Cell): The training network. The network only supports single output. - optimizer (Cell): Optimizer for updating the weights. - scale_sense (Union[Tensor, Cell]): If this value is Cell type, the loss scaling update logic cell.If this value - is Tensor type, Tensor with shape :math:`()` or :math:`(1,)`. + optimizer (Cell): Optimizer for updating the network parameters. + scale_sense (Union[Tensor, Cell]): If this value is a Cell, it will be called by `TrainOneStepWithLossScaleCell` + to update loss scale. If this value is a Tensor, the loss scale can be modified by `set_sense_scale`, + the shape should be :math:`()` or :math:`(1,)`. Inputs: - **(*inputs)** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`. Outputs: - Tuple of 3 Tensor, the loss, overflow flag and current loss scaling value. + Tuple of 3 Tensor, the loss, overflow flag and current loss scale value. - **loss** (Tensor) - Tensor with shape :math:`()`. - **overflow** (Tensor) - Tensor with shape :math:`()`, type is bool. - - **loss scaling value** (Tensor) - Tensor with shape :math:`()` + - **loss scale** (Tensor) - Tensor with shape :math:`()` Raises: TypeError: If `scale_sense` is neither Cell nor Tensor. @@ -350,8 +349,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): def set_sense_scale(self, sens): """ - If the user has set the sens in the training process and wants to reassign the value, he can call - this function again to make modification, and sens needs to be of type Tensor. + If the user has set the `scale_sense` of Tensor type, he can call this function to reassign the value. Args: sens(Tensor): The new sense whose shape and type are the same with original `scale_sense`. @@ -382,7 +380,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): Returns: Tuple[object, object], the first value is False for GPU backend, while it is an instance of - NPUAllocFloatStatus for other backend. The status is used to detect overflow during overflow detection. + NPUAllocFloatStatus for other backend. The status is used to detect overflow during `get_overflow_status`. The second value is the same as the input of `compute_input`, but contains some information about the execution order. """ @@ -406,7 +404,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): Args: status (object): A status instance used to detect the overflow. compute_output: Overflow detection should be performed on a certain computation. Set `compute_output` - as the output of the computation, to ensure overflow status is acquired before executing the + as the output of the computation, to ensure overflow `status` is acquired before executing the computation. Returns: @@ -442,7 +440,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): overflow(bool): Whether the overflow occurs or not. Returns: - bool, overflow value. + bool, the input overflow value. 
""" if self.loss_scaling_manager is not None: return self.loss_scaling_manager(self.scale_sense, overflow) diff --git a/mindspore/train/loss_scale_manager.py b/mindspore/train/loss_scale_manager.py index d0b3fa6ea5c..c3d528fe8bd 100644 --- a/mindspore/train/loss_scale_manager.py +++ b/mindspore/train/loss_scale_manager.py @@ -25,8 +25,8 @@ class LossScaleManager: Derived class needs to implement all of its methods. `get_loss_scale` is used to get current loss scale value. `update_loss_scale` is used to update loss scale value, `update_loss_scale` will be called during the training. `get_update_cell` is used to get the instance of :class:`mindspore.nn.Cell` that is used to update the loss scale, - the instance will be called during the training. When using sink mode, only the `get_update_cell` works, otherwise - both `update_loss_scale` and `get_update_cell` works. + the instance will be called during the training. Currently, the `get_update_cell` is mostly used. + For example, :class:`mindspore.FixedLossScaleManager` and :class:`mindspore.DynamicLossScaleManager`. """ def get_loss_scale(self): @@ -105,7 +105,8 @@ class FixedLossScaleManager(LossScaleManager): def get_update_cell(self): """ Returns the instance of :class:`mindspore.nn.Cell` that used to update the loss scale which will be called at - :class:`mindspore.nn.TrainOneStepWithLossScaleCell`. + :class:`mindspore.nn.TrainOneStepWithLossScaleCell`. As the loss scale is fixed in this class, the instance + will do nothing. Returns: None or :class:`mindspore.FixedLossScaleUpdateCell`. Instance of