!30579 add auto parallel adasum docs

Merge pull request !30579 from yao_yf/adasum_docs
2022-02-28 09:41:12 +00:00 · 2022-02-28 09:41:12 +00:00 · cf5248276e
parent f9ba4caba5 87c8e74fbb
commit cf5248276e
5 changed files with 101 additions and 8 deletions
--- a/docs/api/api_python/mindspore.nn.rst
+++ b/docs/api/api_python/mindspore.nn.rst
@ -231,6 +231,8 @@ MindSpore中 `mindspore.nn` 接口与上一版本相比，新增、删除和支
    mindspore.nn.Adam
    mindspore.nn.AdamOffload
    mindspore.nn.AdamWeightDecay
+    mindspore.nn.AdaSumByDeltaWeightWrapCell
+    mindspore.nn.AdaSumByGradWrapCell
    mindspore.nn.ASGD
    mindspore.nn.FTRL
    mindspore.nn.Lamb
@ -244,6 +246,7 @@ MindSpore中 `mindspore.nn` 接口与上一版本相比，新增、删除和支
    mindspore.nn.SGD
    mindspore.nn.thor

+
 Wrapper
 ---------

--- a/docs/api/api_python/nn/mindspore.nn.AdaSumbyDeltaWeightWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumbyDeltaWeightWrapCell.rst
@ -0,0 +1,37 @@
+mindspore.nn.AdaSumByDeltaWeightWrapCell
+========================================
+
+.. py:class:: mindspore.nn.AdaSumByDeltaWeightWrapCell(optimizer)
+
+    Adaptive Summation (AdaSum)算法的实现，根据更新前后的参数差计算。
+
+    请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
+
+    公式如下：
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2})  \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +  (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2]  \\
+        \end{array}
+
+    在本实现中， :math:`g` 代表优化器更新前后的权重的变化量，下标代表数据并行维度下不同的设备。
+
+    .. note::
+            本接口推荐应用于半自动并行或者全自动并行模式。针对数据并行模式，推荐使用mindspore.boost功能以使用AdaSum。
+            使用本接口时，训练的卡的数量必须是2的幂，并且至少需要16张卡。目前，使用本接口时不支持优化器并行和流水线并行。
+
+    **参数：**
+
+    - **optimizer** (nn.optimizer) - 用于更新权重的优化器，其construct函数必须只有一个输入。
+
+    **输入：**
+
+    - **gradients** (tuple[Tensor]) - `params` 的梯度，形状（shape）与 `params` 相同，与所传优化器的输入一致。
+
+    **异常：**
+
+    - **RuntimeError** - `parallel_mode` 使用了 `stand_alone` 模式，AdaSum仅支持在分布式场景下使用。
+    - **RuntimeError** - 同时使用了优化器并行， 暂时不支持在优化器并行场景下使用AdaSum。
+    - **RuntimeError** - 同时使用了流水线并行， 暂时不支持在流水线并行场景下使用AdaSum。
+    - **RuntimeError** - `device_num` 不是2的幂，或者小于16。
--- a/docs/api/api_python/nn/mindspore.nn.AdaSumbyGradWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumbyGradWrapCell.rst
@ -0,0 +1,37 @@
+mindspore.nn.AdaSumByGradWrapCell
+=================================
+
+.. py:class:: mindspore.nn.AdaSumByGradWrapCell(optimizer)
+
+    Adaptive Summation (AdaSum)算法的实现，根据梯度计算。
+
+    请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
+
+    公式如下：
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2})  \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +  (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2]  \\
+        \end{array}
+
+    在本实现中， :math:`g` 代表权重的梯度，下标代表数据并行维度下不同的设备。
+
+    .. note::
+            本接口推荐应用于半自动并行或者全自动并行模式。针对数据并行模式，推荐使用mindspore.boost功能以使用AdaSum。
+            使用本接口时，训练的卡的数量必须是2的幂，并且至少需要16张卡。目前，使用本接口时不支持优化器并行和流水线并行。
+
+    **参数：**
+
+    - **optimizer** (nn.optimizer) - 用于更新权重的优化器，其construct函数必须只有一个输入。
+
+    **输入：**
+
+    - **gradients** (tuple[Tensor]) - `params` 的梯度，形状（shape）与 `params` 相同，与所传优化器的输入一致。
+
+    **异常：**
+
+    - **RuntimeError** - `parallel_mode` 使用了 `stand_alone` 模式，AdaSum仅支持在分布式场景下使用。
+    - **RuntimeError** - 同时使用了优化器并行， 暂时不支持在优化器并行场景下使用AdaSum。
+    - **RuntimeError** - 同时使用了流水线并行， 暂时不支持在流水线并行场景下使用AdaSum。
+    - **RuntimeError** - `device_num` 不是2的幂，或者小于16。
--- a/mindspore/python/mindspore/communication/management.py
+++ b/mindspore/python/mindspore/communication/management.py
@ -224,9 +224,10 @@ def get_local_rank(group=GlobalComm.WORLD_COMM_GROUP):
        ValueError: If backend is invalid.
        RuntimeError: If HCCL is not available or MindSpore is GPU version.
    Examples:
-        >>> from mindspore.context import set_context
+        >>> from mindspore.context import set_context, set_auto_parallel_context
        >>> from mindspore.communication.management import init, get_rank, get_local_rank
-        >>> set_context(device_target="Ascend", device_num=16) # 2 server, each server with 8 NPU.
+        >>> set_context(device_target="Ascend")
+        >>> set_auto_parallel_context(device_num=16) # 2 servers, each server with 8 NPUs.
        >>> init()
        >>> world_rank = get_rank() # rank_id is 9.
        >>> local_rank = get_local_rank()
@ -260,9 +261,10 @@ def get_group_size(group=GlobalComm.WORLD_COMM_GROUP):
        RuntimeError: If HCCL/NCCL is not available.

    Examples:
-        >>> from mindspore.context import set_context
+        >>> from mindspore.context import set_context, set_auto_parallel_context
        >>> from mindspore.communication.management import init, get_group_size
-        >>> set_context(device_target="Ascend", device_num=8)
+        >>> set_context(device_target="Ascend")
+        >>> set_auto_parallel_context(device_num=8)
        >>> init()
        >>> group_size = get_group_size()
        >>> print("group_size is: ", group_size)
@ -295,9 +297,10 @@ def get_local_rank_size(group=GlobalComm.WORLD_COMM_GROUP):
        ValueError: If backend is invalid.
        RuntimeError: If HCCL is not available or MindSpore is GPU version.
    Examples:
-        >>> from mindspore.context import set_context
+        >>> from mindspore.context import set_context, set_auto_parallel_context
        >>> from mindspore.communication.management import init, get_local_rank_size
-        >>> set_context(device_target="Ascend", device_num=16) # 2 server, each server with 8 NPU.
+        >>> set_context(device_target="Ascend")
+        >>> set_auto_parallel_context(device_num=16) # 2 servers, each server with 8 NPUs.
        >>> init()
        >>> local_rank_size = get_local_rank_size()
        >>> print("local_rank_size is: ", local_rank_size)
--- a/mindspore/python/mindspore/nn/optim/adasum.py
+++ b/mindspore/python/mindspore/nn/optim/adasum.py
@ -373,13 +373,20 @@ def _parallel_check():
        raise RuntimeError("Currently, the optimizer shard is not supported with applying adasum.")
    if context.get_auto_parallel_context("pipeline_stages") > 1:
        raise RuntimeError("Currently, the pipeline parallel is not supported with applying adasum.")
-    if _get_stage_device_num() < 16:
-        raise RuntimeError("The device_num should be at least 16 when applying adasum.")
+    stage_device_num = _get_stage_device_num()
+    if stage_device_num < 16 or (stage_device_num & (stage_device_num - 1) != 0):
+        raise RuntimeError("The device_num should be at least 16 and should be the power of 2 when applying adasum.")

 class AdaSumByGradWrapCell(Cell):
    r"""
    Enable the adasum in "auto_parallel/semi_auto_parallel" mode.

+    Note:
+        When using AdaSum, the number of training cards needs to be a power of 2 and at least 16 cards are required.
+        Currently, optimizer sharding and pipeline parallelism are not supported when using AdaSum.
+        It is recommended to use AdaSumByGradWrapCell in semi auto parallel/auto parallel mode. In data parallel
+        mode, we recommend using mindspore.boost to apply AdaSum.
+
    Args:
        optimizer (Union[Cell]): Optimizer for updating the weights. The construct function of the optimizer
            requires only one input.
@ -419,6 +426,12 @@ class AdaSumByDeltaWeightWrapCell(Cell):
    r"""
    Enable the adasum in "auto_parallel/semi_auto_parallel" mode.

+    Note:
+        When using AdaSum, the number of training cards needs to be a power of 2 and at least 16 cards are required.
+        Currently, optimizer sharding and pipeline parallelism are not supported when using AdaSum.
+        It is recommended to use AdaSumByDeltaWeightWrapCell in semi auto parallel/auto parallel mode.
+        In data parallel mode, we recommend using mindspore.boost to apply AdaSum.
+
    Args:
        optimizer (Union[Cell]): Optimizer for updating the weights. The construct function of the optimizer
            requires only one input.