From 0af87eb9cc062f5fc763310af7c76eb806061a2f Mon Sep 17 00:00:00 2001
From: yao_yf
Date: Wed, 23 Mar 2022 20:30:59 +0800
Subject: [PATCH] bugs fix

---
 ...ndspore.nn.AdaSumByDeltaWeightWrapCell.rst |  2 +-
 .../nn/mindspore.nn.AdaSumByGradWrapCell.rst  |  2 +-
 .../frontend/parallel/ops_info/matmul_info.cc |  8 ++++++
 .../parallel/ops_info/operator_info.cc        |  7 ++++++
 mindspore/python/mindspore/nn/optim/adasum.py | 25 +++++++++++++++++++
 5 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst b/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
index d15c5f0b234..3a706f3c13f 100644
--- a/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByDeltaWeightWrapCell
 
 .. py:class:: mindspore.nn.AdaSumByDeltaWeightWrapCell(optimizer)
 
-    Adaptive Summation (AdaSum)算法的实现,根据更新前后的参数差计算。
+    Adaptive Summation (AdaSum)算法的实现,根据更新前后的参数差计算。应用于semi_auto_parallel/auto_parallel模式。
 
     请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
 
diff --git a/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst b/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
index cd900d84fcf..be16f4959eb 100644
--- a/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByGradWrapCell
 
 .. py:class:: mindspore.nn.AdaSumByGradWrapCell(optimizer)
 
-    Adaptive Summation (AdaSum)算法的实现,根据梯度计算。
+    Adaptive Summation (AdaSum)算法的实现,根据梯度计算。应用于semi_auto_parallel/auto_parallel模式。
 
     请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
 
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
index 9077664c8de..0bd5f503d44 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
@@ -484,6 +484,14 @@ Status MatMulBase::GenerateStrategiesNotPower2(int64_t stage_id, size_t dev_num_
     }
   }
   strategy_cost_.clear();
+  // add the repeated strategy
+  auto repeated_stra_arrays{inputs_shape_};
+  for (auto &stra_array : repeated_stra_arrays) {
+    std::fill(stra_array.begin(), stra_array.end(), 1);
+  }
+  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
+  sp_vector.push_back(repeated_stra);
+
   for (auto &sp : sp_vector) {
     if (SetCostUnderStrategy(sp) == FAILED) {
       MS_LOG(WARNING) << name_ << " : Calculating cost for strategy failed.";
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
index 0141c9c1f09..f67ad9d3844 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
@@ -1470,6 +1470,13 @@ Status GenerateStrategiesForIndependentInputs(int64_t stage_id, const Shapes &in
       }
     }
   }
+  // add the repeated strategy
+  auto repeated_stra_arrays{splittable_inputs};
+  for (auto &stra_array : repeated_stra_arrays) {
+    std::fill(stra_array.begin(), stra_array.end(), 1);
+  }
+  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
+  sp_vector->push_back(repeated_stra);
   return SUCCESS;
 }
 
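Note on the two C++ hunks above: each appends one extra candidate strategy in which every dimension of every input is split by 1, i.e. the operator is fully replicated ("repeated") across devices, so the strategy search always considers this fallback layout. A rough Python sketch of that construction follows (illustration only, not MindSpore API; the helper name is made up):

    def repeated_strategy(inputs_shape):
        """Build the all-ones (fully replicated) strategy: split no dimension of any input."""
        return tuple(tuple(1 for _ in shape) for shape in inputs_shape)

    # Example: a MatMul with inputs of shapes (128, 64) and (64, 32)
    print(repeated_strategy([(128, 64), (64, 32)]))  # ((1, 1), (1, 1))
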
diff --git a/mindspore/python/mindspore/nn/optim/adasum.py b/mindspore/python/mindspore/nn/optim/adasum.py
index 82d703b971f..15965f9e284 100644
--- a/mindspore/python/mindspore/nn/optim/adasum.py
+++ b/mindspore/python/mindspore/nn/optim/adasum.py
@@ -380,6 +380,18 @@ def _parallel_check():
 class AdaSumByGradWrapCell(Cell):
     r"""
     Enable the adasum in "auto_parallel/semi_auto_parallel" mode.
+    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated by gradients.
+    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
+          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
+        \end{array}
+
+    In this implementation, :math:`g` represents the gradient of the weights,
+    and the subscripts represent different devices in the data-parallel dimension.
 
     Note:
         When using AdaSum, the number of traning cards needs to be a power of 2 and at least 16 cards are required.
@@ -433,6 +445,19 @@
 class AdaSumByDeltaWeightWrapCell(Cell):
     r"""
     Enable the adasum in "auto_parallel/semi_auto_parallel" mode.
+    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated based on the difference of weights
+    before and after the updating of optimizer.
+    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
+          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
+        \end{array}
+
+    In this implementation, :math:`g` represents the weight difference before and after the updating of optimizer,
+    and the subscripts represent different devices in the data parallel dimension.
 
     Note:
         When using AdaSum, the number of traning cards needs to be a power of 2 and at least 16 cards are required.
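For reference, the two-operand AdaSum combination that the new docstrings describe can be sketched in a few lines of NumPy (illustration only; the actual cells operate on sliced parameter tensors across devices, and adasum_pair is a made-up name):

    import numpy as np

    def adasum_pair(g1, g2):
        """Two-operand AdaSum: down-weight each term by its projection onto the other."""
        dot = np.dot(g1, g2)  # g2^T . g1 == g1^T . g2 for vectors
        return (1 - dot / (2 * np.dot(g1, g1))) * g1 + (1 - dot / (2 * np.dot(g2, g2))) * g2

    g = np.array([1.0, 2.0, 3.0])
    print(np.allclose(adasum_pair(g, g), g))      # identical inputs -> reduces to averaging
    h = np.array([-2.0, 1.0, 0.0])                # orthogonal to g
    print(np.allclose(adasum_pair(g, h), g + h))  # orthogonal inputs -> reduces to summation

The two limiting cases in the comments (averaging for parallel inputs, summation for orthogonal ones) are the behaviour the formula in the docstrings encodes.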