From 0af87eb9cc062f5fc763310af7c76eb806061a2f Mon Sep 17 00:00:00 2001
From: yao_yf
Date: Wed, 23 Mar 2022 20:30:59 +0800
Subject: [PATCH] bugs fix

---
 ...ndspore.nn.AdaSumByDeltaWeightWrapCell.rst |  2 +-
 .../nn/mindspore.nn.AdaSumByGradWrapCell.rst  |  2 +-
 .../frontend/parallel/ops_info/matmul_info.cc |  8 ++++++
 .../parallel/ops_info/operator_info.cc        |  7 ++++++
 mindspore/python/mindspore/nn/optim/adasum.py | 25 +++++++++++++++++++
 5 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst b/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
index d15c5f0b234..3a706f3c13f 100644
--- a/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumByDeltaWeightWrapCell.rst
@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByDeltaWeightWrapCell
 
 .. py:class:: mindspore.nn.AdaSumByDeltaWeightWrapCell(optimizer)
 
-    Adaptive Summation (AdaSum)算法的实现,根据更新前后的参数差计算。
+    Adaptive Summation (AdaSum)算法的实现,根据更新前后的参数差计算。应用于semi_auto_parallel/auto_parallel模式。
 
     请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
 
diff --git a/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst b/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
index cd900d84fcf..be16f4959eb 100644
--- a/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.AdaSumByGradWrapCell.rst
@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByGradWrapCell
 
 .. py:class:: mindspore.nn.AdaSumByGradWrapCell(optimizer)
 
-    Adaptive Summation (AdaSum)算法的实现,根据梯度计算。
+    Adaptive Summation (AdaSum)算法的实现,根据梯度计算。应用于semi_auto_parallel/auto_parallel模式。
 
     请参阅论文 `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_。
 
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
index 9077664c8de..0bd5f503d44 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc
@@ -484,6 +484,14 @@ Status MatMulBase::GenerateStrategiesNotPower2(int64_t stage_id, size_t dev_num_
     }
   }
   strategy_cost_.clear();
+  // add the repeated strategy
+  auto repeated_stra_arrays{inputs_shape_};
+  for (auto &stra_array : repeated_stra_arrays) {
+    std::fill(stra_array.begin(), stra_array.end(), 1);
+  }
+  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
+  sp_vector.push_back(repeated_stra);
+
   for (auto &sp : sp_vector) {
     if (SetCostUnderStrategy(sp) == FAILED) {
       MS_LOG(WARNING) << name_ << " : Calculating cost for strategy failed.";
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
index 0141c9c1f09..f67ad9d3844 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/operator_info.cc
@@ -1470,6 +1470,13 @@ Status GenerateStrategiesForIndependentInputs(int64_t stage_id, const Shapes &in
       }
     }
   }
+  // add the repeated strategy
+  auto repeated_stra_arrays{splittable_inputs};
+  for (auto &stra_array : repeated_stra_arrays) {
+    std::fill(stra_array.begin(), stra_array.end(), 1);
+  }
+  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
+  sp_vector->push_back(repeated_stra);
   return SUCCESS;
 }
 
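Note on the two C++ hunks above: each appends one extra candidate strategy in which every dimension of every input is split by 1, i.e. the operator is fully replicated ("repeated") across devices, so the strategy search always considers this fallback layout. A rough Python sketch of that construction follows (illustration only, not MindSpore API; the helper name is made up):

    def repeated_strategy(inputs_shape):
        """Build the all-ones (fully replicated) strategy: split no dimension of any input."""
        return tuple(tuple(1 for _ in shape) for shape in inputs_shape)

    # Example: a MatMul with inputs of shapes (128, 64) and (64, 32)
    print(repeated_strategy([(128, 64), (64, 32)]))  # ((1, 1), (1, 1))
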
diff --git a/mindspore/python/mindspore/nn/optim/adasum.py b/mindspore/python/mindspore/nn/optim/adasum.py
index 82d703b971f..15965f9e284 100644
--- a/mindspore/python/mindspore/nn/optim/adasum.py
+++ b/mindspore/python/mindspore/nn/optim/adasum.py
@@ -380,6 +380,18 @@ def _parallel_check():
 class AdaSumByGradWrapCell(Cell):
     r"""
     Enable the adasum in "auto_parallel/semi_auto_parallel" mode.
+    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated by gradients.
+    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
+          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
+        \end{array}
+
+    In this implementation, :math:`g` represents the gradient of the weights,
+    and the subscripts represent different devices in the data-parallel dimension.
 
     Note:
         When using AdaSum, the number of traning cards needs to be a power of 2 and at least 16 cards are required.
@@ -433,6 +445,19 @@
 class AdaSumByDeltaWeightWrapCell(Cell):
     r"""
     Enable the adasum in "auto_parallel/semi_auto_parallel" mode.
+    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated based on the difference of weights
+    before and after the updating of optimizer.
+    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.
+
+    .. math::
+        \begin{array}{ll}
+          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
+          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
+          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
+        \end{array}
+
+    In this implementation, :math:`g` represents the weight difference before and after the updating of optimizer,
+    and the subscripts represent different devices in the data parallel dimension.
 
     Note:
         When using AdaSum, the number of traning cards needs to be a power of 2 and at least 16 cards are required.
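For reference, the two-operand AdaSum combination that the new docstrings describe can be sketched in a few lines of NumPy (illustration only; the actual cells operate on sliced parameter tensors across devices, and adasum_pair is a made-up name):

    import numpy as np

    def adasum_pair(g1, g2):
        """Two-operand AdaSum: down-weight each term by its projection onto the other."""
        dot = np.dot(g1, g2)  # g2^T . g1 == g1^T . g2 for vectors
        return (1 - dot / (2 * np.dot(g1, g1))) * g1 + (1 - dot / (2 * np.dot(g2, g2))) * g2

    g = np.array([1.0, 2.0, 3.0])
    print(np.allclose(adasum_pair(g, g), g))      # identical inputs -> reduces to averaging
    h = np.array([-2.0, 1.0, 0.0])                # orthogonal to g
    print(np.allclose(adasum_pair(g, h), g + h))  # orthogonal inputs -> reduces to summation

The two limiting cases in the comments (averaging for parallel inputs, summation for orthogonal ones) are the behaviour the formula in the docstrings encodes.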