forked from mindspore-Ecosystem/mindspore

commit 3c61a7c778

!14993 [GraphKernel][Ascend] Increase the rules for enabling atomic addition

From: @hanhuifeng2020
Reviewed-by: @gaoxiong1, @dylangeng
Signed-off-by: @dylangeng
@@ -37,13 +37,18 @@
 namespace mindspore {
 namespace opt {
 namespace {
-std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node) {
+std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = false) {
   if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) {
     MS_LOG(EXCEPTION) << "Only process for reduce sum!";
   }
 
   auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);
-  auto src_shape_vec = GetShape(input);
+  ShapeVector src_shape_vec;
+  if (is_ascend) {
+    src_shape_vec = GetDeviceShape(input);
+  } else {
+    src_shape_vec = GetShape(input);
+  }
   auto axis_vec = GetReduceAxis(node);
   if (axis_vec.empty()) {
     for (size_t i = 0; i < src_shape_vec.size(); ++i) {
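Note on the hunk above: with is_ascend set, the reduction's source shape is taken from the device format via GetDeviceShape rather than the host shape from GetShape, presumably because on Ascend the physical device layout can differ from the logical shape, and it is the physical axes that matter for atomic addition. The visible tail of the function also shows that an empty axis attribute means all axes are reduced. Below is a minimal standalone sketch of that axis-set contract; the name UniqReduceAxesSketch is hypothetical, and the normalization of negative axes is an assumption not shown in the hunk:

#include <cstdint>
#include <set>
#include <vector>

// Hypothetical standalone mirror of GetUniqReduceAxes' contract:
// an empty axis list means "reduce every dimension" (all-reduce);
// negative axes are assumed to be normalized by the rank.
std::set<int64_t> UniqReduceAxesSketch(const std::vector<int64_t> &shape, const std::vector<int64_t> &axes) {
  std::set<int64_t> result;
  const int64_t rank = static_cast<int64_t>(shape.size());
  if (axes.empty()) {
    for (int64_t i = 0; i < rank; ++i) {
      result.insert(i);  // empty attribute: every axis is a reduce axis
    }
    return result;
  }
  for (auto axis : axes) {
    result.insert(axis < 0 ? axis + rank : axis);  // e.g. -1 means the last axis
  }
  return result;
}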
@@ -140,7 +145,8 @@ bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) {
   // which mean it should be in output list.
   // 2. The reduce axis and reduce number should meet condition:
   //    (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y.
-  //    (Ascend) all-reduce or non-reduce axes with dimension 1
+  //    (Ascend) The first valid axis of the input data is the reduce axis, or the non-reduce axis
+  //    cannot make full use of multi-core.
   // 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation).
 
   // Rule 1.
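Rule 2 is the part this commit changes for Ascend. As an illustrative example (numbers chosen here, not taken from the commit): a ReduceSum whose device-shaped input is [1, 1024, 32] with reduce axis 1 starts, after skipping the leading size-1 axis, with a reduce axis, so atomic addition is enabled; an input [2, 1024] reduced on axis 1 leaves only 2 parallel rows, too few to occupy the 32 cores assumed by the new checker, so atomic addition is enabled there as well. A runnable sketch of this decision follows the last hunk below.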
@@ -180,11 +186,46 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) {
 }
 
 bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) {
-  auto dst_shape_vec = AnfAlgo::GetOutputDeviceShape(node, 0);
+  auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);
 
-  // all reduce
-  // non-reduce axes with dimension 1
-  return std::all_of(dst_shape_vec.cbegin(), dst_shape_vec.cend(), [](const size_t &dim) { return dim == 1; });
+  // Atomic addition is enabled only when the data type is fp32
+  auto type = AnfAlgo::GetOutputDeviceDataType(input, 0);
+  if (type != kNumberTypeFloat32) {
+    return false;
+  }
+
+  // If the first valid axis of the input data is the reduce axis, enable atomic addition
+  auto src_shape_vec = GetDeviceShape(input);
+  std::set<int64_t> reduce_axis_set = GetUniqReduceAxes(node, true);
+  auto start_with_reduce = false;
+  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
+    auto dim = src_shape_vec[i];
+    if (dim != 1) {
+      if (reduce_axis_set.count(i)) {
+        start_with_reduce = true;
+      }
+      break;
+    }
+  }
+  if (start_with_reduce) {
+    return true;
+  }
+
+  // If the non-reduce axis cannot make full use of multi-core, enable atomic addition
+  auto processor_core_num = 32;
+  auto start_non_reduce_dim = 1;
+  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
+    auto dim = src_shape_vec[i];
+    if (reduce_axis_set.count(i)) {
+      break;
+    }
+    start_non_reduce_dim = start_non_reduce_dim * dim;
+  }
+  if (start_non_reduce_dim < processor_core_num) {
+    return true;
+  }
+
+  return false;
+}
 
 void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) {
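The new Ascend policy above is easy to restate on plain STL types. Below is a self-contained, runnable sketch of the same three-step decision: fp32 only, then "the first valid axis is a reduce axis", then "the leading non-reduce dimensions are too small to fill the cores". The name SuitableForAtomicAddSketch is hypothetical, a bool parameter stands in for the kNumberTypeFloat32 check, and plain vectors replace AnfNodePtr/ShapeVector:

#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

// Hypothetical standalone mirror of AtomicAddCheckerAscend::SuitableForAtomicAdd.
// is_fp32 stands in for the output-device-data-type check; shape is the device shape.
bool SuitableForAtomicAddSketch(const std::vector<int64_t> &shape, const std::set<int64_t> &reduce_axes,
                                bool is_fp32) {
  if (!is_fp32) {
    return false;  // atomic addition is only enabled for fp32
  }
  // Rule A: the first axis whose dimension is not 1 is a reduce axis.
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] != 1) {
      if (reduce_axes.count(static_cast<int64_t>(i)) > 0) {
        return true;
      }
      break;
    }
  }
  // Rule B: the product of the dimensions before the first reduce axis
  // cannot occupy all cores (core count taken from the commit's constant).
  const int64_t processor_core_num = 32;
  int64_t parallel_dim = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (reduce_axes.count(static_cast<int64_t>(i)) > 0) {
      break;
    }
    parallel_dim *= shape[i];
  }
  return parallel_dim < processor_core_num;
}

int main() {
  // [1, 1024, 32] reduced on axis 1: first valid axis is the reduce axis -> 1 (true).
  printf("%d\n", SuitableForAtomicAddSketch({1, 1024, 32}, {1}, true));
  // [2, 1024] reduced on axis 1: only 2 parallel rows for 32 cores -> 1 (true).
  printf("%d\n", SuitableForAtomicAddSketch({2, 1024}, {1}, true));
  // [64, 1024] reduced on axis 1: 64 parallel rows fill the cores -> 0 (false).
  printf("%d\n", SuitableForAtomicAddSketch({64, 1024}, {1}, true));
  return 0;
}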