diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
index 906e3baccda..31f76a95a4a 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
@@ -37,13 +37,18 @@ namespace mindspore {
 namespace opt {
 namespace {
-std::set<size_t> GetUniqReduceAxes(const AnfNodePtr &node) {
+std::set<size_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = false) {
   if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) {
     MS_LOG(EXCEPTION) << "Only process for reduce sum!";
   }

   auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);
-  auto src_shape_vec = GetShape(input);
+  ShapeVector src_shape_vec;
+  if (is_ascend) {
+    src_shape_vec = GetDeviceShape(input);
+  } else {
+    src_shape_vec = GetShape(input);
+  }
   auto axis_vec = GetReduceAxis(node);
   if (axis_vec.empty()) {
     for (size_t i = 0; i < src_shape_vec.size(); ++i) {
@@ -140,7 +145,8 @@ bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) {
   //    which mean it should be in output list.
   // 2. The reduce axis and reduce number should meet condition:
   //    (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y.
-  //    (Ascend) all-reduce or non-reduce axes with dimension 1
+  //    (Ascend) The first valid axis of the input data is the reduce axis or the non-reduce axis
+  //    cannot make full use of multi-core.
   // 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation).

   // Rule 1.
@@ -180,11 +186,46 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) {
 }

 bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) {
-  auto dst_shape_vec = AnfAlgo::GetOutputDeviceShape(node, 0);
+  auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);

-  // all reduce
-  // non-reduce axes with dimension 1
-  return std::all_of(dst_shape_vec.cbegin(), dst_shape_vec.cend(), [](const size_t &dim) { return dim == 1; });
+  // Atomic addition is enabled only when the data type is fp32
+  auto type = AnfAlgo::GetOutputDeviceDataType(input, 0);
+  if (type != kNumberTypeFloat32) {
+    return false;
+  }
+
+  // If the first valid axis of the input data is the reduce axis, enable atomic addition
+  auto src_shape_vec = GetDeviceShape(input);
+  std::set<size_t> reduce_axis_set = GetUniqReduceAxes(node, true);
+  auto start_with_reduce = false;
+  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
+    auto dim = src_shape_vec[i];
+    if (dim != 1) {
+      if (reduce_axis_set.count(i)) {
+        start_with_reduce = true;
+      }
+      break;
+    }
+  }
+  if (start_with_reduce) {
+    return true;
+  }
+
+  // If the non-reduce axis cannot make full use of multi-core, enable atomic addition
+  auto processor_core_num = 32;
+  auto start_non_reduce_dim = 1;
+  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
+    auto dim = src_shape_vec[i];
+    if (reduce_axis_set.count(i)) {
+      break;
+    }
+    start_non_reduce_dim = start_non_reduce_dim * dim;
+  }
+  if (start_non_reduce_dim < processor_core_num) {
+    return true;
+  }
+
+  return false;
 }

 void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) {