diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
index fcf67cea3af..84b6e5d1a3d 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
@@ -20,16 +20,35 @@
 #include
 #include
 #include
+#include <unordered_set>
 #include "mindspore/core/ops/index_add.h"
 #include "plugin/device/cpu/hal/device/cpu_device_address.h"
-#include "include/common/thread_pool.h"
 
 namespace mindspore {
 namespace kernel {
 namespace {
 constexpr size_t kIndexAddInputsNum = 3;
 constexpr size_t kIndexAddOutputsNum = 1;
+
+bool HasDuplicateIndex(const int32_t *indices, size_t len) {
+  MS_EXCEPTION_IF_NULL(indices);
+  std::unordered_set<int32_t> unique_idx;
+  for (size_t i = 0; i < len; ++i) {
+    if (unique_idx.find(indices[i]) != unique_idx.end()) {
+      return true;
+    }
+    unique_idx.insert(indices[i]);
+  }
+  return false;
+}
+
+size_t CalcSizePerThread(size_t total_block) {
+  size_t pool_thread_num = GetActorMgrInnerThreadPool()->GetKernelThreadNum();
+  pool_thread_num = pool_thread_num == 0 ? 1 : pool_thread_num;
+  size_t block_num = (total_block + pool_thread_num - 1) / pool_thread_num;
+  return block_num;
+}
 }  // namespace
 
 bool IndexAddCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
@@ -100,6 +119,7 @@ void IndexAddCpuKernelMod::CheckParams() {
   x_nums_ = 1;
   y_nums_ = 1;
   inner_size_ = 1;
+  outer_size_ = 1;
   for (size_t i = 0; i < x_shape_.size(); ++i) {
     if (x_shape_[i] <= 0 || y_shape_[i] <= 0) {
       MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', 'x' shape[" << i << "] or 'y' shape [" << i
@@ -113,7 +133,9 @@ void IndexAddCpuKernelMod::CheckParams() {
     }
     x_nums_ *= LongToSize(x_shape_[i]);
     y_nums_ *= LongToSize(y_shape_[i]);
-    if (i > axis) {
+    if (i < axis) {
+      outer_size_ *= LongToSize(x_shape_[i]);
+    } else if (i > axis) {
       inner_size_ *= LongToSize(x_shape_[i]);
     }
   }
@@ -165,9 +187,15 @@ bool IndexAddCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &i
     }
   };
 
+  auto heavy_task_block = [this, task_block](const size_t start, const size_t end) {
+    task_block(start * y_axis_size_, end * y_axis_size_);
+  };
+
   const float block_size = 1024;
   const size_t inner_block_size = 100;
-  if (inner_size_ > 1 && inner_size_ <= inner_block_size) {
+  if (HasDuplicateIndex(indices, y_axis_size_)) {
+    ParallelLaunch(heavy_task_block, outer_size_, CalcSizePerThread(outer_size_), this);
+  } else if (inner_size_ > 1 && inner_size_ <= inner_block_size) {
     ParallelLaunch(task_block, y_nums_ / inner_size_, block_size / inner_size_, this);
   } else {
     ParallelLaunch(task, y_nums_, block_size, this);
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
index ce8e0486d06..005533029e8 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
@@ -63,6 +63,7 @@ class IndexAddCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper
   size_t x_nums_{1};
   size_t y_nums_{1};
   size_t inner_size_{1};
+  size_t outer_size_{1};
   size_t x_axis_size_{1};
   size_t y_axis_size_{1};
 };
diff --git a/tests/st/ops/cpu/test_pad_op.py b/tests/st/ops/cpu/test_pad_op.py
index 3ad126f8929..7dc05a537d7 100644
--- a/tests/st/ops/cpu/test_pad_op.py
+++ b/tests/st/ops/cpu/test_pad_op.py
@@ -148,7 +148,7 @@ class PadNet(nn.Cell):
 
 
 @pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 @pytest.mark.parametrize('mode', [context.GRAPH_MODE, context.PYNATIVE_MODE])
 @pytest.mark.parametrize('dtype', [np.bool_, np.uint8, np.uint16, np.uint32, np.uint64, np.int8, np.int16, np.int32,