diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
index fcf67cea3af..84b6e5d1a3d 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.cc
@@ -20,16 +20,35 @@
 #include
 #include
 #include
+#include <unordered_set>
 #include "mindspore/core/ops/index_add.h"
 #include "plugin/device/cpu/hal/device/cpu_device_address.h"
-#include "include/common/thread_pool.h"
 
 namespace mindspore {
 namespace kernel {
 namespace {
 constexpr size_t kIndexAddInputsNum = 3;
 constexpr size_t kIndexAddOutputsNum = 1;
+
+bool HasDuplicateIndex(const int32_t *indices, size_t len) {
+  MS_EXCEPTION_IF_NULL(indices);
+  std::unordered_set<int32_t> unique_idx;
+  for (size_t i = 0; i < len; ++i) {
+    if (unique_idx.find(indices[i]) != unique_idx.end()) {
+      return true;
+    }
+    unique_idx.insert(indices[i]);
+  }
+  return false;
+}
+
+size_t CalcSizePerThread(size_t total_block) {
+  size_t pool_thread_num = GetActorMgrInnerThreadPool()->GetKernelThreadNum();
+  pool_thread_num = pool_thread_num == 0 ? 1 : pool_thread_num;
+  size_t block_num = (total_block + pool_thread_num - 1) / pool_thread_num;
+  return block_num;
+}
 }  // namespace
 
 bool IndexAddCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
@@ -100,6 +119,7 @@ void IndexAddCpuKernelMod::CheckParams() {
   x_nums_ = 1;
   y_nums_ = 1;
   inner_size_ = 1;
+  outer_size_ = 1;
   for (size_t i = 0; i < x_shape_.size(); ++i) {
     if (x_shape_[i] <= 0 || y_shape_[i] <= 0) {
       MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', 'x' shape[" << i << "] or 'y' shape [" << i
@@ -113,7 +133,9 @@ void IndexAddCpuKernelMod::CheckParams() {
     }
     x_nums_ *= LongToSize(x_shape_[i]);
     y_nums_ *= LongToSize(y_shape_[i]);
-    if (i > axis) {
+    if (i < axis) {
+      outer_size_ *= LongToSize(x_shape_[i]);
+    } else if (i > axis) {
       inner_size_ *= LongToSize(x_shape_[i]);
     }
   }
@@ -165,9 +187,15 @@ bool IndexAddCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &i
     }
   };
 
+  auto heavy_task_block = [this, task_block](const size_t start, const size_t end) {
+    task_block(start * y_axis_size_, end * y_axis_size_);
+  };
+
   const float block_size = 1024;
   const size_t inner_block_size = 100;
-  if (inner_size_ > 1 && inner_size_ <= inner_block_size) {
+  if (HasDuplicateIndex(indices, y_axis_size_)) {
+    ParallelLaunch(heavy_task_block, outer_size_, CalcSizePerThread(outer_size_), this);
+  } else if (inner_size_ > 1 && inner_size_ <= inner_block_size) {
     ParallelLaunch(task_block, y_nums_ / inner_size_, block_size / inner_size_, this);
   } else {
     ParallelLaunch(task, y_nums_, block_size, this);
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
index ce8e0486d06..005533029e8 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/index_add_cpu_kernel.h
@@ -63,6 +63,7 @@ class IndexAddCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper
   size_t x_nums_{1};
   size_t y_nums_{1};
   size_t inner_size_{1};
+  size_t outer_size_{1};
   size_t x_axis_size_{1};
   size_t y_axis_size_{1};
 };
diff --git a/tests/st/ops/cpu/test_pad_op.py b/tests/st/ops/cpu/test_pad_op.py
index 3ad126f8929..7dc05a537d7 100644
--- a/tests/st/ops/cpu/test_pad_op.py
+++ b/tests/st/ops/cpu/test_pad_op.py
@@ -148,7 +148,7 @@ class PadNet(nn.Cell):
 
 
 @pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 @pytest.mark.parametrize('mode', [context.GRAPH_MODE, context.PYNATIVE_MODE])
 @pytest.mark.parametrize('dtype', [np.bool_, np.uint8, np.uint16, np.uint32, np.uint64, np.int8, np.int16, np.int32,