acc arithmetic op

fangzehua 2022-10-27 16:18:58 +08:00
parent 527906dbd0
commit ec43c8b63b
9 changed files with 271 additions and 339 deletions

View File

@@ -139,7 +139,7 @@ mindspore.set_context
- **enable_compile_cache** (bool) - Whether to load or save the graph compiled by the frontend. When `enable_compile_cache` is set to True, a hardware-independent compile cache is generated during the first execution and exported as a MINDIR file. When the network is executed again, the compile cache is loaded if `enable_compile_cache` is still True and the network script has not changed. Note that only limited automatic detection of Python script changes is currently supported, so there is a correctness risk. Default: False. This is an experimental feature that may be changed or removed.
- **compile_cache_path** (str) - The path for saving the frontend graph compile cache. Default: ".". If the directory does not exist, it is created automatically. The cache is saved under `compile_cache_path/rank_${rank_id}/`, where `rank_id` is the ID of the current device in the cluster.
- **inter_op_parallel_num** (int) - Controls the number of operators executed in parallel. Default: 0, which means the framework chooses the value.
- **runtime_num_threads** (int) - The number of threads in the thread pool used by runtime actors and CPU operator kernels, which must be greater than 0. Default: 30. If you run multiple processes at the same time, set this value smaller to avoid thread contention.
- **runtime_num_threads** (int) - The number of threads in the thread pool used by runtime actors and CPU operator kernels, which must be greater than or equal to 0. Default: 30. If you run multiple processes at the same time, set this value smaller to avoid thread contention.
- **disable_format_transform** (bool) - Whether to disable the automatic NCHW-to-NHWC format transform. When the training performance of an fp16 network is worse than that of fp32, `disable_format_transform` can be set to True to try to improve performance. Default: False.
- **support_binary** (bool) - Whether to support running .pyc or .so files in graph mode. To do so, set `support_binary` to True and run the .py file once so that the interface source code is saved into the interface definition .py file (which must therefore be writable); the .py file can then be compiled into a .pyc or .so file and run in graph mode.
- **memory_optimize_level** (str) - The memory optimize level. Default: "O0". Its value must be one of ['O0', 'O1'].
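
For reference, a minimal usage sketch of the options above (the values shown are illustrative, not recommendations):

import mindspore as ms

ms.set_context(
    enable_compile_cache=True,      # export/reuse the frontend compile cache (MINDIR)
    compile_cache_path="./cache",   # cache lands in ./cache/rank_${rank_id}/
    inter_op_parallel_num=0,        # 0 lets the framework choose
    runtime_num_threads=8,          # keep small when many processes share the host
    memory_optimize_level="O0",
)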

View File

@@ -28,6 +28,7 @@
#include "plugin/device/cpu/kernel/nnacl/fp32/mul_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/power_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/sub_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/add_fp32.h"
namespace mindspore {
namespace kernel {
@@ -130,6 +131,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_);
output_element_num_.clear();
CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
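// The shapes may have changed in this Resize, so invalidate the cached broadcast index tables.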
is_init_broadcast_ = false;
return KRET_OK;
}
@@ -180,6 +182,20 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
}
private:
void InitBroadCast() {
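// Walk the broadcast iterator once and cache, for every output element, the flat offsets into both
// inputs; each launch can then gather through these tables instead of stepping its own iterator.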
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
base_iter.SetPos(0);
input_index1_.clear();
input_index2_.clear();
input_index1_.resize(output_size_);
input_index2_.resize(output_size_);
for (size_t i = 0; i < output_size_; i++) {
input_index1_[i] = base_iter.GetInputPosA();
input_index2_[i] = base_iter.GetInputPosB();
base_iter.GenNextPos();
}
is_init_broadcast_ = true;
}
void InitComputeFunc() {
if (kernel_name_ == kAssignAdd || kernel_name_ == kAssignSub) {
return;
@@ -229,10 +245,13 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
ShapeVector input_shape1_;
ShapeVector input_shape2_;
std::vector<size_t> input_index1_;
std::vector<size_t> input_index2_;
std::vector<size_t> input_element_num1_;
std::vector<size_t> input_element_num2_;
ShapeVector output_shape_;
std::vector<size_t> output_element_num_;
bool is_init_broadcast_{false};
using TypeComputeFunc = std::function<void(ArithmeticCpuTypeFunc *, const T *in_x, const T *in_y, T *out)>;
TypeComputeFunc compute_func_{nullptr};
@@ -268,13 +287,32 @@ void ArithmeticCpuTypeFunc<T>::AssignSub(T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if constexpr (std::is_same_v<T, float>) {
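// Float fast paths: identical shapes take the vectorized nnacl ElementAdd, and a single-element
// input paired with a tensor takes ElementOptAdd; only general broadcasting falls through below.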
if (input_shape1_ == input_shape2_) {
auto task = [input1, input2, out](size_t start, size_t end) {
(void)ElementAdd(input1 + start, input2 + start, out + start, end - start);
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) {
auto task = [this, input1, input2, out](size_t start, size_t end) {
if (op_para_.in_elements_num0_ == 1) {
(void)ElementOptAdd(input1, input2 + start, out + start, end - start, &op_para_);
} else {
(void)ElementOptAdd(input1 + start, input2, out + start, end - start, &op_para_);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
}
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
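
The same pattern — broadcast offsets computed once, then reused by every elementwise task — repeats in the functions below. A minimal NumPy sketch of the idea (the helper name and shapes are illustrative, not part of the kernel):

import numpy as np

def build_broadcast_indices(in_shape, out_shape):
    """Flat input offset for every flat output position, computed once."""
    rank = len(out_shape)
    padded = [1] * (rank - len(in_shape)) + list(in_shape)
    coords = np.indices(out_shape).reshape(rank, -1)  # output coordinates
    coords = coords % np.array(padded)[:, None]       # size-1 dims always map to 0
    strides = np.array([int(np.prod(padded[i + 1:])) for i in range(rank)])
    return (strides[:, None] * coords).sum(axis=0)

a = np.arange(6, dtype=np.float32).reshape(2, 3)
b = np.arange(3, dtype=np.float32).reshape(1, 3)
ia = build_broadcast_indices(a.shape, (2, 3))  # plays the role of input_index1_
ib = build_broadcast_indices(b.shape, (2, 3))  # plays the role of input_index2_
out = a.ravel()[ia] + b.ravel()[ib]            # out[i] = a[ia[i]] + b[ib[i]]
assert np.allclose(out.reshape(2, 3), a + b)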
@@ -282,13 +320,12 @@ void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::AddV2(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -316,14 +353,12 @@ void ArithmeticCpuTypeFunc<T>::Sub(const T *input1, const T *input2, T *out) {
return;
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] - input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -351,17 +386,16 @@ void ArithmeticCpuTypeFunc<T>::Mul(const T *input1, const T *input2, T *out) {
return;
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] && input2[iter.GetInputPosB()]);
out[i] = static_cast<T>(input1[input_index1_[i]] && input2[input_index2_[i]]);
} else {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]);
out[i] = static_cast<T>(input1[input_index1_[i]] * input2[input_index2_[i]]);
}
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -391,14 +425,14 @@ void ArithmeticCpuTypeFunc<T>::RealDiv(const T *input1, const T *input2, T *out)
return;
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -442,14 +476,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
return;
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
out[i] = std::numeric_limits<T>::quiet_NaN();
@@ -463,14 +497,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
template <typename T>
void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -492,14 +526,14 @@ void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -516,14 +550,14 @@ void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *o
template <typename T>
void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if constexpr (std::is_same_v<T, double>) {
if (common::IsDoubleEqual(divisor, zero)) {
@@ -551,14 +585,14 @@ void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out
template <typename T>
void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -580,14 +614,14 @@ void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out
template <typename T>
void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
iter.GenNextPos();
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
auto data_div = x / y;
auto data_div_min = data_div < 0.0 ? data_div : 0.0;
auto data_div_max = data_div > 0.0 ? data_div : 0.0;
@@ -602,14 +636,14 @@ void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::FloorMod(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
iter.GenNextPos();
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
auto res = x - floor(x / y) * y;
out[i] = static_cast<T>((std::abs(res) > 1e-9) && ((res < 0.0) != (y < 0.0)) ? res + y : res);
}
@@ -650,70 +684,63 @@ void ArithmeticCpuTypeFunc<T>::Pow(const T *input1, const T *input2, T *out) {
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
if (output_size_ > kMaxPowSerialSize) {
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(x, y));
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
auto sx = static_cast<double>(input1[base_iter.GetInputPosA()]);
auto sy = static_cast<double>(input2[base_iter.GetInputPosB()]);
auto sx = static_cast<double>(input1[input_index1_[i]]);
auto sy = static_cast<double>(input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(sx, sy));
base_iter.GenNextPos();
}
}
}
template <typename T>
void ArithmeticCpuTypeFunc<T>::PowComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
if (output_size_ > kMaxPowSerialSize) {
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = (input1[iter.GetInputPosA()]);
auto y = (input2[iter.GetInputPosB()]);
auto x = (input1[input_index1_[i]]);
auto y = (input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(x, y));
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
auto sx = (input1[base_iter.GetInputPosA()]);
auto sy = (input2[base_iter.GetInputPosB()]);
auto sx = (input1[input_index1_[i]]);
auto sy = (input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(sx, sy));
base_iter.GenNextPos();
}
}
}
template <typename T>
void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(diff);
} else {
out[i] = static_cast<T>(diff * diff);
}
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -721,14 +748,13 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input
template <typename T>
void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
out[i] = static_cast<T>(std::conj(diff) * diff);
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -736,15 +762,15 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T
template <typename T>
void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x1 = input1[iter.GetInputPosA()];
auto x2 = input2[iter.GetInputPosB()];
auto x1 = input1[input_index1_[i]];
auto x2 = input2[input_index2_[i]];
auto logx2 = log(x2);
iter.GenNextPos();
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(x1 && static_cast<bool>(logx2));
} else {
@@ -757,14 +783,13 @@ void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::Atan2(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(
atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])));
iter.GenNextPos();
atan2(static_cast<double>(input1[input_index1_[i]]), static_cast<double>(input2[input_index2_[i]])));
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -776,6 +801,41 @@ std::shared_ptr<CpuKernelFunc> SpecializeArithFunc() {
}
using ArithmeticCpuFuncCreator = std::function<std::shared_ptr<CpuKernelFunc>()>;
static std::map<std::string, std::vector<std::pair<KernelAttr, ArithmeticCpuFuncCreator>>> kernel_attr_list = {
{kAdd,
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
SpecializeArithFunc<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
SpecializeArithFunc<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SpecializeArithFunc<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
SpecializeArithFunc<float>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
SpecializeArithFunc<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
SpecializeArithFunc<double>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
SpecializeArithFunc<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
SpecializeArithFunc<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
SpecializeArithFunc<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
SpecializeArithFunc<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
SpecializeArithFunc<bool>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
SpecializeArithFunc<float16>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex64)
.AddInputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeComplex64),
SpecializeArithFunc<complex64>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex128)
.AddInputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeComplex128),
SpecializeArithFunc<complex128>}}},
{kSub,
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
SpecializeArithFunc<int8_t>},
@@ -1182,7 +1242,8 @@ std::vector<KernelAttr> ArithmeticCpuKernelMod::GetOpSupport() {
return support_list;
}
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Add,
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kAdd); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Sub,
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kSub); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Mul,

View File

@@ -25,6 +25,7 @@
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "utils/check_convert_utils.h"
#include "ops/reduce.h"
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
namespace mindspore {
namespace kernel {
@@ -60,6 +61,8 @@ class ReduceCpuKernelFunc : public CpuKernelFunc {
void AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size);
void ChooseFunc(const std::string &kernel_name_);
void HandleInputAxis();
void SpecialExcute();
void CalAxesAndStride(std::vector<size_t> *axes, size_t *stride);
enum class ReduceFuncType {
kReduceAllType,
@@ -179,6 +182,26 @@ void ReduceAny(const T *in, T *out, size_t start, size_t end, TransposeIterator
}
}
template <typename T>
void ReduceCpuKernelFunc<T>::SpecialExcute() {
// Special acceleration when axis == [1] and the input has 2 dims.
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
simple_execute_ = true;
}
// Special acceleration when axis[0] == 0 and every other reduced dim has size 1.
if (reduce_type_ == ReduceFuncType::kReduceSumType && axis_.size() >= 1 && axis_[0] == 0 &&
input_shape_.size() >= kDim2) {
simple_execute_ = true;
for (size_t i = 1; i < axis_.size(); ++i) {
if (static_cast<int64_t>(input_shape_.size()) > axis_[i] && input_shape_[axis_[i]] != 1) {
simple_execute_ = false;
break;
}
}
}
}
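
As a quick sanity check of what the two fast paths compute, here is an illustrative NumPy equivalent (shapes are hypothetical):

import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)
# Path 1: axis == [1] on a 2-D input reduces each contiguous row.
assert np.allclose(x.sum(axis=1), [row.sum() for row in x])
# Path 2: axis[0] == 0 and every other reduced dim has size 1, i.e. column sums.
y = x.reshape(3, 1, 4)
assert np.allclose(y.sum(axis=(0, 1)), x.sum(axis=0))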
template <typename T>
void ReduceCpuKernelFunc<T>::HandleInputAxis() {
int64_t dimension = SizeToLong(input_shape_.size());
@@ -201,12 +224,8 @@ void ReduceCpuKernelFunc<T>::HandleInputAxis() {
sort(axis_.begin(), axis_.end());
auto last = std::unique(axis_.begin(), axis_.end());
axis_.erase(last, axis_.end());
// special accelerate for axis = 1 and input has 2 dims
if constexpr (std::is_same<T, float>::value) {
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
simple_execute_ = true;
}
SpecialExcute();
}
}
@@ -285,6 +304,26 @@ void ReduceCpuKernelFunc<T>::InitFunc(const BaseOperatorPtr &base_operator, cons
ChooseFunc(kernel_name_);
}
template <typename T>
void ReduceCpuKernelFunc<T>::CalAxesAndStride(std::vector<size_t> *axes, size_t *stride) {
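// Transpose order: non-reduced dims come first (in their original order), reduced dims last;
// stride accumulates the product of the reduced dim sizes.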
int dimension = SizeToInt(input_shape_.size());
size_t j = 0;
size_t k = 0;
for (int i = 0; i < dimension; ++i) {
if (j == axis_.size() || i != axis_[j]) {
(*axes)[k] = IntToSize(i);
++k;
} else {
*stride *= LongToSize(input_shape_[IntToSize(i)]);
++j;
}
}
for (auto &it : axis_) {
(*axes)[k] = IntToSize(it);
++k;
}
}
template <typename T>
bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
@@ -314,43 +353,41 @@ bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inpu
}
} else {
// Calculate transpose axes and stride
int dimension = SizeToInt(input_shape_.size());
size_t stride = 1;
std::vector<size_t> axes(input_shape_.size());
size_t j = 0;
size_t k = 0;
for (int i = 0; i < dimension; ++i) {
if (j == axis_.size() || i != axis_[j]) {
axes[k] = i;
++k;
} else {
stride *= LongToSize(input_shape_[IntToSize(i)]);
++j;
}
}
for (auto &it : axis_) {
axes[k] = it;
++k;
}
CalAxesAndStride(&axes, &stride);
size_t output_size = outputs[0]->size / sizeof(T);
if constexpr (std::is_same<T, float>::value) {
if (simple_execute_) {
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
output_addr[i] /= SizeToFloat(stride);
if (axis_[0] == 1) {
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
output_addr[i] /= SizeToFloat(stride);
}
}
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
} else {
auto task = [&](size_t start, size_t end) {
int ret =
ReduceSumDim2Axis0(end - start, output_size, input_shape_[0], input_addr + start, output_addr + start);
if (ret != NNACL_OK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', ReduceSumDim2Axis0 failed. Error no: " << ret;
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
}
}
}
// Calculate transpose shape
std::vector<int64_t> transpose_shape(input_shape_.size());
int dimension = SizeToInt(input_shape_.size());
for (int i = 0; i < dimension; ++i) {
transpose_shape[i] = input_shape_[axes[i]];
}

View File

@@ -1,113 +0,0 @@
/**
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/cpu/kernel/tensoradd_cpu_kernel.h"
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kTensorAddInputsSize = 2;
constexpr size_t kTensorAddOutputsSize = 1;
} // namespace
void TensorAddCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
// Init shape and strides
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
auto kernel_attr = GetKernelAttrFromNode(kernel_node);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(EXCEPTION) << "Add does not support this kernel data type: " << kernel_attr;
}
kernel_func_ = func_list_[index].second;
}
template <typename T>
bool TensorAddCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTensorAddInputsSize, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTensorAddOutputsSize, kernel_name_);
T *input_addr_a = reinterpret_cast<T *>(inputs[0]->addr);
T *input_addr_b = reinterpret_cast<T *>(inputs[1]->addr);
T *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t output_size = outputs[0]->size / sizeof(T);
if (input_shape_a_ == input_shape_b_) {
auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr_a[i] + input_addr_b[i];
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
} else { // Broadcast
BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
}
return true;
}
std::vector<std::pair<KernelAttr, TensorAddCpuKernelMod::AddFunc>> TensorAddCpuKernelMod::func_list_ = {
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&TensorAddCpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&TensorAddCpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
&TensorAddCpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
&TensorAddCpuKernelMod::LaunchKernel<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&TensorAddCpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
&TensorAddCpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
&TensorAddCpuKernelMod::LaunchKernel<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
&TensorAddCpuKernelMod::LaunchKernel<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&TensorAddCpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&TensorAddCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&TensorAddCpuKernelMod::LaunchKernel<float16>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&TensorAddCpuKernelMod::LaunchKernel<bool>}};
std::vector<KernelAttr> TensorAddCpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, AddFunc> &pair) { return pair.first; });
return support_list;
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, Add, TensorAddCpuKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@@ -1,57 +0,0 @@
/**
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
#include <vector>
#include <utility>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
class TensorAddCpuKernelMod : public DeprecatedNativeCpuKernelMod {
public:
TensorAddCpuKernelMod() = default;
~TensorAddCpuKernelMod() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override {
return kernel_func_(this, inputs, workspace, outputs);
}
std::vector<KernelAttr> GetOpSupport() override;
private:
template <typename T>
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs);
using AddFunc = std::function<bool(TensorAddCpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
static std::vector<std::pair<KernelAttr, AddFunc>> func_list_;
AddFunc kernel_func_;
std::vector<int64_t> input_shape_a_;
std::vector<int64_t> input_shape_b_;
std::vector<int64_t> output_shape_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_

View File

@@ -71,7 +71,6 @@ int ActorMgr::Initialize(bool use_inner_pool, size_t actor_thread_num, size_t ma
return MINDRT_ERROR;
}
inner_pool_->SetActorThreadNum(actor_thread_num);
inner_pool_->DisableOccupiedActorThread();
inner_pool_->SetKernelThreadNum(max_thread_num - actor_thread_num);
}
if (inner_pool_ != nullptr) {

View File

@@ -191,10 +191,15 @@ void Worker::Active(std::vector<TaskSplit> *task_list, int task_id_start, int ta
std::lock_guard<std::mutex> _l(mutex_);
// add the first split to task_ (when the worker is idle) and the rest to the local queue.
status_ = kThreadBusy;
task_id_.store(task_id_start, std::memory_order_relaxed);
THREAD_TEST_TRUE(task_ == nullptr);
task_.store((*task_list)[0].task_, std::memory_order_release);
for (int i = task_id_start + 1; i < task_id_end; ++i) {
Task *task = task_.load(std::memory_order_consume);
int to_atomic_task = 0;
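// Claim the atomic task_ slot only when this worker is currently idle; otherwise every split in
// this batch goes through the local task queue.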
if (task == nullptr) {
task_id_.store(task_id_start, std::memory_order_relaxed);
THREAD_TEST_TRUE(task_ == nullptr);
task_.store((*task_list)[0].task_, std::memory_order_release);
to_atomic_task = 1;
}
for (int i = task_id_start + to_atomic_task; i < task_id_end; ++i) {
while (!local_task_queue_->Enqueue(&(*task_list)[i])) {
}
}

View File

@@ -169,7 +169,7 @@ class MS_CORE_API ThreadPool {
void DisableOccupiedActorThread() { occupied_actor_thread_ = false; }
void SetActorThreadNum(size_t actor_thread_num) { actor_thread_num_ = actor_thread_num; }
void SetKernelThreadNum(size_t kernel_thread_num) { kernel_thread_num_ = kernel_thread_num; }
size_t GetKernelThreadNum() const { return kernel_thread_num_; }
size_t GetKernelThreadNum() const { return kernel_thread_num_ + actor_thread_num_; }
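// The reported kernel-thread count now includes the actor threads as well.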
size_t GetActorThreadNum() const { return actor_thread_num_; }
void SetKernelThreadMaxSpinCount(int spin_count);
void SetSpinCountMaxValue();

View File

@@ -375,8 +375,8 @@ class _Context:
def set_runtime_num_threads(self, runtime_num_threads):
"""Check and set runtime_num_threads."""
if runtime_num_threads <= 0:
raise ValueError("The number of threads must be greater than 0.")
if runtime_num_threads < 0:
raise ValueError("The number of threads must be greater than or equal to 0.")
self.set_param(ms_ctx_param.runtime_num_threads, runtime_num_threads)
def set_op_timeout(self, op_timeout):
@@ -924,7 +924,7 @@ def set_context(**kwargs):
inter_op_parallel_num(int): The number of operators that can execute in parallel at the same time. Default value is 0,
which means the framework uses the default number.
runtime_num_threads(int): The number of thread-pool threads used by CPU kernels at runtime,
which must be greater than 0. Default value is 30; if you run many processes at
which must be greater than or equal to 0. Default value is 30; if you run many processes at
the same time, you should set the value smaller to avoid thread contention.
disable_format_transform (bool): Whether to disable the automatic format transform function from NCHW to NHWC.
When the network training performance of fp16 is worse than fp32,