forked from mindspore-Ecosystem/mindspore
migrates aicpu kernels to MS from lqk
parent 2f3d008c2b
commit 389acff921
@@ -86,4 +86,4 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
@@ -130,4 +130,5 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "build/include"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/end_of_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/end_of_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/casting"
@@ -387,6 +387,7 @@ constexpr auto kLSTMOpName = "LSTM";
constexpr auto kLuUnpackOpName = "LuUnpack";
constexpr auto kMaskedFillOpName = "MaskedFill";
constexpr auto kMaskedSelectOpName = "MaskedSelect";
constexpr auto kMaskedSelectGradOpName = "MaskedSelectGrad";
constexpr auto kMatMulOpName = "MatMul";
constexpr auto kMatMulV2OpName = "MatMulV2";
constexpr auto kMatrixDiagOpName = "MatrixDiag";
@@ -1,127 +0,0 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "cpu_kernel/ms_kernel/acos.h"

#include <unsupported/Eigen/CXX11/Tensor>
#include <algorithm>

#include "cpu_kernel/common/cpu_kernel_utils.h"
#include "cpu_kernel/inc/cpu_types.h"
#include "mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/common/kernel_log.h"
#include "cpu_kernel/common/status.h"
#include "utils/kernel_util.h"

namespace {
const std::uint32_t kAcosInputNum{1u};
const std::uint32_t kAcosOutputNum{1u};
const char *const kAcos{"Acos"};
const std::int64_t kAcosParallelNum{64 * 1024};
} // namespace

namespace aicpu {
namespace detail {
template <typename T>
inline T ScalarAcos(const T x) {
  return std::acos(x);
}

template <>
inline Eigen::half ScalarAcos(const Eigen::half x) {
  const Eigen::half val{static_cast<Eigen::half>(std::acos(static_cast<std::float_t>(x)))};
  return val;
}

inline std::uint32_t ParallelForAcos(const CpuKernelContext &ctx, std::int64_t total, std::int64_t per_unit_size,
                                     const std::function<void(std::int64_t, std::int64_t)> &work) {
  if (total > kAcosParallelNum)
    return aicpu::CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, work);
  else
    work(0, total);
  return KERNEL_STATUS_OK;
}

template <typename T>
inline std::uint32_t ComputeAcosKernel(const CpuKernelContext &ctx) {
  T *input0{static_cast<T *>(ctx.Input(0)->GetData())};
  T *output{static_cast<T *>(ctx.Output(0)->GetData())};
  std::int64_t total{ctx.Input(0)->NumElements()};
  std::uint32_t cores{aicpu::CpuKernelUtils::GetCPUNum(ctx)};
  std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
  return ParallelForAcos(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
    std::transform(input0 + begin, input0 + end, output + begin, ScalarAcos<T>);
  });
}

template <typename T>
inline std::uint32_t ComputeAcos(const CpuKernelContext &ctx) {
  std::uint32_t result{ComputeAcosKernel<T>(ctx)};
  if (result != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("Acos compute failed.");
  }
  return result;
}

inline std::uint32_t ExtraCheckAcos(const CpuKernelContext &ctx) {
  if (ctx.Input(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get input data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Output(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get output data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
    KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
                     DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
    KERNEL_LOG_ERROR(
      "The data size of the input [%llu] need be the same as the output "
      "[%llu].",
      ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

inline std::uint32_t CheckAcos(const CpuKernelContext &ctx, std::uint32_t inputs_num, std::uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : ExtraCheckAcos(ctx);
}

inline std::uint32_t ComputeAcos(const CpuKernelContext &ctx) {
  DataType input_type{ctx.Input(0)->GetDataType()};
  switch (input_type) {
    case DT_FLOAT16:
      return ComputeAcos<Eigen::half>(ctx);
    case DT_FLOAT:
      return ComputeAcos<std::float_t>(ctx);
    case DT_DOUBLE:
      return ComputeAcos<std::double_t>(ctx);
    default:
      KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}
} // namespace detail

std::uint32_t AcosCpuKernel::Compute(const CpuKernelContext &ctx) {
  return detail::CheckAcos(ctx, kAcosInputNum, kAcosOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::ComputeAcos(ctx);
}

REGISTER_CPU_KERNEL(kAcos, AcosCpuKernel);
} // namespace aicpu
@@ -0,0 +1,154 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "cache_swap_table.h"
#include <securec.h>
#include <map>
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/sparse_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *const kCacheSwapTable = "CacheSwapTable";
}

namespace aicpu {
template <typename T>
uint32_t CacheSwapTableTask(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, int64_t batch_size,
                            int64_t output_size, int64_t one_line_col, int type_size) {
  if (inputs.size() == 0 || outputs.size() == 0) {
    KERNEL_LOG_ERROR("CacheSwapTable input or output is empty.");
    return KERNEL_STATUS_PARAM_INVALID;
  }

  char *cache_table = reinterpret_cast<char *>(inputs[0]->GetData());
  T *swap_cache_idx = reinterpret_cast<T *>(inputs[1]->GetData());
  uint64_t swap_cache_idx_size = inputs[1]->GetDataSize();
  char *miss_value = reinterpret_cast<char *>(inputs[2]->GetData());

  char *old_value = reinterpret_cast<char *>(outputs[0]->GetData());

  errno_t ret = memset_s(old_value, static_cast<size_t>(output_size * type_size), 0x00,
                         static_cast<size_t>(output_size * type_size));
  if (ret != EOK) {
    KERNEL_LOG_ERROR("Memset failed, result[%d]", ret);
    return KERNEL_STATUS_INNER_ERROR;
  }

  uint64_t single_copy_size = static_cast<uint64_t>(type_size * one_line_col);

  if (swap_cache_idx_size < static_cast<uint64_t>(batch_size)) {
    KERNEL_LOG_ERROR(
      "The value of swap_cache_idx_size:[%llu] must not be less than "
      "batch_size:[%lld]",
      swap_cache_idx_size, batch_size);
    return KERNEL_STATUS_INNER_ERROR;
  }

  uint64_t old_value_size = outputs[0]->GetDataSize();
  uint64_t cache_table_size = inputs[0]->GetDataSize();
  for (int64_t i = 0; i < batch_size; ++i) {
    if (swap_cache_idx[i] < 0) {
      continue;
    }
    ret = memcpy_s(old_value + i * single_copy_size, old_value_size, cache_table + swap_cache_idx[i] * single_copy_size,
                   single_copy_size);
    old_value_size -= single_copy_size;
    if (ret != EOK) {
      KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
      return KERNEL_STATUS_INNER_ERROR;
    }
    ret = memcpy_s(cache_table + swap_cache_idx[i] * single_copy_size, cache_table_size,
                   miss_value + i * single_copy_size, single_copy_size);
    cache_table_size -= single_copy_size;
    if (ret != EOK) {
      KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
      return KERNEL_STATUS_INNER_ERROR;
    }
  }
  return KERNEL_STATUS_OK;
}

uint32_t CacheSwapTableMsCpuKernel::DoCompute() {
  std::map<int, std::function<uint32_t(std::vector<Tensor *> &, std::vector<Tensor *> &, int64_t &, int64_t &,
                                       int64_t &, int &)>>
    calls;
  calls[DT_INT32] = CacheSwapTableTask<int32_t>;
  calls[DT_INT64] = CacheSwapTableTask<int64_t>;

  if (calls.find(indices_type_) == calls.end()) {
    KERNEL_LOG_ERROR(
      "CacheSwapTableMsCpuKernel op doesn't support indices tensor types: "
      "[%s]",
      DTypeStr(indices_type_).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  int type_size = GetSizeByDataType(param_type_);
  return calls[indices_type_](inputs_, outputs_, batch_size_, output_size_, one_line_col_, type_size);
}

uint32_t CacheSwapTableMsCpuKernel::GetInputAndCheck(const CpuKernelContext &ctx) {
  KERNEL_LOG_INFO("GetInputAndCheck start!");
  // get input Tensors
  const uint32_t kNumInput = 3;
  for (uint32_t i = 0; i < kNumInput; ++i) {
    Tensor *tensor = ctx.Input(i);
    KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get input tensor[%d] failed", i)
    inputs_.push_back(tensor);
  }
  // get output Tensors
  const uint32_t kNumOutput = 1;
  for (uint32_t i = 0; i < kNumOutput; ++i) {
    Tensor *tensor = ctx.Output(i);
    KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get output tensor[%d] failed", i)
    outputs_.push_back(tensor);
  }
  // get param type
  param_type_ = static_cast<DataType>(inputs_[0]->GetDataType());
  indices_type_ = static_cast<DataType>(inputs_[1]->GetDataType());
  KERNEL_LOG_INFO("GetInputAndCheck success!");

  std::shared_ptr<TensorShape> cache_table_shape = ctx.Input(0)->GetTensorShape();
  std::shared_ptr<TensorShape> indices_shape = ctx.Input(1)->GetTensorShape();

  for (int32_t i = 1; i < cache_table_shape->GetDims(); ++i) {
    KERNEL_CHECK_ASSIGN_64S_MULTI(one_line_col_, cache_table_shape->GetDimSize(i), one_line_col_,
                                  KERNEL_STATUS_PARAM_INVALID);
  }
  for (int32_t i = 0; i < indices_shape->GetDims(); ++i) {
    KERNEL_CHECK_ASSIGN_64S_MULTI(batch_size_, indices_shape->GetDimSize(i), batch_size_, KERNEL_STATUS_PARAM_INVALID);
  }
  output_size_ = batch_size_ * one_line_col_;
  return KERNEL_STATUS_OK;
}

uint32_t CacheSwapTableMsCpuKernel::Compute(const CpuKernelContext &ctx) {
  uint32_t res = GetInputAndCheck(ctx);
  if (res != KERNEL_STATUS_OK) {
    return res;
  }

  res = DoCompute();
  if (res != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("Compute failed");
    return res;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kCacheSwapTable, CacheSwapTableMsCpuKernel);
} // namespace aicpu
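// Illustrative sketch (not part of the kernel above): the per-row swap that
// CacheSwapTableTask performs, spelled out on plain buffers. All names below
// are local to this example; the real kernel additionally checks every
// memcpy_s return code.
inline void CacheSwapRowsExample(char *cache_table, uint64_t cache_table_size, const int64_t *swap_idx,
                                 int64_t batch_size, const char *miss_value, char *old_value, uint64_t row_bytes) {
  for (int64_t i = 0; i < batch_size; ++i) {
    if (swap_idx[i] < 0) {
      continue;  // negative index: leave the zero-filled output row untouched
    }
    char *cache_row = cache_table + swap_idx[i] * row_bytes;
    // export the evicted cache row, then install the miss row in its place
    (void)memcpy_s(old_value + i * row_bytes, row_bytes, cache_row, row_bytes);
    (void)memcpy_s(cache_row, cache_table_size, miss_value + i * row_bytes, row_bytes);
  }
}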
@@ -0,0 +1,44 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H
#define AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H

#include <cmath>
#include <vector>
#include "cpu_ops_kernel.h"

namespace aicpu {
class CacheSwapTableMsCpuKernel : public CpuKernel {
 public:
  ~CacheSwapTableMsCpuKernel() = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t DoCompute();

  uint32_t GetInputAndCheck(const CpuKernelContext &ctx);

  int64_t batch_size_ = 1;
  int64_t one_line_col_ = 1;
  int64_t output_size_ = 1;

  std::vector<Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
  DataType param_type_ = DT_FLOAT;
  DataType indices_type_ = DT_INT32;
};
} // namespace aicpu
#endif
@@ -0,0 +1,143 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fill.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *const kFill = "Fill";
}

namespace aicpu {
template <typename T>
void FillGenerateCase(Tensor *&value_tensor, Tensor *&output) {
  auto value = *(reinterpret_cast<T *>(value_tensor->GetData()));
  if (AddrAlignedCheck(output->GetData())) {
    Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Aligned> eigen_output(static_cast<T *>(output->GetData()),
                                                                       output->GetTensorShape()->NumElements());
    eigen_output.setConstant(value);
  } else {
    Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Unaligned> eigen_output(static_cast<T *>(output->GetData()),
                                                                         output->GetTensorShape()->NumElements());
    eigen_output.setConstant(value);
  }
}

uint32_t FillCpuKernel::GetDimsByType(const CpuKernelContext &ctx) {
  dims.clear();
  Tensor *dims_tensor = ctx.Input(0);
  KERNEL_CHECK_NULLPTR(dims_tensor, KERNEL_STATUS_PARAM_INVALID, "Get dims input failed")
  uint32_t ret;
  auto dims_dtype = dims_tensor->GetDataType();
  switch (dims_dtype) {
    case (DT_INT32):
      ret = CalcDims<int32_t>(dims_tensor, dims);
      break;
    case (DT_INT64):
      ret = CalcDims<int64_t>(dims_tensor, dims);
      break;
    default:
      KERNEL_LOG_ERROR(
        "Fill kernel dims data_type [%u] not supported, supported data_types: "
        "DT_INT32, DT_INT64",
        dims_dtype);
      return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ret != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("Fill kernel calculate dims failed");
  }
  return ret;
}

uint32_t FillCpuKernel::Compute(const CpuKernelContext &ctx) {
  uint32_t check = GetDimsByType(ctx);
  if (check != KERNEL_STATUS_OK) {
    return check;
  }
  Tensor *value_tensor = ctx.Input(1);
  KERNEL_CHECK_NULLPTR(value_tensor, KERNEL_STATUS_PARAM_INVALID, "Get value input failed")
  KERNEL_CHECK_NULLPTR(value_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get value input data failed")
  KERNEL_CHECK_NULLPTR(value_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get value input shape failed")
  if (!value_tensor->GetTensorShape()->GetDimSizes().empty()) {
    KERNEL_LOG_ERROR("Fill kernel value input is not a scalar.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  Tensor *output = ctx.Output(0);
  KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
  KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
  KERNEL_CHECK_NULLPTR(output->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output shape failed")
  if (output->GetTensorShape()->GetDimSizes() != dims) {
    KERNEL_LOG_ERROR("Fill kernel output shape does not match.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  auto input_dtype = value_tensor->GetDataType();
  auto output_dtype = output->GetDataType();
  if (input_dtype != output_dtype) {
    KERNEL_LOG_ERROR("Fill kernel data type does not match, value input dtype [%u], output dtype [%u].", input_dtype,
                     output_dtype);
    return KERNEL_STATUS_PARAM_INVALID;
  }

  std::map<int, std::function<void(Tensor *&, Tensor *&)>> calls;
  calls[DT_INT8] = FillGenerateCase<int8_t>;
  calls[DT_UINT8] = FillGenerateCase<uint8_t>;
  calls[DT_INT16] = FillGenerateCase<int16_t>;
  calls[DT_UINT16] = FillGenerateCase<uint16_t>;
  calls[DT_INT32] = FillGenerateCase<int32_t>;
  calls[DT_UINT32] = FillGenerateCase<uint32_t>;
  calls[DT_INT64] = FillGenerateCase<int64_t>;
  calls[DT_UINT64] = FillGenerateCase<uint64_t>;
  calls[DT_BOOL] = FillGenerateCase<bool>;
  calls[DT_FLOAT16] = FillGenerateCase<Eigen::half>;
  calls[DT_FLOAT] = FillGenerateCase<float>;
  calls[DT_DOUBLE] = FillGenerateCase<double>;

  if (calls.find(output_dtype) == calls.end()) {
    KERNEL_LOG_ERROR("Fill kernel data type [%u] not supported", output_dtype);
    return KERNEL_STATUS_PARAM_INVALID;
  }
  calls[output_dtype](value_tensor, output);
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t FillCpuKernel::CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dim_vec) {
  uint64_t data_num = dims_tensor->GetDataSize() / sizeof(T);
  if (data_num == 0) {
    KERNEL_LOG_INFO("Fill kernel: dims is empty, fill scalar output.");
    return KERNEL_STATUS_OK;
  }

  KERNEL_CHECK_NULLPTR(dims_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get dims data failed")
  for (uint64_t i = 0; i < data_num; i++) {
    auto dim = *(reinterpret_cast<const T *>(dims_tensor->GetData()) + i);
    if (dim < 0) {
      KERNEL_LOG_ERROR("Fill kernel: input dim [%llu] is negative, value=[%lld]", i, static_cast<int64_t>(dim));
      return KERNEL_STATUS_PARAM_INVALID;
    }
    // zero dim is different from empty dim.
    if (dim == 0) {
      KERNEL_LOG_INFO("Fill kernel: input dim [%llu] is zero", i);
    }
    dim_vec.emplace_back(dim);
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kFill, FillCpuKernel);
} // namespace aicpu
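// Minimal standalone sketch of the Eigen::TensorMap pattern FillGenerateCase
// relies on: map a raw buffer as a rank-1 tensor and fill it with one value.
// The kernel above additionally switches to the Eigen::Aligned map when
// AddrAlignedCheck allows it, which lets Eigen use aligned loads and stores.
// The function name is illustrative only.
inline void FillBufferExample(float *data, int64_t num_elements, float value) {
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> out(data, num_elements);
  out.setConstant(value);
}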
@@ -0,0 +1,43 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_FILL_H
#define AICPU_KERNELS_NORMALIZED_FILL_H

#include "cpu_ops_kernel.h"

namespace aicpu {
class FillCpuKernel : public CpuKernel {
 public:
  FillCpuKernel() = default;
  ~FillCpuKernel() override = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t GetDimsByType(const CpuKernelContext &ctx);
  /**
   * @brief calc dims from input dims tensor
   * @param dims_tensor input dims tensor
   * @param dims output shape dims
   * @return status if success
   */
  template <typename T>
  uint32_t CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dims);

  std::vector<int64_t> dims;
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_FILL_H
@@ -0,0 +1,293 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "masked_select.h"
#include <array>
#include <atomic>
#include <algorithm>
#include <vector>
#include "Eigen/Core"
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"

namespace {
constexpr uint32_t kMaskedSelectInputNum = 2;
constexpr uint32_t kMaskedSelectOutputNum = 1;
constexpr int64_t kParallelDataNums = 32 * 1000;
const char *const kMaskedSelect = "MaskedSelect";
struct OutputInfo {
  int64_t startIdx;
  int64_t len;
  OutputInfo() {
    startIdx = 0;
    len = 0;
  }
};

bool CompareFunc(const OutputInfo &a, const OutputInfo &b) { return a.startIdx < b.startIdx; }

// calculate the index stride of dataShape.
// dataShape:[m, 1, k] and broadcastShape:[j, m, n, k] --> index_stride:[0, k, 0, 1]
std::vector<int64_t> CalIndexStride(const std::vector<int64_t> &dataShape, const std::vector<int64_t> &broadcastShape) {
  int broadcastDimNum = broadcastShape.size();
  int dataDimNum = dataShape.size();
  int diffDimNum = broadcastDimNum - dataDimNum;
  std::vector<int64_t> indexStride(broadcastDimNum, 0);
  indexStride[broadcastDimNum - 1] = 1;
  for (int i = broadcastDimNum - 1; i > diffDimNum; i--) {
    // dataShape is right-aligned against broadcastShape, so shift by diffDimNum.
    indexStride[i - 1] = indexStride[i] * dataShape[i - diffDimNum];
  }
  for (int i = 0; i < dataDimNum; i++) {
    if (dataShape[i] == 1) {
      indexStride[i + diffDimNum] = 0;
    }
  }
  return indexStride;
}

// calculate the index stride of shape.
// shape:[m, n, k] --> index_stride:[n*k, k, 1]
std::vector<int64_t> CalIndexStride(const std::vector<int64_t> &shape) {
  int dimNum = shape.size();
  std::vector<int64_t> indexStride(dimNum, 1);
  for (int i = dimNum - 1; i > 0; i--) {
    indexStride[i - 1] = indexStride[i] * shape[i];
  }
  return indexStride;
}

// calculate the original index of data.
// shape:[7,8,9] indexStride:[72,9,1] and flatten_index:11 --> ori_index:[0,1,2]
bool CalIndexInfo(const std::vector<int64_t> &indexStride, int64_t flattenIndex, std::vector<int64_t> &oriIndex,
                  int dimNum) {
  for (int i = 0; i < dimNum - 1; i++) {
    if (indexStride[i] == 0) {
      return false;
    }
    oriIndex[i] = flattenIndex / indexStride[i];
    flattenIndex = flattenIndex % indexStride[i];
  }
  oriIndex[dimNum - 1] = flattenIndex;
  return true;
}

inline int64_t CalFlattenIndex(const std::vector<int64_t> &indexStride, const std::vector<int64_t> &oriIndex,
                               int dimNum) {
  int64_t flattenIndex = 0;
  for (int i = 0; i < dimNum; i++) {
    flattenIndex += indexStride[i] * oriIndex[i];
  }
  return flattenIndex;
}

void UpdateIndexByCarry(std::vector<int64_t> &preIndex, const std::vector<int64_t> &shape, int dimNum) {
  // shape:[7,3,10,17] and last index:[0,0,9,16] -> next index:[0,1,0,0]
  constexpr int64_t carryBit = 1;
  for (int i = dimNum - 1; i >= 0; i--) {
    preIndex[i] = preIndex[i] + carryBit;
    if (preIndex[i] < shape[i]) {
      break;
    } else {
      preIndex[i] = preIndex[i] - shape[i];
    }
  }
  return;
}
} // namespace

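// Illustrative sketch (not part of the kernel): how the helpers above cooperate
// for one broadcast case. With x of shape [3, 1, 4] broadcast to [2, 3, 5, 4],
// CalIndexStride yields [0, 4, 0, 1], so the broadcast coordinate (1, 2, 3, 2)
// maps to flat offset 2 * 4 + 2 = 10 inside x. Names here are local to this
// example.
inline void MaskedSelectStrideExample() {
  const std::vector<int64_t> data_shape{3, 1, 4};
  const std::vector<int64_t> broadcast_shape{2, 3, 5, 4};
  const std::vector<int64_t> stride = CalIndexStride(data_shape, broadcast_shape);  // {0, 4, 0, 1}
  const std::vector<int64_t> coord{1, 2, 3, 2};
  const int64_t offset = CalFlattenIndex(stride, coord, static_cast<int>(coord.size()));  // 10
  KERNEL_LOG_DEBUG("stride-based offset: [%lld]", offset);
}
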
namespace aicpu {
uint32_t MaskedSelectCpuKernel::Compute(const CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaskedSelectInputNum, kMaskedSelectOutputNum), "[%s] check params failed.",
                      kMaskedSelect);

  // choose compute function depend on dataType
  auto data_type0 = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
  auto data_type1 = static_cast<DataType>(ctx.Input(kSecondInputIndex)->GetDataType());
  auto data_type2 = static_cast<DataType>(ctx.Output(kFirstOutputIndex)->GetDataType());
  if (data_type1 != DT_BOOL) {
    KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
                     DTypeStr(data_type1).c_str());
    return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
  if (data_type0 != data_type2) {
    KERNEL_LOG_ERROR("[%s] Data type of x and y must be the same, but got data type [%s] and [%s].",
                     ctx.GetOpType().c_str(), DTypeStr(data_type0).c_str(), DTypeStr(data_type2).c_str());
    return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
  switch (data_type0) {
    case DT_FLOAT16:
      return MaskedSelectCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return MaskedSelectCompute<float>(ctx);
    case DT_DOUBLE:
      return MaskedSelectCompute<double>(ctx);
    case DT_INT8:
      return MaskedSelectCompute<int8_t>(ctx);
    case DT_INT16:
      return MaskedSelectCompute<int16_t>(ctx);
    case DT_INT32:
      return MaskedSelectCompute<int32_t>(ctx);
    case DT_INT64:
      return MaskedSelectCompute<int64_t>(ctx);
    case DT_UINT8:
      return MaskedSelectCompute<uint8_t>(ctx);
    case DT_UINT16:
      return MaskedSelectCompute<uint16_t>(ctx);
    case DT_UINT32:
      return MaskedSelectCompute<uint32_t>(ctx);
    case DT_UINT64:
      return MaskedSelectCompute<uint64_t>(ctx);
    case DT_BOOL:
      return MaskedSelectCompute<bool>(ctx);
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(data_type0).c_str());
      return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
}

template <typename T>
uint32_t MaskedSelectCpuKernel::ParallelCompute(const CpuKernelContext &ctx, const std::vector<int64_t> &inputShapeX,
                                                const std::vector<int64_t> &inputShapeMask,
                                                const std::vector<int64_t> &outputShape, int64_t dataNum) {
  T *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
  T *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  std::atomic<int> threadNum{0};
  std::atomic<bool> taskFlag(true);
  constexpr int queueLen = 100;
  std::array<OutputInfo, queueLen> outputIndexList;

  std::vector<int64_t> indexStrideX = CalIndexStride(inputShapeX, outputShape);
  std::vector<int64_t> indexStrideMask = CalIndexStride(inputShapeMask, outputShape);
  std::vector<int64_t> indexStrideOutput = CalIndexStride(outputShape);
  KERNEL_LOG_DEBUG("index stride of x[%s].", VectorToString(indexStrideX).c_str());
  KERNEL_LOG_DEBUG("index stride of mask[%s].", VectorToString(indexStrideMask).c_str());

  auto work = [=, &threadNum, &taskFlag, &outputIndexList](int64_t start, int64_t end) {
    int64_t cnt = 0;
    int dimNum = outputShape.size();
    std::vector<int64_t> indexValue(dimNum, 0);
    if (!CalIndexInfo(indexStrideOutput, start, indexValue, dimNum)) {
      taskFlag.store(false);
      KERNEL_LOG_ERROR("Invalid index stride, please check.");
      return;
    }

    for (int64_t i = start; i < end; ++i) {
      int64_t maskFlatIndex = CalFlattenIndex(indexStrideMask, indexValue, dimNum);
      int64_t xFlatIndex = CalFlattenIndex(indexStrideX, indexValue, dimNum);
      if (mask[maskFlatIndex]) {
        y[start + cnt] = x[xFlatIndex];
        cnt++;
      }
      UpdateIndexByCarry(indexValue, outputShape, dimNum);
    }
    int idx = threadNum.fetch_add(1, std::memory_order_relaxed);
    if (idx >= queueLen) {
      taskFlag.store(false);
      return;
    }
    outputIndexList[idx].startIdx = start;
    outputIndexList[idx].len = cnt;
    KERNEL_LOG_DEBUG("outputIndexList[%d] startIdx is [%lld], len is [%lld].", idx, outputIndexList[idx].startIdx,
                     outputIndexList[idx].len);
  };
  constexpr int perUnitSize = 1000;
  KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, dataNum, perUnitSize, work), "MaskedSelect calculate failed.");

  if (!taskFlag.load()) {
    KERNEL_LOG_ERROR("Invalid array.");
    return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }

  int validNum = threadNum.load();
  std::sort(outputIndexList.begin(), outputIndexList.begin() + validNum, CompareFunc);

  int validOffset = outputIndexList[0].len;
  int64_t copyLen = 0;
  int ret = 0;
  for (int i = 1; i < validNum; i++) {
    copyLen = outputIndexList[i].len;
    if (copyLen <= 0) {
      continue;
    }
    int64_t byteLen = copyLen * static_cast<int64_t>(sizeof(T));
    ret = memmove_s(y + validOffset, byteLen, y + outputIndexList[i].startIdx, byteLen);
    KERNEL_CHECK_FALSE((ret == EOK), KERNEL_STATUS_PARAM_INVALID, "Memmove failed, result = [%d].", ret);
    validOffset += copyLen;
  }
  ctx.Output(0)->GetTensorShape()->SetDimSizes({validOffset});
  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}

template <typename T>
uint32_t MaskedSelectCpuKernel::MaskedSelectCompute(const CpuKernelContext &ctx) {
  T *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  KERNEL_CHECK_NULLPTR(x, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[0] failed.",
                       kMaskedSelect);
  bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
  KERNEL_CHECK_NULLPTR(mask, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[1] failed.",
                       kMaskedSelect);
  T *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  KERNEL_CHECK_NULLPTR(y, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get output_data[0] failed.",
                       kMaskedSelect);

  auto input_shape_a = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto input_shape_b = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  if (IsScalar(input_shape_a) && IsScalar(input_shape_b)) {
    if (mask[0]) {
      y[0] = x[0];
      ctx.Output(0)->GetTensorShape()->SetDimSizes({1});
    } else {
      ctx.Output(0)->GetTensorShape()->SetDimSizes({0});
    }
    return static_cast<uint32_t>(KERNEL_STATUS_OK);
  }
  std::vector<int64_t> output_shape;
  auto ret = GetBroadcastShape(input_shape_a, input_shape_b, &output_shape);
  KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID),
                     "Shape of x and mask can't be broadcast.");
  int64_t tensor_size = 1;
  for (const int64_t &d : output_shape) {
    tensor_size *= d;
  }

  if (tensor_size >= kParallelDataNums) {
    ret = ParallelCompute<T>(ctx, input_shape_a, input_shape_b, output_shape, tensor_size);
    return ret;
  }

  int64_t j = 0;
  BroadcastIterator iter(input_shape_a, input_shape_b, &output_shape);
  iter.SetPos(0);
  for (int64_t i = 0; i < tensor_size; ++i) {
    if (mask[iter.GetInputPosB()]) {
      y[j++] = x[iter.GetInputPosA()];
    }
    iter.GenNextPos();
  }
  ctx.Output(0)->GetTensorShape()->SetDimSizes({j});
  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kMaskedSelect, MaskedSelectCpuKernel);
} // namespace aicpu
@@ -0,0 +1,41 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MASKED_SELECT_H_
#define AICPU_KERNELS_NORMALIZED_MASKED_SELECT_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MaskedSelectCpuKernel : public CpuKernel {
 public:
  ~MaskedSelectCpuKernel() = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  /**
   * @brief compute for all types
   * @param ctx cpu kernel context
   * @return status if success
   */
  template <typename T>
  uint32_t MaskedSelectCompute(const CpuKernelContext &ctx);
  template <typename T>
  uint32_t ParallelCompute(const CpuKernelContext &ctx, const std::vector<int64_t> &inputShapeX,
                           const std::vector<int64_t> &inputShapeMask, const std::vector<int64_t> &outputShape,
                           int64_t dataNum);
};
} // namespace aicpu
#endif
@@ -0,0 +1,121 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "masked_select_grad.h"

#include "Eigen/Core"
#include "securec.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"

namespace {
constexpr uint32_t kMaskedSelectGradInputNum = 3;
constexpr uint32_t kMaskedSelectGradOutputNum = 1;
const char *const kMaskedSelectGrad = "MaskedSelectGrad";
} // namespace

namespace aicpu {
uint32_t MaskedSelectGradCpuKernel::Compute(const CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaskedSelectGradInputNum, kMaskedSelectGradOutputNum),
                      "[%s] check params failed.", kMaskedSelectGrad);

  // choose compute function depend on dataType
  auto data_type0 = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
  auto data_type1 = static_cast<DataType>(ctx.Input(kSecondInputIndex)->GetDataType());
  auto data_type2 = static_cast<DataType>(ctx.Input(2)->GetDataType());
  if (data_type1 != DT_BOOL) {
    KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
                     DTypeStr(data_type1).c_str());
    return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
  if (data_type0 != data_type2) {
    KERNEL_LOG_ERROR("[%s] Data type of x and grad must be the same, but got data type [%s] and [%s].",
                     ctx.GetOpType().c_str(), DTypeStr(data_type0).c_str(), DTypeStr(data_type2).c_str());
    return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
  switch (data_type0) {
    case DT_FLOAT16:
      return MaskedSelectGradCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return MaskedSelectGradCompute<float>(ctx);
    case DT_DOUBLE:
      return MaskedSelectGradCompute<double>(ctx);
    case DT_INT8:
      return MaskedSelectGradCompute<int8_t>(ctx);
    case DT_INT16:
      return MaskedSelectGradCompute<int16_t>(ctx);
    case DT_INT32:
      return MaskedSelectGradCompute<int32_t>(ctx);
    case DT_INT64:
      return MaskedSelectGradCompute<int64_t>(ctx);
    case DT_UINT8:
      return MaskedSelectGradCompute<uint8_t>(ctx);
    case DT_UINT16:
      return MaskedSelectGradCompute<uint16_t>(ctx);
    case DT_UINT32:
      return MaskedSelectGradCompute<uint32_t>(ctx);
    case DT_UINT64:
      return MaskedSelectGradCompute<uint64_t>(ctx);
    case DT_BOOL:
      return MaskedSelectGradCompute<bool>(ctx);
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(data_type0).c_str());
      return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
  }
}

template <typename T>
uint32_t MaskedSelectGradCpuKernel::MaskedSelectGradCompute(const CpuKernelContext &ctx) {
  bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
  KERNEL_CHECK_NULLPTR(mask, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[1] failed.",
                       kMaskedSelectGrad);
  T *grad = reinterpret_cast<T *>(ctx.Input(2)->GetData());
  KERNEL_CHECK_NULLPTR(grad, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[2] failed.",
                       kMaskedSelectGrad);
  T *dx = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  KERNEL_CHECK_NULLPTR(dx, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get output_data[0] failed.",
                       kMaskedSelectGrad);

  auto input_shape_a = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto input_shape_b = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> output_shape;
  auto ret = GetBroadcastShape(input_shape_a, input_shape_b, &output_shape);
  KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Shape of x and mask can't be broadcast.");
  int64_t tensor_size = 1;
  for (const int64_t &d : output_shape) {
    tensor_size *= d;
  }
  const T NUM_ZERO = static_cast<T>(0);
  for (int k = 0; k < tensor_size; ++k) {
    dx[k] = NUM_ZERO;
  }
  int64_t j = 0;
  BroadcastIterator iter(input_shape_a, input_shape_b, &output_shape);
  iter.SetPos(0);
  for (int64_t i = 0; i < tensor_size; ++i) {
    if (mask[iter.GetInputPosB()]) {
      dx[iter.GetInputPosA()] += grad[j++];
    }
    iter.GenNextPos();
  }
  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kMaskedSelectGrad, MaskedSelectGradCpuKernel);
} // namespace aicpu
@@ -1,5 +1,5 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,16 +13,25 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MASKED_SELECT_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MASKED_SELECT_GRAD_H_

#ifndef AICPU_KERNELS_NORMALIZED_ACOS_H
#define AICPU_KERNELS_NORMALIZED_ACOS_H

#include "cpu_kernel/inc/cpu_ops_kernel.h"
#include "cpu_ops_kernel.h"

namespace aicpu {
class AcosCpuKernel final : public CpuKernel {
class MaskedSelectGradCpuKernel : public CpuKernel {
 public:
  std::uint32_t Compute(const CpuKernelContext &ctx) override;
  ~MaskedSelectGradCpuKernel() = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  /**
   * @brief compute for all types
   * @param ctx cpu kernel context
   * @return status if success
   */
  template <typename T>
  uint32_t MaskedSelectGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,137 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "nms_with_mask.h"
#include <numeric>
#include "Eigen/Core"
#include "utils/kernel_util.h"

namespace {
const int32_t kInputNum = 1;
const int32_t kOutputNum = 3;
const int kColNum5 = 5;
const int kColNum8 = 8;
const char *kNMSWithMask = "NMSWithMask";
} // namespace

namespace aicpu {
uint32_t NMSWithMaskCpuKernel::Compute(const CpuKernelContext &ctx) {
  // check param
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "NMSWithMask check input or output failed");
  AttrValue *iou_threshold = ctx.GetAttr("iou_threshold");
  KERNEL_CHECK_FALSE((iou_threshold != nullptr), KERNEL_STATUS_PARAM_INVALID, "Get attr [iou_threshold] failed.");
  iou_value_ = iou_threshold->GetFloat();

  Tensor *input_data = ctx.Input(0);
  auto data_type = input_data->GetDataType();
  KERNEL_CHECK_FALSE((data_type == DT_FLOAT || data_type == DT_FLOAT16), KERNEL_STATUS_PARAM_INVALID,
                     "Input[0] data type[%s] is unsupported", DTypeStr(data_type).c_str());
  auto input_shape = input_data->GetTensorShape()->GetDimSizes();
  num_input_ = input_shape[0];  // Get N values in [N, 5] data.
  box_size_ = input_shape[1];
  if (box_size_ != kColNum5 && box_size_ != kColNum8) {
    KERNEL_LOG_INFO("NMSWithMask the col number of input[0] must be [%d] or [%d], but got [%d]!", kColNum5, kColNum8,
                    box_size_);
    return KERNEL_STATUS_PARAM_INVALID;
  }
  uint32_t res;
  switch (data_type) {
    case DT_FLOAT16:
      res = DoCompute<Eigen::half>(ctx);
      break;
    case DT_FLOAT:
      res = DoCompute<float>(ctx);
      break;
    default:
      KERNEL_LOG_INFO("NMSWithMask input[0] only support type[DT_FLOAT16, DT_FLOAT], but got type[%s]",
                      DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return res;
}

template <typename T>
uint32_t NMSWithMaskCpuKernel::DoCompute(const CpuKernelContext &ctx) {
  auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(OUTPUT)->GetData());
  auto sel_idx = reinterpret_cast<int *>(ctx.Output(SEL_IDX)->GetData());
  auto sel_boxes = reinterpret_cast<bool *>(ctx.Output(SEL_BOXES)->GetData());
  std::fill(&sel_idx[0], &sel_idx[num_input_], 0);
  std::fill(&sel_boxes[0], &sel_boxes[num_input_], false);

  const int box_size = box_size_;
  const auto comp = [input, box_size](const size_t a, const size_t b) {
    const size_t index_a = a * box_size + 4;
    const size_t index_b = b * box_size + 4;
    if (input[index_b] == input[index_a]) {
      return a < b;
    }
    return input[index_b] < input[index_a];
  };
  std::vector<int> order(num_input_);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), comp);

  std::vector<T> areas(num_input_);
  for (int64_t i = 0; i < num_input_; i++) {
    areas[i] =
      (input[i * box_size_ + 2] - input[i * box_size_]) * (input[i * box_size_ + 3] - input[i * box_size_ + 1]);
  }

  int64_t num_to_keep = 0;
  for (int64_t _i = 0; _i < num_input_; _i++) {
    auto i = order[_i];
    if (sel_boxes[i] == 1) continue;
    sel_idx[num_to_keep++] = i;
    auto ix1 = input[i * box_size_];
    auto iy1 = input[i * box_size_ + 1];
    auto ix2 = input[i * box_size_ + 2];
    auto iy2 = input[i * box_size_ + 3];

    for (int64_t _j = _i + 1; _j < num_input_; _j++) {
      auto j = order[_j];
      if (sel_boxes[j] == 1) continue;
      auto xx1 = std::max(ix1, input[j * box_size_]);
      auto yy1 = std::max(iy1, input[j * box_size_ + 1]);
      auto xx2 = std::min(ix2, input[j * box_size_ + 2]);
      auto yy2 = std::min(iy2, input[j * box_size_ + 3]);

      auto w = std::max(static_cast<T>(0), xx2 - xx1);
      auto h = std::max(static_cast<T>(0), yy2 - yy1);
      auto inter = w * h;
      auto ovr = inter / (areas[i] + areas[j] - inter);
      if (static_cast<float>(ovr) > iou_value_) {
        sel_boxes[j] = 1;
      }
    }
  }

  for (int k = 0; k < num_input_; ++k) {
    for (int j = 0; j < box_size_; ++j) {
      if (k < num_to_keep) {
        output[k * kColNum5 + j] = input[sel_idx[k] * box_size_ + j];
        sel_boxes[k] = true;
      } else {
        output[k * kColNum5 + j] = static_cast<T>(0);
        sel_boxes[k] = false;
      }
    }
  }

  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNMSWithMask, NMSWithMaskCpuKernel);
} // namespace aicpu
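// Minimal sketch of the overlap test DoCompute applies to each surviving pair:
// intersection-over-union of two [x1, y1, x2, y2] boxes. Purely illustrative;
// the kernel above works on the flattened [N, box_size] buffer instead, and
// this helper name is local to the example.
inline float IouOfExample(const float a[4], const float b[4]) {
  const float w = std::max(0.0f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  const float h = std::max(0.0f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  const float inter = w * h;
  const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}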
@@ -0,0 +1,48 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_NMS_WITH_MASK_H
#define AICPU_KERNELS_NORMALIZED_NMS_WITH_MASK_H

#include <vector>
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"

namespace aicpu {
class NMSWithMaskCpuKernel : public CpuKernel {
 public:
  NMSWithMaskCpuKernel() = default;
  ~NMSWithMaskCpuKernel() override = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DoCompute(const CpuKernelContext &ctx);

  int num_input_{0};
  float iou_value_{0.0};
  size_t ceil_power_2{0};
  int box_size_ = 5;  // pre_defined box width
  enum output_list_ { OUTPUT, SEL_IDX, SEL_BOXES };
};
} // namespace aicpu
#endif
@@ -0,0 +1,265 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "reduce_sum.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kReduceSumInputNum = 2;
const uint32_t kReduceSumOutputNum = 1;
const char *const kReduceSum = "ReduceSum";
#define REDUCESUM_COMPUTE_CASE(DTYPE, TYPE, CTX)              \
  case (DTYPE): {                                             \
    uint32_t result = ReduceSumCompute<TYPE>(CTX);            \
    if (result != KERNEL_STATUS_OK) {                         \
      KERNEL_LOG_ERROR("ReduceSum kernel compute failed.");   \
      return result;                                          \
    }                                                         \
    break;                                                    \
  }
#define REDUCESUM_COMPUTE_CASE_COMPLEX(DTYPE, TYPE, IN_TYPE, CTX) \
  case (DTYPE): {                                                 \
    uint32_t result = ReduceSumCompute2<TYPE, IN_TYPE>(CTX);      \
    if (result != KERNEL_STATUS_OK) {                             \
      KERNEL_LOG_ERROR("ReduceSum kernel compute failed.");       \
      return result;                                              \
    }                                                             \
    break;                                                        \
  }
} // namespace

namespace aicpu {
uint32_t ReduceSumCpuKernel::Compute(const CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReduceSumInputNum, kReduceSumOutputNum), "[%s] check input and output failed.",
                      kReduceSum);
  KERNEL_HANDLE_ERROR(ReduceSumCheck(ctx), "[%s] check params failed.", kReduceSum);
  auto input_data_type = ctx.Input(0)->GetDataType();
  switch (input_data_type) {
    REDUCESUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    REDUCESUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
    REDUCESUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    REDUCESUM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    REDUCESUM_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    REDUCESUM_COMPUTE_CASE_COMPLEX(DT_COMPLEX64, std::complex<float>, float, ctx)
    REDUCESUM_COMPUTE_CASE_COMPLEX(DT_COMPLEX128, std::complex<double>, double, ctx)
    default:
      KERNEL_LOG_ERROR("ReduceSum kernel data type [%s] not supported.", DTypeStr(input_data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
uint32_t ReduceSumCpuKernel::ReduceSumCheck(const CpuKernelContext &ctx) const {
  KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "get input failed.");
  KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.");
  KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "get output failed.");
  if (ctx.Input(1)->GetData() != nullptr) {
    KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT32 || ctx.Input(1)->GetDataType() == DT_INT64),
                       KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not supported, axis data type is [%u].",
                       ctx.Input(1)->GetDataType());
  }
  return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReduceSumCpuKernel::ReduceSumCompute(const CpuKernelContext &ctx) {
  std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  if (input_shape.size() == 0) {
    output_data[0] = input_data[0];
    return KERNEL_STATUS_OK;
  }
  auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
  if (axes_data == nullptr) {
    int64_t data_num = ctx.Input(0)->NumElements();
    auto accumulator = static_cast<T>(0);
    for (int64_t i = 0; i < data_num; i++) {
      accumulator += input_data[i];
    }
    output_data[0] = accumulator;
    return KERNEL_STATUS_OK;
  }
  std::vector<int64_t> axes;
  KERNEL_HANDLE_ERROR(ReduceSumDedupAxes(ctx, axes), "ReduceSum deduplicate failed.");
  int64_t output_num = ctx.Output(0)->NumElements();
  uint32_t axes_idx = 0;
  KERNEL_HANDLE_ERROR(ReduceSumOneAxes<T>(input_data, input_shape, output_data, output_num, axes, axes_idx),
                      "Reduce sum compute failed.");
  return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReduceSumCpuKernel::ReduceSumOneAxes(const T *input_data, std::vector<int64_t> &input_shape, T *output_data,
                                              int64_t output_num, std::vector<int64_t> &axes, uint32_t &axes_idx) {
  if (axes_idx >= axes.size()) {
    for (int64_t i = 0; i < output_num; i++) {
      output_data[i] = input_data[i];
    }
    return KERNEL_STATUS_OK;
  }
  int64_t inner = 1, outer = 1, depth = 1;
  KERNEL_HANDLE_ERROR(ReduceSumParseAxes(input_shape, axes, axes_idx, inner, outer, depth), "parse axes failed.");
  auto output_data_temp = new (std::nothrow) T[inner * outer];
  KERNEL_CHECK_NULLPTR(output_data_temp, KERNEL_STATUS_INNER_ERROR, "allocate memory failed.");
  for (int64_t outer_index = 0; outer_index < outer; ++outer_index) {
    for (int64_t inner_index = 0; inner_index < inner; inner_index++) {
      auto accumulator = static_cast<T>(0);
      for (int64_t depth_index = 0; depth_index < depth; depth_index++) {
        int64_t index = outer_index;
        index += depth_index * outer;
        index += inner_index * depth * outer;
        accumulator += input_data[index];
      }
      int64_t output_index = outer_index;
      output_index += inner_index * outer;
      output_data_temp[output_index] = accumulator;
    }
  }
  uint32_t result = ReduceSumOneAxes<T>(output_data_temp, input_shape, output_data, output_num, axes, axes_idx);
  if (output_data_temp != nullptr) {
    delete[] output_data_temp;
  }
  return result;
}
|
||||
template <typename T, typename T2>
|
||||
uint32_t ReduceSumCpuKernel::ReduceSumCompute2(const CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
if (input_shape.size() == 0) {
|
||||
output_data[0] = std::complex<T2>(input_data[0].real(), input_data[0].imag());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
|
||||
int64_t input_num = ctx.Input(0)->NumElements();
|
||||
if (axes_data == nullptr) {
|
||||
auto accumulator_real = static_cast<T2>(0);
|
||||
auto accumulator_imag = static_cast<T2>(0);
|
||||
for (int64_t i = 0; i < input_num; i++) {
|
||||
accumulator_real += input_data[i].real();
|
||||
accumulator_imag += input_data[i].imag();
|
||||
}
|
||||
output_data[0] = std::complex<T2>(accumulator_real, accumulator_imag);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
std::vector<int64_t> axes;
|
||||
KERNEL_HANDLE_ERROR(ReduceSumDedupAxes(ctx, axes), "ReduceSum deduplicate failed.");
|
||||
int64_t output_num = ctx.Output(0)->NumElements();
|
||||
uint32_t axes_idx = 0;
|
||||
KERNEL_HANDLE_ERROR(
|
||||
(ReduceSumOneAxes2<T, T2>(input_data, input_num, input_shape, output_data, output_num, axes, axes_idx)),
|
||||
"Reduce sum compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T, typename T2>
|
||||
uint32_t ReduceSumCpuKernel::ReduceSumOneAxes2(const T *input_data, int64_t input_num, std::vector<int64_t> input_shape,
|
||||
T *output_data, int64_t output_num, std::vector<int64_t> &axes,
|
||||
uint32_t &axes_idx) {
|
||||
if (axes_idx >= axes.size()) {
|
||||
auto accumulator_real = static_cast<T2>(0);
|
||||
auto accumulator_imag = static_cast<T2>(0);
|
||||
for (int64_t i = 0; i < output_num; i++) {
|
||||
accumulator_real = input_data[i].real();
|
||||
accumulator_imag = input_data[i].imag();
|
||||
output_data[i] = std::complex<T2>(accumulator_real, accumulator_imag);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
int64_t inner = 1, outer = 1, depth = 1;
|
||||
KERNEL_HANDLE_ERROR(ReduceSumParseAxes(input_shape, axes, axes_idx, inner, outer, depth), "parse axes failed.");
|
||||
std::vector<T2> input_data_real(input_num);
|
||||
std::vector<T2> input_data_imag(input_num);
|
||||
for (int64_t i = 0; i < input_num; i++) {
|
||||
input_data_real[i] = input_data[i].real();
|
||||
input_data_imag[i] = input_data[i].imag();
|
||||
}
|
||||
int64_t output_num_temp = inner * outer;
|
||||
auto *output_data_temp = new (std::nothrow) T[output_num_temp];
|
||||
KERNEL_CHECK_NULLPTR(output_data_temp, KERNEL_STATUS_INNER_ERROR, "apply memory failed.");
|
||||
for (int64_t outer_index = 0; outer_index < outer; outer_index++) {
|
||||
for (int64_t inner_index = 0; inner_index < inner; inner_index++) {
|
||||
auto accumulator_real = static_cast<T2>(0);
|
||||
auto accumulator_imag = static_cast<T2>(0);
|
||||
for (int64_t depth_index = 0; depth_index < depth; depth_index++) {
|
||||
int64_t index = outer_index;
|
||||
index += inner_index * depth * outer;
|
||||
index += depth_index * outer;
|
||||
accumulator_real += input_data_real[index];
|
||||
accumulator_imag += input_data_imag[index];
|
||||
}
|
||||
int64_t output_index = outer_index;
|
||||
output_index += inner_index * outer;
|
||||
output_data_temp[output_index] = std::complex<T2>(accumulator_real, accumulator_imag);
|
||||
}
|
||||
}
|
||||
uint32_t result =
|
||||
ReduceSumOneAxes2<T, T2>(output_data_temp, output_num_temp, input_shape, output_data, output_num, axes, axes_idx);
|
||||
if (output_data_temp != nullptr) {
|
||||
delete[] output_data_temp;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
uint32_t ReduceSumCpuKernel::ReduceSumDedupAxes(const CpuKernelContext &ctx, std::vector<int64_t> &axes) {
|
||||
int32_t rank = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
|
||||
int64_t axes_num = ctx.Input(1)->NumElements();
|
||||
for (int64_t i = 0; i < axes_num; i++) {
|
||||
int32_t axis = axes_data[i];
|
||||
KERNEL_CHECK_FALSE((axis < rank) && (axis >= -rank), KERNEL_STATUS_PARAM_INVALID,
|
||||
"axes[%d] is out of input dims rank[%d]", axis, rank);
|
||||
if (axis < 0) {
|
||||
axis += rank;
|
||||
}
|
||||
axes.push_back(axis);
|
||||
}
|
||||
int64_t j = 1;
|
||||
while (j < axes_num) {
|
||||
std::vector<int64_t>::iterator iter = find(axes.begin(), axes.begin() + j, axes[j]);
|
||||
if (iter != axes.begin() + j) {
|
||||
axes.erase(iter);
|
||||
axes_num--;
|
||||
} else {
|
||||
j++;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
uint32_t ReduceSumCpuKernel::ReduceSumParseAxes(std::vector<int64_t> &input_shape, std::vector<int64_t> &axes,
|
||||
uint32_t &axes_idx, int64_t &inner, int64_t &outer,
|
||||
int64_t &depth) const {
|
||||
int64_t axis = axes[axes_idx];
|
||||
axes_idx++;
|
||||
int64_t rank = input_shape.size();
|
||||
for (int64_t i = 0; i < rank; i++) {
|
||||
if (i < axis) {
|
||||
inner *= input_shape[i];
|
||||
} else if (i > axis) {
|
||||
outer *= input_shape[i];
|
||||
} else {
|
||||
depth = input_shape[i];
|
||||
input_shape[i] = 1;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kReduceSum, ReduceSumCpuKernel);
|
||||
} // namespace aicpu
|
|
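Aside for readers of the ReduceSum kernel above: the following standalone sketch (illustration only, not part of the patch; the {2, 3, 4} shape and axis 1 are made-up values) shows how the inner/outer/depth split computed by ReduceSumParseAxes and the flat-index formula in ReduceSumOneAxes traverse a row-major buffer.

// Reduce axis 1 of a row-major {2, 3, 4} tensor with the kernel's index scheme.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t inner = 2, depth = 3, outer = 4;  // dims before, at, and after the reduced axis
  std::vector<float> in(inner * depth * outer);
  for (int64_t i = 0; i < static_cast<int64_t>(in.size()); ++i) {
    in[i] = static_cast<float>(i);
  }
  std::vector<float> out(inner * outer, 0.0f);
  for (int64_t inner_index = 0; inner_index < inner; ++inner_index) {
    for (int64_t outer_index = 0; outer_index < outer; ++outer_index) {
      float accumulator = 0.0f;
      for (int64_t depth_index = 0; depth_index < depth; ++depth_index) {
        // Same flat index as the kernel: outer_index + depth_index * outer + inner_index * depth * outer.
        accumulator += in[outer_index + depth_index * outer + inner_index * depth * outer];
      }
      out[outer_index + inner_index * outer] = accumulator;  // reduced axis removed -> shape {2, 4}
    }
  }
  std::cout << out[0] << std::endl;  // elements 0, 4, 8 -> prints 12
  return 0;
}

Dropping the reduced axis leaves inner * outer elements, which is why the kernel can recurse on the temporary buffer until every requested axis has been consumed.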
@@ -0,0 +1,53 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_REDUCE_SUM_H
#define AICPU_KERNELS_NORMALIZED_REDUCE_SUM_H

#include "cpu_ops_kernel.h"

namespace aicpu {
class ReduceSumCpuKernel : public CpuKernel {
 public:
  ReduceSumCpuKernel() = default;
  ~ReduceSumCpuKernel() override = default;

  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t ReduceSumCheck(const CpuKernelContext &ctx) const;

  template <typename T>
  uint32_t ReduceSumCompute(const CpuKernelContext &ctx);

  template <typename T>
  uint32_t ReduceSumOneAxes(const T *input_data, std::vector<int64_t> &input_shape, T *output_data, int64_t output_num,
                            std::vector<int64_t> &axes, uint32_t &axes_idx);

  template <typename T, typename T2>
  uint32_t ReduceSumCompute2(const CpuKernelContext &ctx);

  template <typename T, typename T2>
  uint32_t ReduceSumOneAxes2(const T *input_data, int64_t input_num, std::vector<int64_t> input_shape, T *output_data,
                             int64_t output_num, std::vector<int64_t> &axes, uint32_t &axes_idx);

  uint32_t ReduceSumDedupAxes(const CpuKernelContext &ctx, std::vector<int64_t> &axes);

  uint32_t ReduceSumParseAxes(std::vector<int64_t> &input_shape, std::vector<int64_t> &axes, uint32_t &axes_idx,
                              int64_t &inner, int64_t &outer, int64_t &depth) const;
};
} // namespace aicpu
#endif
@@ -0,0 +1,60 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sparse_group.h"

namespace aicpu {
void GroupIterable::IteratorStep::UpdateEndOfGroup() {
  ++next_loc_;
  const auto &ix_t = iter_->ix_matrix_;
  const int64_t N = ix_t.dimension(0);
  while (next_loc_ < N && iter_->GroupMatches(ix_t, loc_, next_loc_)) {
    ++next_loc_;
  }
}

bool GroupIterable::IteratorStep::operator!=(const IteratorStep &rhs) const { return (rhs.loc_ != loc_); }

bool GroupIterable::IteratorStep::operator==(const IteratorStep &rhs) const { return (rhs.loc_ == loc_); }

GroupIterable::IteratorStep &GroupIterable::IteratorStep::operator++() {  // prefix ++
  loc_ = next_loc_;
  UpdateEndOfGroup();
  return *this;
}

const GroupIterable::IteratorStep GroupIterable::IteratorStep::operator++(int) {  // postfix ++
  IteratorStep lhs(*this);
  ++(*this);
  return lhs;
}

Group GroupIterable::IteratorStep::operator*() const { return Group(iter_, loc_, next_loc_); }

std::vector<int64_t> Group::group() const {
  std::vector<int64_t> g;
  const auto &ix_t = iter_->ix_matrix_;
  for (const int64_t d : iter_->group_dims_) {
    g.push_back(ix_t(loc_, d));
  }
  return g;
}

TTypes<int64_t>::UnalignedConstMatrix Group::indices() const {
  return TTypes<int64_t>::UnalignedConstMatrix(&(iter_->ix_matrix_(loc_, 0)), next_loc_ - loc_, iter_->dims_);
}
} // namespace aicpu
@@ -0,0 +1,154 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_
#define CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_

#include <vector>
#include "eigen_tensor.h"

namespace aicpu {
class Group;  // Predeclare Group for GroupIterable.

// ///////////////
// GroupIterable
// ///////////////
//
// Returned when calling sparse_tensor.group({dim0, dim1, ...}).
//
// Please note: the sparse_tensor should already be ordered according
// to {dim0, dim1, ...}. Otherwise this iteration will return invalid groups.
//
// Allows grouping and iteration of the SparseTensor according to the
// subset of dimensions provided to the group call.
//
// The actual grouping dimensions are stored in the internal vector
// group_dims_. Iterators inside the iterable provide three methods:
//
// * group(): returns a vector with the current group dimension values.
// * indices(): a map of index, providing the indices in this group.
// * values(): a map of values, providing the values in this group.
//
// To iterate across GroupIterable, see examples in README.md
// (a standalone sketch of the grouping idea also follows this header).
//

class GroupIterable {
 public:
  using VarDimArray = std::vector<int64_t>;

  GroupIterable(Tensor *ix, Tensor *vals, int dims, const VarDimArray &group_dims)
      : ix_(ix),
        ix_matrix_(EigenTensor(ix, ix->GetData()).matrix<int64_t>()),
        vals_(vals),
        dims_(dims),
        group_dims_(group_dims.begin(), group_dims.end()) {}

  ~GroupIterable() {}

  class IteratorStep;

  IteratorStep begin() { return IteratorStep(this, 0); }

  IteratorStep at(int64_t loc) {
    if (!(loc >= 0 && loc <= static_cast<int64_t>(ix_->GetTensorShape()->GetDimSize(0)))) {
      KERNEL_LOG_WARN("loc should be in [0, %ld], but got: %ld", ix_->GetTensorShape()->GetDimSize(0), loc);
    }
    return IteratorStep(this, loc);
  }

  IteratorStep end() { return IteratorStep(this, ix_->GetTensorShape()->GetDimSize(0)); }

  template <typename TIX>
  inline bool GroupMatches(const TIX &ix, int64_t loc_a, int64_t loc_b) const {
    for (int64_t d : group_dims_) {
      if (ix(loc_a, d) != ix(loc_b, d)) {
        return false;
      }
    }
    return true;
  }

  class IteratorStep {
   public:
    IteratorStep(GroupIterable *iter, int64_t loc) : iter_(iter), loc_(loc), next_loc_(loc_) { UpdateEndOfGroup(); }

    ~IteratorStep() { iter_ = nullptr; }

    void UpdateEndOfGroup();

    bool operator!=(const IteratorStep &rhs) const;

    bool operator==(const IteratorStep &rhs) const;

    IteratorStep &operator++();

    const IteratorStep operator++(int);

    Group operator*() const;

    int64_t loc() const { return loc_; }

   private:
    GroupIterable *iter_;
    int64_t loc_;
    int64_t next_loc_;
  };

 private:
  friend class Group;
  Tensor *ix_;
  TTypes<int64_t>::Matrix ix_matrix_;
  Tensor *vals_;
  const int dims_;
  const std::vector<int64_t> group_dims_;
};

// This class is returned when dereferencing a GroupIterable iterator.
// It provides the methods group(), indices(), and values(), which
// provide access into the underlying SparseTensor.
class Group {
 public:
  Group(GroupIterable *iter, int64_t loc, int64_t next_loc) : iter_(iter), loc_(loc), next_loc_(next_loc) {}

  ~Group() { iter_ = nullptr; }

  std::vector<int64_t> group() const;

  TTypes<int64_t>::UnalignedConstMatrix indices() const;

  int64_t group_at(size_t index) const {
    const auto &ix_t = iter_->ix_matrix_;
    return ix_t(loc_, index);
  }

  template <typename T>
  typename TTypes<T>::UnalignedVec values() const {
    return typename TTypes<T>::UnalignedVec(&(EigenTensor(iter_->vals_, iter_->vals_->GetData()).vec<T>()(loc_)),
                                            next_loc_ - loc_);
  }

 private:
  GroupIterable *iter_;
  int64_t loc_;
  int64_t next_loc_;
};
} // namespace aicpu

#endif // CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_
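As an aside, the grouping performed by GroupIterable and IteratorStep can be pictured with a plain, self-contained sketch (illustration only; the index and value lists below are made-up data) that groups a sorted COO index list by its first dimension, mirroring what UpdateEndOfGroup and operator++ do.

// Group consecutive rows of a lexicographically sorted index list by dimension 0.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<std::array<int64_t, 2>> ix = {{0, 1}, {0, 3}, {2, 0}, {2, 2}, {2, 5}};  // sorted (row, col) indices
  std::vector<float> vals = {1.f, 2.f, 3.f, 4.f, 5.f};
  std::size_t loc = 0;
  while (loc < ix.size()) {
    std::size_t next_loc = loc + 1;
    while (next_loc < ix.size() && ix[next_loc][0] == ix[loc][0]) {
      ++next_loc;  // extend the group while dimension 0 keeps matching (GroupMatches)
    }
    float sum = 0.f;
    for (std::size_t i = loc; i < next_loc; ++i) {
      sum += vals[i];
    }
    std::cout << "group " << ix[loc][0] << " sums to " << sum << std::endl;
    loc = next_loc;  // step to the next group, like IteratorStep::operator++
  }
  return 0;
}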
@@ -0,0 +1,128 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sparse_tensor.h"
#include "cpu_types.h"

namespace aicpu {
uint32_t SparseTensor::CreateSparseTensor(Tensor *ix, Tensor *tensorvals, std::vector<int64_t> shape,
                                          std::vector<int64_t> order) {
  KERNEL_LOG_INFO("Start to execute CreateSparseTensor.");
  if (ix == nullptr || ix->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Ix is nullptr.");
    return KERNEL_STATUS_INNER_ERROR;
  }
  if (tensorvals == nullptr || tensorvals->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Vals is nullptr.");
    return KERNEL_STATUS_INNER_ERROR;
  }

  if (ix->GetTensorShape()->GetDims() > 2) {
    KERNEL_LOG_ERROR("Index tensor dims should be less than or equal to 2, got [%d].",
                     ix->GetTensorShape()->GetDims());
    return KERNEL_STATUS_INNER_ERROR;
  }

  int64_t dims = (ix->GetTensorShape()->GetDims() == 0) ? 1 : ix->GetTensorShape()->GetDimSize(0);
  int64_t vals_dim0 = (tensorvals->GetTensorShape()->GetDims() == 0) ? 1 : tensorvals->GetTensorShape()->GetDimSize(0);
  if (dims != vals_dim0) {
    KERNEL_LOG_ERROR("Ix dim_size_0 [%ld] != tensorvals dim_size_0 [%ld]", dims, vals_dim0);
    return KERNEL_STATUS_INNER_ERROR;
  }
  dims = ix->GetTensorShape()->GetDims() == 2 ? ix->GetTensorShape()->GetDimSize(1) : 1;
  int64_t orderSize = static_cast<int64_t>(order.size());
  int64_t shapeSize = static_cast<int64_t>(shape.size());
  if (orderSize != dims) {
    KERNEL_LOG_ERROR("orderSize [%ld] != dims [%ld]", orderSize, dims);
    return KERNEL_STATUS_INNER_ERROR;
  }
  if (shapeSize != dims) {
    KERNEL_LOG_ERROR("shapeSize [%ld] != dims [%ld]", shapeSize, dims);
    return KERNEL_STATUS_INNER_ERROR;
  }
  ix_ = std::make_shared<EigenTensor>(ix, ix->GetData());
  vals_ = std::make_shared<EigenTensor>(tensorvals, tensorvals->GetData());
  if (ix_ == nullptr || vals_ == nullptr) {
    KERNEL_LOG_ERROR("Indices or values create eigen tensor failed.");
    return KERNEL_STATUS_INNER_ERROR;
  }

  shape_.assign(shape.begin(), shape.end());
  order_.assign(order.begin(), order.end());
  dims_ = static_cast<int32_t>(dims);
  KERNEL_LOG_INFO("Execute CreateSparseTensor end.");
  return KERNEL_STATUS_OK;
}

uint32_t SparseTensor::IndicesValid(CpuKernelContext &ctx) const {
  if (std::any_of(order_.begin(), order_.end(), [](int64_t ord) { return ord < 0; })) {
    KERNEL_LOG_ERROR("Order was not provided.");
    return KERNEL_STATUS_INNER_ERROR;
  }
  if (ix_->GetTensor()->GetDataType() == DT_INT32) {
    if (EigenTensorIndicesValid<int32_t>(ctx) != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("Indices valid failed.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    if (EigenTensorIndicesValid<int64_t>(ctx) != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("Indices valid failed.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  return KERNEL_STATUS_OK;
}

bool SparseTensor::ValidateToDense(const Tensor *out) const {
  KERNEL_LOG_INFO("Start execute ValidateToDense.");
  if (out->GetDataType() != vals_->GetTensor()->GetDataType()) {
    KERNEL_LOG_ERROR("Output data type must match vals, got out [%d], vals [%d].", out->GetDataType(),
                     vals_->GetTensor()->GetDataType());
    return false;
  }
  if (out->GetTensorShape()->GetDims() != dims_) {
    KERNEL_LOG_ERROR("Output dims must match idx, got output dims [%d], idx dims [%d].",
                     out->GetTensorShape()->GetDims(), dims_);
    return false;
  }
  const auto out_shape = out->GetTensorShape();
  int32_t shapeSize = static_cast<int32_t>(shape_.size());
  if (shapeSize != out_shape->GetDims()) {
    KERNEL_LOG_ERROR("output dims must match shape dims, got output dim [%d], shape dim [%d].", out_shape->GetDims(),
                     shapeSize);
    return false;
  }
  for (size_t d = 0; d < shape_.size(); ++d) {
    if (shape_[d] > out_shape->GetDimSize(static_cast<int32_t>(d))) {
      KERNEL_LOG_ERROR(
        "Valid output shape dims value failed, index [%zu], shape value [%ld], "
        "greater than output shape value [%ld].",
        d, shape_[d], out_shape->GetDimSize(static_cast<int32_t>(d)));
      return false;
    }
  }
  KERNEL_LOG_INFO("Execute ValidateToDense end.");
  return true;
}

GroupIterable SparseTensor::group(const std::vector<int64_t> &group_ix) const {
  if (group_ix.size() > static_cast<size_t>(dims_)) {
    KERNEL_LOG_WARN("Group_ix.size: %zu > dims_: %d", group_ix.size(), dims_);
  }
  return GroupIterable(const_cast<Tensor *>(ix_->GetTensor()), const_cast<Tensor *>(vals_->GetTensor()), dims_,
                       group_ix);
}
} // namespace aicpu
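For context on what IndicesValid enforces, here is a self-contained sketch (illustration only; the shape, order and index rows are made-up values) of the bounds, ordering, and duplicate checks that EigenTensorIndicesValid runs row by row over the index matrix.

// Validate a small 2-D index list: in bounds, strictly increasing, no repeats.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> shape = {3, 4};   // dense shape of the sparse tensor
  const std::vector<int64_t> order = {0, 1};   // lexicographic comparison order
  std::vector<std::vector<int64_t>> ix = {{0, 1}, {0, 1}, {2, 3}};  // second row repeats the first
  for (std::size_t n = 1; n < ix.size(); ++n) {
    bool valid = true, different = false, increasing = true;
    for (std::size_t di = 0; di < shape.size(); ++di) {
      if (ix[n][di] < 0 || ix[n][di] >= shape[di]) valid = false;
      int64_t diff = ix[n][order[di]] - ix[n - 1][order[di]];
      if (diff > 0) different = true;
      if (!different && diff < 0) increasing = false;
    }
    if (!valid) std::cout << "row " << n << " is out of bounds\n";
    if (!increasing) std::cout << "row " << n << " is out of order\n";
    if (!different) std::cout << "row " << n << " repeats the previous row\n";  // printed for row 1
  }
  return 0;
}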
@@ -0,0 +1,296 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_SPARSETENSOR_H
#define AICPU_SPARSETENSOR_H

#include <algorithm>
#include <memory>

#include "cpu_tensor.h"
#include "eigen_tensor.h"
#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "sparse_group.h"
#include "status.h"

namespace aicpu {
template <typename T>
const T SubtleMustCopy(const T &x) {
  auto *to_x = reinterpret_cast<const volatile T *>(&x);
  return *to_x;
}
} // namespace aicpu

namespace aicpu {
class SparseTensor {
 public:
  SparseTensor() : dims_(0) {}
  ~SparseTensor() = default;

  /*
   * create sparse tensor
   * @param ix: index tensor
   * @param tensorvals: tensorvals tensor
   * @param shape: shape vec
   * @param order: order vec
   * @return uint32_t: 0->success other->failed
   */
  uint32_t CreateSparseTensor(Tensor *ix, Tensor *tensorvals, std::vector<int64_t> shape, std::vector<int64_t> order);

  /*
   * sparse indices valid
   * @return uint32_t: 0->success other->failed
   */
  uint32_t IndicesValid(CpuKernelContext &ctx) const;

  /*
   * group sparse tensor
   * @return GroupIterable
   */
  GroupIterable group(const std::vector<int64_t> &group_ix) const;
  /*
   * sparse eigen tensor indices valid
   * @return uint32_t: 0->success other->failed
   */
  template <typename T>
  uint32_t EigenTensorIndicesValidCheck(int64_t dims_size) const {
    const auto ix_t = ix_->matrix<T>();
    for (int64_t n = 1; n < dims_size; ++n) {
      bool valid = true;
      bool different = false;
      bool increasing = true;
      for (int32_t di = 0; di < dims_; ++di) {
        if (ix_t(n, di) < 0 || ix_t(n, di) >= shape_[di]) {
          valid = false;
        }
        int64_t diff = ix_t(n, order_[di]) - ix_t(n - 1, order_[di]);
        if (diff > 0) {
          different = true;
        }
        if (!different && diff < 0) {
          increasing = false;
        }
      }
      if (!valid) {
        KERNEL_LOG_ERROR("Indices are out of bounds, index=%lld.", n);
        return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
      }
      if (!increasing) {
        KERNEL_LOG_ERROR("Indices are out of order, index=%lld.", n);
        return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
      }
      if (!different) {
        KERNEL_LOG_ERROR("Indices are repeated, index=%lld.", n);
        return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
      }
    }
    return static_cast<uint32_t>(KERNEL_STATUS_OK);
  }
  /*
   * sparse eigen tensor indices valid
   * @return uint32_t: 0->success other->failed
   */
  template <typename T>
  uint32_t EigenTensorIndicesValidParaCheck(const CpuKernelContext &ctx, int64_t dims_size) const {
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    uint32_t result = static_cast<uint32_t>(KERNEL_STATUS_OK);
    (void)aicpu::CpuKernelUtils::ParallelFor(
      ctx, dims_size, dims_size / max_core_num, [&](std::int64_t begin, std::int64_t end) {
        int64_t start = begin;
        if (begin == 0) {
          start = begin + 1;
        }
        const auto ix_t = ix_->matrix<T>();
        for (int64_t n = start; n < end; ++n) {
          bool valid = true;
          bool different = false;
          bool increasing = true;
          for (int32_t di = 0; di < dims_; ++di) {
            if (ix_t(n, di) < 0 || ix_t(n, di) >= shape_[di]) {
              valid = false;
            }
            int64_t diff = ix_t(n, order_[di]) - ix_t(n - 1, order_[di]);
            if (diff > 0) {
              different = true;
            }
            if (!different && diff < 0) {
              increasing = false;
            }
          }
          if (!valid) {
            KERNEL_LOG_ERROR("Indices are out of bounds, index=%lld.", n);
            result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
            return;
          }
          if (!increasing) {
            KERNEL_LOG_ERROR("Indices are out of order, index=%lld.", n);
            result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
            return;
          }
          if (!different) {
            KERNEL_LOG_ERROR("Indices are repeated, index=%lld.", n);
            result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
            return;
          }
        }
      });
    return result;
  }
  /*
   * sparse eigen tensor indices valid
   * @return uint32_t: 0->success other->failed
   */
  template <typename T>
  uint32_t EigenTensorIndicesValid(const CpuKernelContext &ctx) const {
    const auto ix_t = ix_->matrix<T>();
    int64_t dims_size =
      (ix_->GetTensor()->GetTensorShape()->GetDims() == 0) ? 1 : ix_->GetTensor()->GetTensorShape()->GetDimSize(0);
    if (dims_size > 0) {
      for (int32_t di = 0; di < dims_; ++di) {
        if ((ix_t(0, di) < 0) || (ix_t(0, di) >= shape_[di])) {
          KERNEL_LOG_ERROR("Indices are out of bounds, index=0.");
          return KERNEL_STATUS_PARAM_INVALID;
        }
      }
    }
    const int64_t paralled_data_size = 16 * 1024;
    if (dims_size < paralled_data_size) {
      return EigenTensorIndicesValidCheck<T>(dims_size);
    } else {
      return EigenTensorIndicesValidParaCheck<T>(ctx, dims_size);
    }
  }

  /*
   * validate sparse to dense
   * @param output: output tensor
   * @return bool: true->success false->failed
   */
  bool ValidateToDense(const Tensor *out) const;

  /*
   * sparse tensor to dense tensor
   * @param output: output tensor
   * @return uint32_t: 0->success other->failed
   */
  template <typename IndiceT, typename ValueT>
  uint32_t ToDenseParallel(const CpuKernelContext &ctx, Tensor *output) {
    EigenTensor outputET(output, output->GetData());
    auto output_t = outputET.flat<ValueT>();
    auto ix_t = ix_->matrix<IndiceT>();
    std::vector<int64_t> strides(dims_);
    const auto &out_shape = output->GetTensorShape();
    // Row-major strides of the dense output: strides[last] = 1, strides[d] = strides[d + 1] * dim[d + 1].
    if (dims_ > 0) {
      strides[dims_ - 1] = 1;
    }
    for (int32_t d = dims_ - 2; d >= 0; --d) {
      strides[d] = strides[d + 1] * out_shape->GetDimSize(d + 1);
    }
    auto vals_t = vals_->vec<ValueT>();
    int64_t vals_size = vals_t.dimension(0);
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    uint32_t result = static_cast<uint32_t>(KERNEL_STATUS_OK);
    auto parallel_proc = [&](std::int64_t begin, std::int64_t end) {
      for (int64_t n = begin; n < end; ++n) {
        bool invalid_dims = false;
        int64_t ix = 0;
        for (int d = 0; d < dims_; ++d) {
          const int64_t ix_n_d = ix_t(n, d);
          if (ix_n_d > out_shape->GetDimSize(d)) {
            invalid_dims = true;
          }
          ix += strides[d] * ix_n_d;
        }
        if (invalid_dims) {
          result = static_cast<uint32_t>(KERNEL_STATUS_INNER_ERROR);
          KERNEL_LOG_ERROR("Sparse to dense got invalid dims.");
          return;
        }
        output_t(ix) = vals_t(n);
      }
      return;
    };
    KERNEL_HANDLE_ERROR(aicpu::CpuKernelUtils::ParallelFor(ctx, vals_size, vals_size / max_core_num, parallel_proc),
                        "SparseToDense Compute failed.");
    return result;
  }

  /*
   * sparse tensor to dense tensor
   * @param output: output tensor
   * @return uint32_t: 0->success other->failed
   */
  template <typename IndiceT, typename ValueT>
  uint32_t ToDense(const CpuKernelContext &ctx, Tensor *output) {
    KERNEL_LOG_INFO("Start to execute ToDense.");
    if (output == nullptr || output->GetData() == nullptr) {
      KERNEL_LOG_ERROR("Output tensor is nullptr.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    if (!ValidateToDense(output)) {
      KERNEL_LOG_ERROR("Validate to dense param failed.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    auto vals_t = vals_->vec<ValueT>();
    int64_t vals_size = vals_t.dimension(0);
    const int64_t paralled_data_size = 16 * 1024;
    if (vals_size >= paralled_data_size) {
      return ToDenseParallel<IndiceT, ValueT>(ctx, output);
    }
    EigenTensor outputET(output, output->GetData());
    auto output_t = outputET.flat<ValueT>();
    auto ix_t = ix_->matrix<IndiceT>();
    std::vector<int64_t> strides(dims_);
    const auto &out_shape = output->GetTensorShape();
    if (dims_ > 0) {
      strides[dims_ - 1] = 1;
    }
    for (int32_t d = dims_ - 2; d >= 0; --d) {
      strides[d] = strides[d + 1] * out_shape->GetDimSize(d + 1);
    }
    for (int64_t n = 0; n < vals_size; ++n) {
      bool invalid_dims = false;
      int64_t ix = 0;
      for (int d = 0; d < dims_; ++d) {
        const int64_t ix_n_d = ix_t(n, d);
        if (ix_n_d > out_shape->GetDimSize(d)) {
          invalid_dims = true;
        }
        ix += strides[d] * ix_n_d;
      }
      if (invalid_dims) {
        KERNEL_LOG_ERROR("Sparse to dense got invalid dims.");
        return KERNEL_STATUS_INNER_ERROR;
      }
      output_t(ix) = vals_t(n);
    }
    return KERNEL_STATUS_OK;
  }

 private:
  std::shared_ptr<EigenTensor> ix_;
  std::shared_ptr<EigenTensor> vals_;
  std::vector<int64_t> shape_;
  std::vector<int64_t> order_;
  int32_t dims_;
};
} // namespace aicpu

#endif // AICPU_SPARSETENSOR_H
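As a small aside, the stride arithmetic that ToDense and ToDenseParallel rely on can be reproduced in isolation. The sketch below (illustration only; the indices, values, and 2x3 shape are made-up data) builds row-major strides and scatters the values into a dense buffer the same way.

// Scatter sparse (index, value) pairs into a dense row-major buffer via strides.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> shape = {2, 3};
  std::vector<std::vector<int64_t>> ix = {{0, 1}, {1, 2}};
  std::vector<float> vals = {5.f, 7.f};
  // Row-major strides: strides[last] = 1, strides[d] = strides[d + 1] * shape[d + 1].
  std::vector<int64_t> strides(shape.size(), 1);
  for (int64_t d = static_cast<int64_t>(shape.size()) - 2; d >= 0; --d) {
    strides[d] = strides[d + 1] * shape[d + 1];
  }
  std::vector<float> dense(shape[0] * shape[1], 0.f);
  for (std::size_t n = 0; n < vals.size(); ++n) {
    int64_t flat = 0;
    for (std::size_t d = 0; d < shape.size(); ++d) {
      flat += strides[d] * ix[n][d];  // same accumulation as the kernel's flat index
    }
    dense[flat] = vals[n];
  }
  std::cout << dense[1] << " " << dense[5] << std::endl;  // prints "5 7"
  return 0;
}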
@@ -48,14 +48,18 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
                                                          kSliceGradOpName,
                                                          kRandomShuffleOpName,
                                                          kRangeOpName};
  static const std::set<std::string> kMigrateAicpuKernelOps = {
    mindspore::kACosOpName,
    mindspore::kLogMatrixDeterminantOpName,
    mindspore::kAdaptiveAvgPool2dOpName,
    mindspore::kAdaptiveAvgPool2dGradOpName,
    mindspore::kMedianOpName,
    mindspore::kMedianGradOpName,
  };
  static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kACosOpName,
                                                               mindspore::kAdaptiveAvgPool2dOpName,
                                                               mindspore::kAdaptiveAvgPool2dGradOpName,
                                                               mindspore::kCacheSwapTableOpName,
                                                               mindspore::kFillOpName,
                                                               mindspore::kLogMatrixDeterminantOpName,
                                                               mindspore::kMaskedSelectOpName,
                                                               mindspore::kMaskedSelectGradOpName,
                                                               mindspore::kMedianOpName,
                                                               mindspore::kMedianGradOpName,
                                                               mindspore::kNMSWithMaskOpName,
                                                               mindspore::kReduceSumOpName};
  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";