merge canndev code to mindspore

This commit is contained in:
shen_jingxing 2023-02-07 14:35:40 +08:00
parent aacab0ca60
commit 0dd977ccef
50 changed files with 6861 additions and 0 deletions

View File

@ -97,6 +97,9 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "syntaxError"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unusedVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"

View File

@ -134,5 +134,6 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/operators"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/indentation_namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/blank_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/line_length"

View File

@ -344,6 +344,15 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool.cc:aicpu::FractionalAvgPoolCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/densetosparsesetoperation.cc:aicpu::DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpSpecialCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_area.cc:aicpu::ResizeAreaCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/trace_grad.cc:aicpu::TraceGradCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sspaddmm.cc:aicpu::SspaddmmCpuKernel::ValidParam
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_sum.cc:aicpu::SegmentSumCpuKernel::SegmentSumCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_min.cc:aicpu::SegmentMinCpuKernel::SegmentMinCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_mat_mul.cc:aicpu::SparseTensorDenseMatMulCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_mat_mul.cc:aicpu::SparseTensorDenseMatMulCpuKernel::regular_calculate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sspaddmm.cc:aicpu::SspaddmmCpuKernel::ScalarSparseMul
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_add.cc:aicpu::SparseTensorDenseAddCpuKernel::ValidateInputs
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpBcastCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpSpecialComputeComplex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpBcastComputeComplex

View File

@ -0,0 +1,209 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "next_after.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kNextAfter = "NextAfter";
// when the input data size exceeds kParallelDataNum, use the parallel compute path
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define NEXTAFTER_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = NextAfterCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("NextAfter kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t NextAfterCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "NextAfter check input and output number failed.");
KERNEL_HANDLE_ERROR(NextAfterParamCheck(ctx), "NextAfter check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
NEXTAFTER_COMPUTE_CASE(DT_FLOAT, float, ctx)
NEXTAFTER_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("NextAfter kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t NextAfterCpuKernel::NextAfterParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"NextAfterCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/*
SpecialCompute is used in the following situations:
1. input1 and input2 have the same shape
2. input1 is a 1-D tensor with a single element, or a scalar
3. input2 is a 1-D tensor with a single element, or a scalar
Any other shape combination is handled by the broadcast compute path.
*/
template <typename T>
void NextAfterCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*(input1 + i), *(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*input1, *(input2 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*(input1 + i), *input2);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t NextAfterCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_nextafter = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_nextafter),
"NextAfter Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NextAfterCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_nextafter = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = nextafter(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_nextafter),
"NextAfter Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = nextafter(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NextAfterCpuKernel::NextAfterCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (isNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNextAfter, NextAfterCpuKernel);
} // namespace aicpu
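
As a point of reference for the BcastShapeType dispatch in SpecialCompute above, the following is a minimal standalone sketch built only on std::nextafter from <cmath>; ShapeCase and NextAfterDispatch are illustrative names rather than part of the kernel API, and the real kernel additionally shards the loop with ParallelFor.

// Standalone illustration of the element-wise NextAfter dispatch above.
// ShapeCase and NextAfterDispatch are hypothetical; only std::nextafter is real.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

enum class ShapeCase { kSameShape, kXOneElement, kYOneElement };

std::vector<double> NextAfterDispatch(const std::vector<double> &x, const std::vector<double> &y) {
  ShapeCase c = x.size() == y.size() ? ShapeCase::kSameShape
                                     : (x.size() == 1 ? ShapeCase::kXOneElement : ShapeCase::kYOneElement);
  size_t n = std::max(x.size(), y.size());
  std::vector<double> out(n);
  for (size_t i = 0; i < n; ++i) {
    double a = (c == ShapeCase::kXOneElement) ? x[0] : x[i];
    double b = (c == ShapeCase::kYOneElement) ? y[0] : y[i];
    out[i] = std::nextafter(a, b);  // next representable double after a, in the direction of b
  }
  return out;
}

int main() {
  // Neighbors of 1.0, 3.0 and -0.0 in the direction of 2.0 (the Y_ONE_ELEMENT case).
  for (double v : NextAfterDispatch({1.0, 3.0, -0.0}, {2.0})) std::printf("%.17g\n", v);
  return 0;
}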

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NEXTAFTER_H_
#define AICPU_KERNELS_NORMALIZED_NEXTAFTER_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class NextAfterCpuKernel : public CpuKernel {
public:
NextAfterCpuKernel() = default;
~NextAfterCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NextAfterParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t NextAfterCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,127 @@
/*
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "non_deterministic_ints.h"
#include <cmath>
#include <ctime>
#include <iostream>
#include <random>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const char *kNonDeterministicInts = "NonDeterministicInts";
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t kInputDims = 1;
const uint32_t kInputSizes = 2;
constexpr int64_t kParallelDataNums = 7 * 1024;
} // namespace
namespace aicpu {
template <typename T1, typename T2>
uint32_t NonDeterministicIntsCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto input_nums = input->NumElements();
auto input_data = reinterpret_cast<T2 *>(input->GetData());
auto output_data = reinterpret_cast<T1 *>(output->GetData());
auto output_nums = ctx.Output(0)->NumElements();
auto max_data = std::numeric_limits<T1>::max();
std::vector<int64_t> out_put_dims;
for (auto i = 0; i < input_nums; i++) {
if (*(input_data + i) <= 0) {
KERNEL_LOG_ERROR("Shape elements must be > 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
out_put_dims.push_back(input_data[i]);
}
if (output_nums <= kParallelDataNums) {
std::default_random_engine seed(time(0));
std::uniform_int_distribution<T1> u(-max_data, max_data);
for (auto j = 0; j < output_nums; j++) {
*(output_data + j) = u(seed);
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_nums) {
max_core_num = output_nums;
}
auto shard_non_deterministic_ints = [&](int64_t start, int64_t end) {
std::default_random_engine seed(time(0));
std::uniform_int_distribution<T1> u(-max_data, max_data);
for (auto j = start; j < end; j++) {
*(output_data + j) = u(seed);
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_nums, output_nums / max_core_num, shard_non_deterministic_ints),
"NonDeterministicInts compute failed.");
}
output->GetTensorShape()->SetDimSizes(out_put_dims);
return KERNEL_STATUS_OK;
}
uint32_t NonDeterministicIntsCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
// the non null of input and output has been verified in NormalCheck
Tensor *input = ctx.Input(0);
auto input_data_nums = input->NumElements();
auto data_type = input->GetDataType();
KERNEL_CHECK_FALSE((data_type == DT_INT32 || data_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
" Input type must be one of int32 or int64.");
KERNEL_CHECK_FALSE((input_data_nums >= kInputSizes), KERNEL_STATUS_PARAM_INVALID, "Input data elements must >= 2.");
KERNEL_CHECK_FALSE((input->GetTensorShape()->GetDimSizes().size() == kInputDims), KERNEL_STATUS_PARAM_INVALID,
"Input tensor must be a 1-D tensor.");
return KERNEL_STATUS_OK;
}
uint32_t NonDeterministicIntsCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check params failed.", kNonDeterministicInts);
KERNEL_HANDLE_ERROR(DataAndTypeCheck(ctx), "Data or type check failed.");
auto output_data_type = ctx.Output(0)->GetDataType();
auto input_data_type = ctx.Input(0)->GetDataType();
uint32_t ret = KERNEL_STATUS_OK;
switch (output_data_type) {
case DT_INT32: {
if (input_data_type == DT_INT32) {
ret = DoCompute<int32_t, int32_t>(ctx);
} else {
ret = DoCompute<int32_t, int64_t>(ctx);
}
break;
}
case DT_INT64: {
if (input_data_type == DT_INT32) {
ret = DoCompute<int64_t, int32_t>(ctx);
} else {
ret = DoCompute<int64_t, int64_t>(ctx);
}
break;
}
default: {
KERNEL_LOG_ERROR("NonDeterministicInts kernel data type [%s] not support.", DTypeStr(output_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), ret, "Compute failed.");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNonDeterministicInts, NonDeterministicIntsCpuKernel);
} // namespace aicpu
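
For intuition, the kernel above reads a 1-D shape tensor, validates it, and fills the output with uniformly distributed integers. The following is a hedged standalone sketch of that flow: NonDeterministicIntsSketch is an illustrative name, the AICPU Tensor plumbing and the ParallelFor sharding are omitted, and the sketch seeds from std::random_device rather than the time-based seed used above.

// Derive an element count from a shape vector and fill a buffer with uniform
// random int32 values, mirroring DoCompute above in plain standard C++.
#include <cstdint>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>

std::vector<int32_t> NonDeterministicIntsSketch(const std::vector<int64_t> &shape) {
  int64_t num = 1;
  for (int64_t d : shape) {
    if (d <= 0) throw std::invalid_argument("Shape elements must be > 0.");  // same check as DoCompute
    num *= d;
  }
  std::random_device rd;  // non-deterministic seed source
  std::mt19937_64 engine(rd());
  const int32_t bound = std::numeric_limits<int32_t>::max();
  std::uniform_int_distribution<int32_t> dist(-bound, bound);
  std::vector<int32_t> out(static_cast<size_t>(num));
  for (auto &v : out) v = dist(engine);
  return out;
}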

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NONDETERMINISTICINTS_H_
#define AICPU_KERNELS_NORMALIZED_NONDETERMINISTICINTS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class NonDeterministicIntsCpuKernel : public CpuKernel {
public:
NonDeterministicIntsCpuKernel() = default;
~NonDeterministicIntsCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,208 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pow.h"
#include <math.h>
#include <stdint.h>
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "securec.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kPow = "Pow";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define POW_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = PowCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Pow kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PowCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Pow check input and output number failed.");
KERNEL_HANDLE_ERROR(PowParamCheck(ctx), "Pow check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
POW_COMPUTE_CASE(DT_INT8, int8_t, ctx)
POW_COMPUTE_CASE(DT_INT32, int32_t, ctx)
POW_COMPUTE_CASE(DT_INT64, int64_t, ctx)
POW_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
POW_COMPUTE_CASE(DT_FLOAT, float, ctx)
POW_COMPUTE_CASE(DT_DOUBLE, double, ctx)
POW_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
POW_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Pow kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t PowCpuKernel::PowParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
auto input0_Shape = input_0->GetTensorShape();
auto input1_Shape = input_1->GetTensorShape();
KERNEL_CHECK_NULLPTR(input0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input0_Shape failed.")
KERNEL_CHECK_NULLPTR(input1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input1_Shape failed.")
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"PowCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
void PowCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, T *input1, T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1 + i), *(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1), *(input2 + i));
}
break;
default:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1 + i), *(input2));
}
break;
}
}
template <typename T>
uint32_t PowCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_pow = [&](size_t start, size_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_pow),
"Pow Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PowCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_pow = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
auto input1 = in0 + bcast.GetBroadcastXIndex(i); // i-th value of input0
auto input2 = in1 + bcast.GetBroadcastYIndex(i); // i-th value of input1
*(out + i) = pow((*input1), (*input2));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_pow),
"Pow Compute failed.");
} else {
for (int64_t i = 0; i < data_num; i++) {
auto input1 = in0 + bcast.GetBroadcastXIndex(i); // i-th value of input0
auto input2 = in1 + bcast.GetBroadcastYIndex(i); // i-th value of input1
*(out + i) = pow((*input1), (*input2));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PowCpuKernel::PowCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (isNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPow, PowCpuKernel);
} // namespace aicpu
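
The BcastCompute path above leans on Bcast::GetBroadcastXIndex / GetBroadcastYIndex to map a flat output index back into each (possibly broadcast) input. A minimal way to picture that mapping, independent of the Bcast utility, uses a hypothetical BroadcastIndex helper and assumes the shapes are already right-aligned and padded to the same rank.

// Hypothetical illustration of flat-index broadcasting as used by BcastCompute.
// The input index is obtained by clamping the coordinate to 0 along every
// dimension where that input has size 1.
#include <cstdint>
#include <vector>

int64_t BroadcastIndex(int64_t flat_out, const std::vector<int64_t> &out_shape,
                       const std::vector<int64_t> &in_shape) {
  // Shapes are assumed padded to the same rank.
  int64_t in_index = 0;
  int64_t in_stride = 1;
  for (int i = static_cast<int>(out_shape.size()) - 1; i >= 0; --i) {
    int64_t coord = flat_out % out_shape[i];
    flat_out /= out_shape[i];
    int64_t in_coord = (in_shape[i] == 1) ? 0 : coord;  // broadcast dimension collapses to 0
    in_index += in_coord * in_stride;
    in_stride *= in_shape[i];
  }
  return in_index;
}
// Example: out_shape {2, 3}, in_shape {1, 3} -> flat output index 4 (row 1, col 1) maps to input index 1.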

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_POW_H_
#define AICPU_KERNELS_NORMALIZED_POW_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class PowCpuKernel : public CpuKernel {
public:
PowCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t PowParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t PowCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,100 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "real.h"
#include "Eigen/Eigen"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kReal = "Real";
constexpr int64_t kFloatDataNums = 8 * 128 * 1024;
constexpr int64_t kDoubleDataNums = 16 * 128 * 1024;
#define Real_COMPUTE_CASE(IN_DTYPE, IN_TYPE, OUT_DTYPE, CTX) \
case (IN_DTYPE): { \
switch (OUT_DTYPE) { \
case (DT_FLOAT): { \
uint32_t result = RealCompute<IN_TYPE, float>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Real kernel compute failed."); \
return result; \
} \
break; \
} \
case (DT_DOUBLE): { \
uint32_t result = RealCompute<IN_TYPE, double>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Real kernel compute failed."); \
return result; \
} \
break; \
} \
default: \
KERNEL_LOG_ERROR("Real kernel output data type [%s] not support.", DTypeStr(OUT_DTYPE).c_str()); \
return KERNEL_STATUS_PARAM_INVALID; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t RealCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kReal);
DataType input_type = ctx.Input(0)->GetDataType();
switch (input_type) {
Real_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, DT_FLOAT, ctx)
Real_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, DT_DOUBLE, ctx)
default:
KERNEL_LOG_ERROR("Real kernel input data type [%s] not supported.", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename t>
uint32_t RealCpuKernel::RealCompute(CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<t *>(ctx.Output(0)->GetData());
auto data_type = ctx.Input(0)->GetDataType();
int64_t data_num = ctx.Output(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
if ((data_type == DT_COMPLEX64 && data_size <= kFloatDataNums) ||
(data_type == DT_COMPLEX128 && data_size <= kDoubleDataNums)) {
for (int64_t index = 0; index < data_num; ++index) {
*(output + index) = (*(input + index)).real();
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_real = [&](size_t start, size_t end) {
for (size_t index = start; index < end; ++index) {
*(output + index) = (*(input + index)).real();
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_real),
"real Compute failed");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReal, RealCpuKernel);
} // namespace aicpu
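
The per-element work in RealCompute above reduces to std::complex::real(); a trivial hedged sketch without the AICPU plumbing (RealParts is an illustrative name):

// Minimal illustration of the element-wise operation in RealCompute.
#include <complex>
#include <vector>

std::vector<float> RealParts(const std::vector<std::complex<float>> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) out[i] = in[i].real();  // drop the imaginary part
  return out;
}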

View File

@ -0,0 +1,40 @@
/**
* Copyright (C) 2020-2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the Apache License Version 2.0.You may not use this file except in compliance with the License.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Apache License for more details at
* http://www.apache.org/licenses/LICENSE-2.0
*
* @brief
*
* @version 1.0
*
*/
#ifndef AICPU_KERNELS_NORMALIZED_REAL_H_
#define AICPU_KERNELS_NORMALIZED_REAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class RealCpuKernel : public CpuKernel {
public:
RealCpuKernel() = default;
~RealCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t RealCheck(CpuKernelContext &ctx);
template <typename T, typename t>
uint32_t RealCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,312 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "resize_area.h"
#include <securec.h>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
constexpr uint32_t kInputNum = 2;
constexpr uint32_t kOutputNum = 1;
const int64_t kParallelDataNum = 1024 * 1024;
const char *kResizeArea = "ResizeArea";
#define RESIZEAREA_COMPUTE_CASE(DTYPE, CHANNEL, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(st, x_interps, CHANNEL, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ResizeArea kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
inline int64_t Bound(int64_t val, int64_t limit) { return std::min(limit - 1, std::max(int64_t{0}, val)); }
float Scaling_(size_t in_size, size_t out_size, bool align_corners) {
return (align_corners && out_size > 1) ? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
uint32_t ResizeAreaCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t res = GetInputAndCheck(ctx);
KERNEL_CHECK_FALSE(res == KERNEL_STATUS_OK, res, "GetInputAndCheck failed.");
ResizeAreaSt st;
st.CalSt(ctx);
// compute the weight of pixels in rows
std::vector<ResizeAreaCachedInterpolation> x_interps(st.out_width);
for (size_t x = 0; x < st.out_width; x++) {
auto &x_interp = x_interps[x];
const float transit_x0 = x * st.width_scale;
const float transit_x1 = (x + 1) * st.width_scale;
size_t v = std::floor(transit_x0);
x_interp.start = v;
x_interp.start_scale = (v + 1 > transit_x1 ? st.width_scale : v + 1 - transit_x0);
v = std::ceil(transit_x1);
x_interp.end = v;
v = x_interp.end - 1;
x_interp.end_minus_one_scale = (v + 1 > transit_x1 ? transit_x1 - v : 1.0);
}
auto channels_num = -1;
if (st.channels == 3) {
channels_num = 3;
}
switch (dtype_) {
RESIZEAREA_COMPUTE_CASE(DT_INT8, channels_num, int8_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT16, channels_num, int16_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT32, channels_num, int32_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT64, channels_num, int64_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_UINT8, channels_num, uint8_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_UINT16, channels_num, uint16_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_FLOAT, channels_num, float, ctx)
RESIZEAREA_COMPUTE_CASE(DT_FLOAT16, channels_num, Eigen::half, ctx)
RESIZEAREA_COMPUTE_CASE(DT_DOUBLE, channels_num, double, ctx)
default:
KERNEL_LOG_ERROR("ResizeArea doesn't support input tensor types: [%s]", DTypeStr(dtype_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ResizeAreaCpuKernel::DoCompute(const ResizeAreaSt &st, std::vector<ResizeAreaCachedInterpolation> &x_interps,
int64_t kKnownNumChannels, CpuKernelContext &ctx) {
auto input_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_ptr = reinterpret_cast<float *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
float scale = 1.0 / (st.height_scale * st.width_scale);
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > st.out_height) {
max_core_num = st.out_height;
}
for (size_t b = 0; b < st.batch_size; ++b) {
auto shared_resize_area = [&](size_t start, size_t end) {
// compute the weight of pixels in columns
for (size_t y = start; y < end; ++y) {
const float transit_y0 = y * st.height_scale;
const float transit_y1 = (y + 1) * st.height_scale;
// The start and end height indices of all the cells that could
// contribute to the target cell.
const int64_t y_start = std::floor(transit_y0);
const int64_t y_end = std::ceil(transit_y1);
std::vector<float> y_scales;
std::vector<const T *> y_ptrs;
y_scales.clear();
y_ptrs.clear();
for (int64_t i = y_start; i < y_end; ++i) {
float scale_y;
if (i < transit_y0) {
scale_y = (i + 1 > transit_y1 ? st.height_scale : i + 1 - transit_y0);
} else {
scale_y = (i + 1 > transit_y1 ? transit_y1 - i : 1.0);
}
y_scales.push_back(scale_y);
y_ptrs.push_back(input_ptr + (b * st.in_height * st.in_width * st.channels +
Bound(i, st.in_height) * st.in_width * st.channels));
}
float *output_patch_ptr =
output_ptr + (b * st.out_height * st.out_width * st.channels + y * st.out_width * st.channels);
if (kKnownNumChannels == 3) {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSumOf3Channels<true>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
} else {
ComputePatchSumOf3Channels<false>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
}
output_patch_ptr += 3;
}
} else {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSum<true>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
} else {
ComputePatchSum<false>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
}
output_patch_ptr += st.channels;
}
}
}
};
CpuKernelUtils::ParallelFor(ctx, st.out_height, st.out_height / max_core_num, shared_resize_area);
}
} else {
std::vector<float> y_scales;
std::vector<const T *> y_ptrs;
for (size_t b = 0; b < st.batch_size; ++b) {
for (size_t y = 0; y < st.out_height; ++y) {
y_scales.clear();
y_ptrs.clear();
const float transit_y0 = y * st.height_scale;
const float transit_y1 = (y + 1) * st.height_scale;
// The start and end height indices of all the cells that could
// contribute to the target cell.
const size_t y_start = std::floor(transit_y0);
const size_t y_end = std::ceil(transit_y1);
for (size_t i = y_start; i < y_end; ++i) {
float scale_y;
if (i < transit_y0) {
scale_y = (i + 1 > transit_y1 ? st.height_scale : i + 1 - transit_y0);
} else {
scale_y = (i + 1 > transit_y1 ? transit_y1 - i : 1.0);
}
y_scales.push_back(scale_y);
y_ptrs.push_back(input_ptr + (b * st.in_height * st.in_width * st.channels +
Bound(i, st.in_height) * st.in_width * st.channels));
}
if (kKnownNumChannels == 3) {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSumOf3Channels<true>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
} else {
ComputePatchSumOf3Channels<false>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
}
output_ptr += 3;
}
} else {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSum<true>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
} else {
ComputePatchSum<false>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
}
output_ptr += st.channels;
}
}
}
}
}
return KERNEL_STATUS_OK;
}
// compute the value of a specific pixel when the number of channels is 3
template <bool NeedsXBounding, typename T>
void ResizeAreaCpuKernel::ComputePatchSumOf3Channels(float scale, const ResizeAreaSt &st,
const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales,
const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr) {
#define BOUND_IF_NEEDED(x, y) (NeedsXBounding ? Bound(x, y) : (x))
float sum_0 = 0;
float sum_1 = 0;
float sum_2 = 0;
for (size_t i = 0; i < y_ptrs.size(); ++i) {
const T *ptr = y_ptrs[i];
float scale_x = x_interp.start_scale;
int64_t offset = 3 * BOUND_IF_NEEDED(x_interp.start, st.in_width);
float sum_y_0 = static_cast<float>(ptr[offset + 0]) * scale_x;
float sum_y_1 = static_cast<float>(ptr[offset + 1]) * scale_x;
float sum_y_2 = static_cast<float>(ptr[offset + 2]) * scale_x;
if (x_interp.start + 1 != x_interp.end) {
for (size_t x = x_interp.start + 1; x < x_interp.end - 1; ++x) {
int64_t offset = 3 * BOUND_IF_NEEDED(x, st.in_width);
sum_y_0 += static_cast<float>(ptr[offset + 0]);
sum_y_1 += static_cast<float>(ptr[offset + 1]);
sum_y_2 += static_cast<float>(ptr[offset + 2]);
}
scale_x = x_interp.end_minus_one_scale;
offset = 3 * BOUND_IF_NEEDED(x_interp.end - 1, st.in_width);
sum_y_0 += static_cast<float>(ptr[offset + 0]) * scale_x;
sum_y_1 += static_cast<float>(ptr[offset + 1]) * scale_x;
sum_y_2 += static_cast<float>(ptr[offset + 2]) * scale_x;
}
sum_0 += sum_y_0 * y_scales[i];
sum_1 += sum_y_1 * y_scales[i];
sum_2 += sum_y_2 * y_scales[i];
}
output_patch_ptr[0] = sum_0 * scale;
output_patch_ptr[1] = sum_1 * scale;
output_patch_ptr[2] = sum_2 * scale;
#undef BOUND_IF_NEEDED
}
// compute the value of a specific pixel when the number of channels is not 3
template <bool NeedsXBounding, typename T>
void ResizeAreaCpuKernel::ComputePatchSum(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales,
const ResizeAreaCachedInterpolation &x_interp, float *&output_patch_ptr) {
#define BOUND_IF_NEEDED(x, y) (NeedsXBounding ? Bound(x, y) : (x))
const auto num_channels = st.channels;
for (size_t c = 0; c < num_channels; ++c) {
float sum = 0;
for (size_t i = 0; i < y_ptrs.size(); ++i) {
const T *ptr = y_ptrs[i];
float scale_x = x_interp.start_scale;
float sum_y = static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x_interp.start, st.in_width) + c]) * scale_x;
if (x_interp.start + 1 != x_interp.end) {
for (size_t x = x_interp.start + 1; x < x_interp.end - 1; ++x) {
sum_y += static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x, st.in_width) + c]);
}
scale_x = x_interp.end_minus_one_scale;
sum_y += static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x_interp.end - 1, st.in_width) + c]) * scale_x;
}
sum += sum_y * y_scales[i];
}
output_patch_ptr[c] = sum * scale;
}
#undef BOUND_IF_NEEDED
}
// check params
uint32_t ResizeAreaCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ResizeArea check input and output number failed.");
Tensor *input_tensor1 = ctx.Input(0);
Tensor *input_tensor2 = ctx.Input(1);
Tensor *output_tensor = ctx.Output(0);
auto outsize = reinterpret_cast<int32_t *>(input_tensor2->GetData());
int32_t out_height = static_cast<int32_t>(outsize[0]);
int32_t out_width = static_cast<int32_t>(outsize[1]);
in_shape1 = input_tensor1->GetTensorShape()->GetDimSizes();
in_shape2 = input_tensor2->GetTensorShape()->GetDimSizes();
out_shape = output_tensor->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(in_shape1.size() == 4, KERNEL_STATUS_PARAM_INVALID, "Dim of input[0] must be 4, but got [%zu].",
in_shape1.size());
KERNEL_CHECK_FALSE(in_shape2.size() == 1, KERNEL_STATUS_PARAM_INVALID, "Dim of input[1] must be 1, but got [%zu].",
in_shape2.size());
KERNEL_CHECK_FALSE(out_shape.size() == 4, KERNEL_STATUS_PARAM_INVALID, "Dim of output[0] must be 4, but got [%zu].",
out_shape.size());
KERNEL_CHECK_FALSE(out_height > 0 && out_width > 0, KERNEL_STATUS_PARAM_INVALID, "outsize must be positive.");
AttrValue *attr_align_corners = ctx.GetAttr("align_corners");
align_corners = (attr_align_corners == nullptr) ? false : (attr_align_corners->GetBool());
dtype_ = input_tensor1->GetDataType();
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kResizeArea, ResizeAreaCpuKernel);
} // namespace aicpu
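
To make the row-weight bookkeeping above concrete, here is a hedged standalone sketch that recomputes the x_interps fields (start, end, start_scale, end_minus_one_scale) for a small shrink with align_corners == false; CachedInterp and RowWeights are illustrative names, and the struct simply mirrors ResizeAreaCachedInterpolation so the snippet compiles on its own.

// Standalone sketch of the per-column interpolation weights computed in
// ResizeAreaCpuKernel::Compute, assuming align_corners == false.
#include <cmath>
#include <cstdio>
#include <vector>

struct CachedInterp {         // mirrors ResizeAreaCachedInterpolation
  size_t start;               // first source column contributing to this output column
  size_t end;                 // one past the last contributing source column
  float start_scale;          // fractional weight of the first column
  float end_minus_one_scale;  // fractional weight of the last column
};

std::vector<CachedInterp> RowWeights(size_t in_width, size_t out_width) {
  const float width_scale = in_width / static_cast<float>(out_width);
  std::vector<CachedInterp> interps(out_width);
  for (size_t x = 0; x < out_width; ++x) {
    const float x0 = x * width_scale;
    const float x1 = (x + 1) * width_scale;
    size_t v = static_cast<size_t>(std::floor(x0));
    interps[x].start = v;
    interps[x].start_scale = (v + 1 > x1 ? width_scale : v + 1 - x0);
    v = static_cast<size_t>(std::ceil(x1));
    interps[x].end = v;
    v = interps[x].end - 1;
    interps[x].end_minus_one_scale = (v + 1 > x1 ? x1 - v : 1.0f);
  }
  return interps;
}

int main() {
  // Shrinking a 5-pixel row to 2 output pixels: each output cell spans 2.5 source columns.
  for (const auto &it : RowWeights(5, 2)) {
    std::printf("start=%zu end=%zu start_scale=%.2f end_scale=%.2f\n",
                it.start, it.end, it.start_scale, it.end_minus_one_scale);
  }
  return 0;
}

For RowWeights(5, 2) each output cell covers 2.5 source columns, so the two entries come out as start=0, end=3 with scales 1.00/0.50 and start=2, end=5 with scales 0.50/1.00; those partial-coverage weights are what ComputePatchSum and ComputePatchSumOf3Channels later multiply into the accumulated sums.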

View File

@ -0,0 +1,88 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RESIZE_AREA_H_
#define AICPU_KERNELS_NORMALIZED_RESIZE_AREA_H_
#include <string>
#include "Eigen/Core"
#include "cpu_ops_kernel.h"
namespace aicpu {
std::vector<int64_t> in_shape1;
std::vector<int64_t> in_shape2;
std::vector<int64_t> out_shape;
bool align_corners = false;
// weight data of every pixel
struct ResizeAreaCachedInterpolation {
size_t start;
size_t end;
float start_scale;
float end_minus_one_scale;
bool needs_bounding = true;
};
inline int64_t Bound(int64_t val, int64_t limit);
float Scaling_(size_t in_size, size_t out_size, bool align_corners);
struct ResizeAreaSt {
void CalSt(CpuKernelContext &ctx) {
Tensor *input_tensor1 = ctx.Input(0);
Tensor *input_tensor2 = ctx.Input(1);
in_shape1 = input_tensor1->GetTensorShape()->GetDimSizes();
auto outsize = reinterpret_cast<int32_t *>(input_tensor2->GetData());
batch_size = in_shape1[0];
channels = in_shape1[3];
in_height = in_shape1[1];
in_width = in_shape1[2];
out_height = outsize[0];
out_width = outsize[1];
height_scale = Scaling_(in_height, out_height, align_corners);
width_scale = Scaling_(in_width, out_width, align_corners);
}
size_t batch_size;
size_t channels;
size_t in_height;
size_t in_width;
size_t out_height;
size_t out_width;
float height_scale;
float width_scale;
};
class ResizeAreaCpuKernel : public CpuKernel {
public:
~ResizeAreaCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(const ResizeAreaSt &st, std::vector<ResizeAreaCachedInterpolation> &x_interps,
int64_t kKnownNumChannels, CpuKernelContext &ctx);
template <bool NeedsXBounding, typename T>
void ComputePatchSumOf3Channels(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales, const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr);
template <bool NeedsXBounding, typename T>
void ComputePatchSum(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales, const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr);
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
DataType dtype_ = DT_INT8;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,337 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_mean.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const int64_t kParallelDataNum = 2 * 1024;
const char *kSegmentMean = "SegmentMean";
#define SEGMENTMEAN_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentMeanCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMean kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTMEAN_COMPUTE_CASE_Complex(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentMeanCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMean kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTMEAN_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE_Complex(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE_Complex(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) SEGMENTMEAN_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) SEGMENTMEAN_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
template <typename T>
T ComplexDiv(T sum, int64_t num) {
T res;
auto real = sum.real();
auto imag = sum.imag();
res.real(real / num);
res.imag(imag / num);
return res;
}
uint32_t SegmentMeanCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentMean check input and output number failed.");
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *segment_ids_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(segment_ids_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTMEAN_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTMEAN_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentMeanCpuKernel::SegmentMeanCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kParallelDataNum) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < kParallelDataNum) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
} else {
uint32_t min_core_num = 1;
int64_t mean_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num > num_compare_per) {
mean_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / mean_core_num, shard_compute),
"SegmentMean Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t mean_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num_seg > num_segments) {
mean_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / mean_core_num_seg, shard_compute_seg),
"SegmentMean Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentMeanCpuKernel::SegmentMeanCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kParallelDataNum) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < kParallelDataNum) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
} else {
uint32_t min_core_num = 1;
int64_t mean_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num > num_compare_per) {
mean_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / mean_core_num, shard_compute),
"SegmentMean Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t mean_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num_seg > num_segments) {
mean_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / mean_core_num_seg, shard_compute_seg),
"SegmentMean Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentMean, SegmentMeanCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentMeanCpuKernel : public CpuKernel {
public:
SegmentMeanCpuKernel() = default;
~SegmentMeanCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t SegmentMeanCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t SegmentMeanCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_

View File

@ -0,0 +1,243 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_min.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <iostream>
#include <vector>
#include <unordered_map>
using namespace std;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kSegmentMin = "SegmentMin";
#define SEGMENT_MIN_COMPUTE_CASE(DTYPE, TYPE, CTX, STYPE) \
case (DTYPE): { \
uint32_t res; \
switch (STYPE) { \
case DT_INT32: \
res = SegmentMinCompute<TYPE, int32_t>(CTX); \
break; \
case DT_INT64: \
res = SegmentMinCompute<TYPE, int64_t>(CTX); \
break; \
default: \
KERNEL_LOG_ERROR("SegmentMin kernel segment_ids type [%s] not support.", DTypeStr(STYPE).c_str()); \
return KERNEL_STATUS_PARAM_INVALID; \
} \
if (res != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMin kernel compute failed."); \
return res; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SegmentMinCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentMin check input and output number failed.");
KERNEL_HANDLE_ERROR(SegmentMinCheck(ctx), "SegmentMin check params failed.");
auto type_data = ctx.Input(0)->GetDataType();
auto type_seg = ctx.Input(1)->GetDataType();
switch (type_data) {
SEGMENT_MIN_COMPUTE_CASE(DT_INT8, int8_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT16, int16_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT32, int32_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT64, int64_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT8, uint8_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT32, uint32_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT16, uint16_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT64, uint64_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_FLOAT, float, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_DOUBLE, double, ctx, type_seg)
default:
KERNEL_LOG_ERROR("SegmentMin kernel data type [%s] not support.", DTypeStr(type_data).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2>
uint32_t SegmentMinCpuKernel::SegmentMinCompute(CpuKernelContext &ctx) {
auto data = ctx.Input(0); // tensor*
auto segment_ids = ctx.Input(1);
auto output = ctx.Output(0);
auto data_data = reinterpret_cast<T1 *>(data->GetData());
auto segment_ids_data = reinterpret_cast<T2 *>(segment_ids->GetData());
auto segment_ids_len = segment_ids->NumElements();
auto data_len = data->NumElements();
auto data_shape = data->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto output_data = reinterpret_cast<T1 *>(output->GetData());
uint64_t output_len = output->NumElements();
uint64_t len2 = data_len / data_shape->GetDimSize(0);
uint64_t _8k = 8 * 1024, _2k = 2 * 1024;
  // Initialize the output to 0.
if (output_len <= _8k) {
    for (uint64_t i = 0; i < output_len; i++) output_data[i] = static_cast<T1>(0);
} else {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) / 2);
if (max_core > output_len) {
max_core = output_len;
}
auto init = [&](size_t start, size_t end) {
      for (auto i = start; i < end; i++) output_data[i] = static_cast<T1>(0);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, output_len, output_len / max_core, init),
"Initialize value of output failed.");
}
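  // Group equal consecutive segment ids: nums[i] is the i-th distinct id and ranges[i] the
  // [first, last] input-row range carrying it.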
vector<T2> nums;
vector<pair<uint64_t, uint64_t>> ranges;
for (int64_t i = 0; i < segment_ids_len; ++i) {
if (i) {
if (segment_ids_data[i] == nums.back()) {
++ranges.back().second;
} else {
        nums.push_back(segment_ids_data[i]);
        ranges.push_back({i, i});
}
} else {
      nums.push_back(segment_ids_data[0]);
      ranges.push_back(make_pair(0, 0));
}
}
uint64_t nums_len = nums.size();
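  // With many segments, parallelize across the segments; otherwise walk them serially and
  // parallelize across the inner dimension when it is wide enough.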
if (nums_len > _8k) {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, nums_len);
auto mt_for_nums = [&](size_t start_num, size_t end_num) {
for (auto i = start_num; i < end_num; ++i) {
uint64_t st = ranges[i].first, ed = ranges[i].second;
uint64_t output_start = nums[i] * len2;
for (uint64_t k = 0; k < len2; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st)
u = v;
else
u = std::min(u, v);
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, nums_len, nums_len / max_core, mt_for_nums),
"SegmentMin Compute failed.");
} else {
for (uint64_t i = 0; i < nums_len; ++i) {
uint64_t st = ranges[i].first, ed = ranges[i].second;
uint64_t output_start = nums[i] * len2;
if (len2 < _2k) {
for (uint64_t k = 0; k < len2; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st) {
u = v;
} else {
u = std::min(u, v);
}
}
}
} else {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, len2);
auto mt_for_len2 = [&](size_t start_len, size_t end_len) {
for (uint64_t k = start_len; k < end_len; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st) {
u = v;
} else {
u = std::min(u, v);
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, len2, len2 / max_core, mt_for_len2),
"SegmentMin Compute failed.");
}
}
}
return KERNEL_STATUS_OK;
}
uint32_t SegmentMinCpuKernel::SegmentMinCheck(CpuKernelContext &ctx) {
// inspect the input & output pointer
KERNEL_CHECK_NULLPTR(ctx.Input(0), KERNEL_STATUS_PARAM_INVALID, "Get input 0 failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1), KERNEL_STATUS_PARAM_INVALID, "Get input 1 failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0), KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
// inspect data in input & output
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
  // parameter validity checks
KERNEL_CHECK_FALSE(CheckType(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID,
"The data type of segment_ids should be DT_INT32 or DT_INT64.")
KERNEL_CHECK_FALSE(CheckDim(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID, "The dimension of segment_ids should be 1.")
KERNEL_CHECK_FALSE(CheckSorted(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID,
"segment_ids should be ascending and no negative number in it.")
KERNEL_CHECK_FALSE(CheckLength(ctx.Input(1), ctx.Input(0)), KERNEL_STATUS_PARAM_INVALID,
"The length of segment_ids should be equal to the length "
"of the first dimension of the data")
KERNEL_LOG_DEBUG(
"SegmentMinCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
bool SegmentMinCpuKernel::CheckType(Tensor *t) {
DataType type = t->GetDataType();
return type == DT_INT32 || type == DT_INT64;
}
bool SegmentMinCpuKernel::CheckDim(Tensor *t) {
auto dims = t->GetTensorShape()->GetDims();
return dims == 1;
}
bool SegmentMinCpuKernel::CheckSorted(Tensor *tensor) {
DataType type = tensor->GetDataType();
auto len = tensor->NumElements();
switch (type) {
case DT_INT32: {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (int64_t i = 0; i < len; i++)
if ((i && data[i] < data[i - 1]) || data[i] < 0) {
return false;
}
break;
}
case DT_INT64: {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (int64_t i = 0; i < len; i++)
if ((i && data[i] < data[i - 1]) || data[i] < 0) {
return false;
}
break;
}
default:
return true;
}
return true;
}
bool SegmentMinCpuKernel::CheckLength(Tensor *seg, Tensor *data) {
auto len1 = seg->NumElements();
auto len2 = data->GetTensorShape()->GetDimSize(0);
return len1 == len2;
}
REGISTER_CPU_KERNEL(kSegmentMin, SegmentMinCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentMinCpuKernel : public CpuKernel {
public:
SegmentMinCpuKernel() = default;
~SegmentMinCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <class T1, class T2>
static uint32_t SegmentMinCompute(CpuKernelContext &ctx);
static uint32_t SegmentMinCheck(CpuKernelContext &ctx);
static bool CheckType(Tensor *t);
static bool CheckDim(Tensor *t);
static bool CheckSorted(Tensor *t);
static bool CheckLength(Tensor *seg, Tensor *data);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_

View File

@ -0,0 +1,342 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_prod.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kSegmentProd = "SegmentProd";
#define SEGMENTPROD_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentProdCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentProd kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTPROD_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentProdCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentProd kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTPROD_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t SegmentProdCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentProd check input and output number failed.");
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *segment_ids_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(segment_ids_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTPROD_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTPROD_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
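// Complex multiplication written out explicitly: (a + bi) * (x + yi) = (ax - by) + (bx + ay)i.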
template <typename T>
T SegmentProdCpuKernel::ComputeMul(T num_1, T num_2) {
T res;
auto a = num_1.real();
auto b = num_1.imag();
auto x = num_2.real();
auto y = num_2.imag();
auto real_res = a * x - b * y;
auto imag_res = b * x + a * y;
res.real(real_res);
res.imag(imag_res);
return res;
}
template <typename T1, typename T2>
uint32_t SegmentProdCpuKernel::SegmentProdCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(1);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < 2 * 1024) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < 2 * 1024) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
} else {
uint32_t min_core_num = 1;
int64_t prod_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num > num_compare_per) {
prod_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / prod_core_num, shard_compute),
"SegmentProd Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t prod_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num_seg > num_segments) {
prod_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / prod_core_num_seg, shard_compute_seg),
"SegmentProd Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentProdCpuKernel::SegmentProdCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(1);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < 2 * 1024) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < 2 * 1024) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
} else {
uint32_t min_core_num = 1;
int64_t prod_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num > num_compare_per) {
prod_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / prod_core_num, shard_compute),
"SegmentProd Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t prod_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num_seg > num_segments) {
prod_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / prod_core_num_seg, shard_compute_seg),
"SegmentProd Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentProd, SegmentProdCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentProdCpuKernel : public CpuKernel {
public:
SegmentProdCpuKernel() = default;
~SegmentProdCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static T ComputeMul(T num_1, T num_2);
template <typename T1, typename T2>
static uint32_t SegmentProdCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t SegmentProdCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_

View File

@ -0,0 +1,212 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_sum.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kSegmentSum = "SegmentSum";
const int64_t kDataSize = 2 * 1024;
#define SEGMENTSUM_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentSumCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentSum kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTSUM_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t SegmentSumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentSum check input and output number failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTSUM_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTSUM_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentSumCpuKernel::SegmentSumCompute(CpuKernelContext &ctx) {
Tensor *input_x_data = ctx.Input(0);
auto input_x_data_addr = reinterpret_cast<T1 *>(input_x_data->GetData());
auto input_x_shape = input_x_data->GetTensorShape();
auto input_x_dims = input_x_shape->GetDimSizes();
int64_t input_x_data_num = input_x_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
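  // segment_ids is ascending, so the last id determines the number of output segments.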
input_x_dims[0] = segment_ids_data_addr[segment_ids_data_num - 1] + 1;
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
auto output_data_shape = output_data->GetTensorShape();
if (output_data_shape->GetDimSize(0) < input_x_dims[0]) {
KERNEL_LOG_ERROR("The number of segments of the segmentation result of segment_ids is too large.");
return KERNEL_STATUS_PARAM_INVALID;
}
output_data_shape->SetDimSizes(input_x_dims);
if (!output_data->SetTensorShape(output_data_shape.get())) {
KERNEL_LOG_ERROR("Set output shape failed.");
return KERNEL_STATUS_INNER_ERROR;
}
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_x_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_x_data_num / (input_x_shape->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kDataSize) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
      if (num_compare_per < kDataSize) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > num_compare_per) {
sum_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / sum_core_num, shard_compute),
"SegmentSum Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t sum_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num_seg > num_segments) {
sum_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / sum_core_num_seg, shard_compute_seg),
"SegmentSum Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentSum, SegmentSumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentSumCpuKernel : public CpuKernel {
public:
SegmentSumCpuKernel() = default;
~SegmentSumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t SegmentSumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_

View File

@ -0,0 +1,282 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "slice.h"
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kSlice = "Slice";
#define SLICE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SliceCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Slice kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
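// Reads the offsets/size tensor into an int64_t vector; only int32 and int64 element types are accepted.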
uint32_t SliceCpuKernel::GetSliceValue(Tensor *tensor, std::vector<int64_t> &value) {
auto type = tensor->GetDataType();
if (type == DT_INT32) {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (int64_t i = 0; i < tensor->NumElements(); i++) {
value.push_back(static_cast<int64_t>(*(data + i)));
}
} else if (type == DT_INT64) {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (int64_t i = 0; i < tensor->NumElements(); i++) {
value.push_back(*(data + i));
}
} else {
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SliceCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSlice);
KERNEL_HANDLE_ERROR(SliceCheck(ctx), "[%s] check params failed.", kSlice);
auto x_type = ctx.Input(0)->GetDataType();
switch (x_type) {
SLICE_COMPUTE_CASE(DT_BOOL, bool, ctx)
SLICE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SLICE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SLICE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SLICE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SLICE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SLICE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SLICE_COMPUTE_CASE(DT_FLOAT, float, ctx)
SLICE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
SLICE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SLICE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Slice kernel data type [%s] not support.", DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SliceCpuKernel::SliceCheck(CpuKernelContext &ctx) {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(kThirdInputIndex)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(kThirdInputIndex)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get input 2 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get output 0 tensor shape failed.")
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_offsets = ctx.Input(1)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_size = ctx.Input(kThirdInputIndex)->GetTensorShape()->GetDimSizes();
auto offsets_tensor = ctx.Input(1);
auto size_tensor = ctx.Input(2);
auto y_tensor = ctx.Output(0);
KERNEL_CHECK_FALSE((offsets_tensor->NumElements() == static_cast<int64_t>(shape_x.size())),
KERNEL_STATUS_PARAM_INVALID, "Expected offsets to be 1-D tensors of size [%zu], but got [%zu].",
shape_x.size(), offsets_tensor->NumElements())
KERNEL_CHECK_FALSE((size_tensor->NumElements() == static_cast<int64_t>(shape_x.size())), KERNEL_STATUS_PARAM_INVALID,
"Expected size to be 1-D tensors of size [%zu], but got [%zu].", shape_x.size(),
size_tensor->NumElements())
KERNEL_CHECK_FALSE((GetSliceValue(offsets_tensor, offsets) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Offsets must be either int32 or int64, but got [%s].",
DTypeStr(offsets_tensor->GetDataType()).c_str())
KERNEL_CHECK_FALSE((GetSliceValue(size_tensor, size) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Size must be either int32 or int64, but got [%s].", DTypeStr(size_tensor->GetDataType()).c_str())
is_identity = true;
slice_dim0 = true;
std::vector<int64_t> shape_y;
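  // Validate offsets/size per dimension and detect two fast paths: is_identity (the slice covers
  // the whole input) and slice_dim0 (only dim 0 is sliced, leaving one contiguous block).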
for (size_t i = 0; i < shape_x.size(); ++i) {
if (size.at(i) == -1) {
size.at(i) = shape_x.at(i) - offsets.at(i);
}
int64_t offset = offsets.at(i);
int64_t size_dim = size.at(i);
if (shape_x.at(i) == 0) {
KERNEL_CHECK_FALSE((offset == 0 && size_dim == 0), KERNEL_STATUS_PARAM_INVALID,
"Expected offsets[%zu] == 0 (got %zu) and size[%zu] == 0 (got %zu),"
" when x shape[%zu] == 0.",
i, offset, i, size_dim, i)
} else {
KERNEL_CHECK_FALSE((0 <= offset && offset < shape_x.at(i)), KERNEL_STATUS_PARAM_INVALID,
"Expected offsets[%zu] in [0, %zu], but got %zu.", i, shape_x.at(i), offset)
KERNEL_CHECK_FALSE((0 <= size_dim && offset + size_dim <= shape_x.at(i)), KERNEL_STATUS_PARAM_INVALID,
"Expected size[%zu] in [0, %zu], but got %zu.", i, shape_x.at(i) - offset, size_dim)
}
bool take_all = (offset == 0) && (size_dim == shape_x.at(i));
is_identity &= take_all;
slice_dim0 &= (i == 0) || take_all;
shape_y.push_back(size_dim);
}
y_tensor->GetTensorShape()->SetDimSizes(shape_y);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SliceCpuKernel::SliceCompute(CpuKernelContext &ctx) {
auto x_data = ctx.Input(0)->GetData();
auto y_data = ctx.Output(0)->GetData();
int64_t num_output = ctx.Output(0)->NumElements();
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_y = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (num_output == 0) {
return KERNEL_STATUS_OK;
}
if (is_identity) {
int64_t input_size = ctx.Input(0)->GetDataSize();
int cpret = memcpy_s(y_data, input_size, x_data, input_size);
KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR, "[%s] memcpy_s to output failed, size [%llu].",
kSlice, input_size);
return KERNEL_STATUS_OK;
}
if (slice_dim0) {
    // Only dim 0 is sliced and every other dim is taken in full, so the output is one contiguous
    // block of num_output elements starting at input row offsets[0].
    int64_t inner_size = num_output / size.at(0);
    int64_t data_size = num_output * static_cast<int64_t>(sizeof(T));
    int cpret = memcpy_s(y_data, data_size, static_cast<T *>(x_data) + offsets.at(0) * inner_size, data_size);
    KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR, "[%s] memcpy_s to output failed, size [%llu].",
                       kSlice, data_size);
return KERNEL_STATUS_OK;
}
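  // General path: materialize the slice through Eigen TensorMap::slice for ranks 2 to 7.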
auto input_data = reinterpret_cast<T *>(x_data);
auto output_data = reinterpret_cast<T *>(y_data);
size_t input_dims = shape_x.size();
switch (input_dims) {
case INPUT_NUM2: {
using Eigen_Tensor_2D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM2), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_2D input_2D(input_data, shape_x.at(0), shape_x.at(1));
Eigen_Tensor_2D output_2D(output_data, shape_y.at(0), shape_y.at(1));
Eigen::array<Eigen::DenseIndex, INPUT_NUM2> offsets_2D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM2> size_2D;
for (size_t i = 0; i < INPUT_NUM2; ++i) {
offsets_2D[i] = offsets.at(i);
size_2D[i] = size.at(i);
}
output_2D = input_2D.slice(offsets_2D, size_2D);
break;
}
case INPUT_NUM3: {
using Eigen_Tensor_3D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM3), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_3D input_3D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2));
Eigen_Tensor_3D output_3D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2));
Eigen::array<Eigen::DenseIndex, INPUT_NUM3> offsets_3D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM3> size_3D;
for (size_t i = 0; i < INPUT_NUM3; ++i) {
offsets_3D[i] = offsets.at(i);
size_3D[i] = size.at(i);
}
output_3D = input_3D.slice(offsets_3D, size_3D);
break;
}
case INPUT_NUM4: {
using Eigen_Tensor_4D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM4), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_4D input_4D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2),
shape_x.at(INPUT_NUM3));
Eigen_Tensor_4D output_4D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3));
Eigen::array<Eigen::DenseIndex, INPUT_NUM4> offsets_4D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM4> size_4D;
for (size_t i = 0; i < INPUT_NUM4; ++i) {
offsets_4D[i] = offsets.at(i);
size_4D[i] = size.at(i);
}
output_4D = input_4D.slice(offsets_4D, size_4D);
break;
}
case INPUT_NUM5: {
using Eigen_Tensor_5D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM5), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_5D input_5D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4));
Eigen_Tensor_5D output_5D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4));
Eigen::array<Eigen::DenseIndex, INPUT_NUM5> offsets_5D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM5> size_5D;
for (size_t i = 0; i < INPUT_NUM5; ++i) {
offsets_5D[i] = offsets.at(i);
size_5D[i] = size.at(i);
}
output_5D = input_5D.slice(offsets_5D, size_5D);
break;
}
case INPUT_NUM6: {
using Eigen_Tensor_6D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM6), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_6D input_6D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4), shape_x.at(INPUT_NUM5));
Eigen_Tensor_6D output_6D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4), shape_y.at(INPUT_NUM5));
Eigen::array<Eigen::DenseIndex, INPUT_NUM6> offsets_6D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM6> size_6D;
for (size_t i = 0; i < INPUT_NUM6; ++i) {
offsets_6D[i] = offsets.at(i);
size_6D[i] = size.at(i);
}
output_6D = input_6D.slice(offsets_6D, size_6D);
break;
}
case INPUT_NUM7: {
using Eigen_Tensor_7D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM7), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_7D input_7D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4), shape_x.at(INPUT_NUM5), shape_x.at(INPUT_NUM6));
Eigen_Tensor_7D output_7D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4), shape_y.at(INPUT_NUM5),
shape_y.at(INPUT_NUM6));
Eigen::array<Eigen::DenseIndex, INPUT_NUM7> offsets_7D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM7> size_7D;
for (size_t i = 0; i < INPUT_NUM7; ++i) {
offsets_7D[i] = offsets.at(i);
size_7D[i] = size.at(i);
}
output_7D = input_7D.slice(offsets_7D, size_7D);
break;
}
default:
KERNEL_LOG_ERROR("[%s] : Unhandled input dimensions [%zu].", kSlice, input_dims);
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSlice, SliceCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,44 @@
/**
* Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SLICE_H_
#define AICPU_KERNELS_NORMALIZED_SLICE_H_
#include "cpu_ops_kernel.h"
#include <vector>
namespace aicpu {
class SliceCpuKernel : public CpuKernel {
public:
SliceCpuKernel() = default;
~SliceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
bool is_identity;
bool slice_dim0;
std::vector<int64_t> offsets;
std::vector<int64_t> size;
uint32_t GetSliceValue(Tensor *tensor, std::vector<int64_t> &value);
uint32_t SliceCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t SliceCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SLICE_H_

View File

@ -0,0 +1,518 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_ops_kernel.h"
#include <string>
#include "sparse_cross.h"
#include <iostream>
namespace {
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
const char *kSparseCross = "SparseCross";
} // namespace
namespace aicpu {
typedef std::pair<uint64_t, uint64_t> uint128_t;
inline uint64_t Uint128Low64(const uint128_t x) { return x.first; }
inline uint64_t Uint128High64(const uint128_t x) { return x.second; }
inline uint128_t Uint128(uint64_t lo, uint64_t hi) { return uint128_t(lo, hi); }
#define STATIC_INLINE static inline
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
STATIC_INLINE uint64_t Fetch64(const char *p) {
uint64_t result;
memcpy(&result, p, sizeof(result));
return uint64_in_expected_order(result);
}
STATIC_INLINE uint32_t Fetch32(const char *p) {
uint32_t result;
memcpy(&result, p, sizeof(result));
return uint32_in_expected_order(result);
}
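// A CityHash/FarmHash-style mix of a 128-bit value down to a single 64-bit hash.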
STATIC_INLINE uint64_t Hash128to64(uint128_t x) {
const uint64_t kMul = 0x9ddfea08eb382d69ULL;
uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
uint64_t value = 47;
a ^= (a >> value);
uint64_t b = (Uint128High64(x) ^ a) * kMul;
b ^= (b >> value);
b *= kMul;
return b;
}
STATIC_INLINE uint64_t ShiftMix(uint64_t val) {
uint64_t value = 47;
return val ^ (val >> value);
}
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v, uint64_t mul) {
uint64_t a = (u ^ v) * mul;
uint64_t value = 47;
a ^= (a >> value);
uint64_t b = (v ^ a) * mul;
b ^= (b >> value);
b *= mul;
return b;
}
STATIC_INLINE uint64_t HashLen0to16(const char *s, size_t len) {
if (len > 0) {
uint8_t a = s[0];
uint8_t b = s[len >> 1];
uint8_t c = s[len - 1];
uint32_t y = static_cast<uint32_t>(a) + (static_cast<uint32_t>(b) << 8);
uint32_t z = len + (static_cast<uint32_t>(c) << 2);
return ShiftMix(y * k2 ^ z * k0) * k2;
}
return k2;
}
uint64_t FarmHash64(const char *s, size_t len) { return HashLen0to16(s, len); }
uint64_t Fingerprint64(const string s) { return FarmHash64(s.data(), s.size()); }
template <typename InternalType>
class ColumnInterface {
public:
virtual int64_t FeatureCount(int64_t batch) const = 0;
virtual InternalType Feature(int64_t batch, int64_t n) const = 0;
virtual ~ColumnInterface() {}
};
template <typename InternalType>
class SparseTensorColumn : public ColumnInterface<InternalType> {
public:
SparseTensorColumn(Tensor *values, std::vector<int64_t> feature_counts, std::vector<int64_t> feature_start_indices)
: values_(values),
feature_counts_(std::move(feature_counts)),
feature_start_indices_(std::move(feature_start_indices)) {
if (feature_counts_.size() != feature_start_indices_.size()) {
KERNEL_LOG_ERROR("feature_counts_ is not equal to feature_start_indices_.");
}
}
int64_t FeatureCount(int64_t batch) const override { return feature_counts_[batch]; }
InternalType Feature(int64_t batch, int64_t n) const override;
~SparseTensorColumn() override {}
private:
Tensor *values_;
std::vector<int64_t> feature_counts_;
std::vector<int64_t> feature_start_indices_;
};
template <>
std::string SparseTensorColumn<std::string>::Feature(int64_t batch, int64_t n) const {
const int64_t start = feature_start_indices_[batch];
EigenTensor values_e(values_, values_->GetData());
if (DT_STRING == values_->GetDataType()) return values_e.vec<std::string>().data()[start + n];
return std::to_string(values_e.vec<int64_t>().data()[start + n]);
}
template <>
int64_t SparseTensorColumn<int64_t>::Feature(int64_t batch, int64_t n) const {
const int64_t start = feature_start_indices_[batch];
EigenTensor values_e(values_, values_->GetData());
if (DT_STRING == values_->GetDataType()) {
return Fingerprint64(values_e.vec<std::string>().data()[start + n]);
}
return values_e.vec<int64_t>().data()[start + n];
}
template <typename InternalType>
class DenseTensorColumn : public ColumnInterface<InternalType> {
public:
explicit DenseTensorColumn(Tensor *tensor) : tensor_(tensor) {}
int64_t FeatureCount(int64_t batch) const override { return tensor_->GetTensorShape()->GetDimSize(1); }
InternalType Feature(int64_t batch, int64_t n) const override;
~DenseTensorColumn() override {}
private:
Tensor *tensor_;
};
template <>
int64_t DenseTensorColumn<int64_t>::Feature(int64_t batch, int64_t n) const {
EigenTensor tensor_e(tensor_, tensor_->GetData());
if (DT_STRING == tensor_->GetDataType()) return Fingerprint64(tensor_e.matrix<std::string>()(batch, n));
return tensor_e.matrix<int64_t>()(batch, n);
}
template <>
std::string DenseTensorColumn<std::string>::Feature(int64_t batch, int64_t n) const {
EigenTensor tensor_e(tensor_, tensor_->GetData());
if (DT_STRING == tensor_->GetDataType()) return tensor_e.matrix<std::string>()(batch, n);
return std::to_string(tensor_e.matrix<int64_t>()(batch, n));
}
template <typename OutType>
class OutputUpdater {
public:
OutputUpdater(const std::vector<int64_t> &output_start_indices, Tensor *indices_out, Tensor *values_out)
: output_start_indices_(output_start_indices), indices_out_(indices_out), values_out_(values_out) {}
void Update(const int64_t batch_index, const int64_t cross_count, const OutType &cross) const {
const int64_t output_index = output_start_indices_[batch_index] + cross_count;
auto indices_out_addr = static_cast<int64_t *>(indices_out_->GetData());
int64_t value = 2;
indices_out_addr[output_index * value] = batch_index;
indices_out_addr[output_index * value + 1] = cross_count;
auto values_out_addr = static_cast<OutType *>(values_out_->GetData());
values_out_addr[output_index] = cross;
}
private:
const std::vector<int64_t> &output_start_indices_;
Tensor *indices_out_;
Tensor *values_out_;
};
template <typename InternalType>
class StringCrosser {
public:
StringCrosser(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
const int64_t num_buckets_unused, const uint64_t hash_key_unused)
: columns_(columns) {}
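  // Joins the selected feature from every column with the "_X_" separator to form the crossed key.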
std::string Generate(const int64_t batch_index, const std::vector<int64_t> &permutation) const {
static const auto k_feature_separator = "_X_";
std::vector<InternalType> cross_vec(columns_.size());
for (size_t i = 0; i < permutation.size(); i++) {
cross_vec[i] = columns_[i]->Feature(batch_index, permutation[i]);
}
size_t i;
string str1 = "";
for (i = 0; i < cross_vec.size() - 1; i++) {
str1 = str1 + cross_vec[i].data();
str1 = str1 + k_feature_separator;
}
str1 = str1 + cross_vec[i].data();
return str1;
}
private:
const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns_;
};
class HashCrosser {
public:
HashCrosser(const std::vector<std::unique_ptr<ColumnInterface<int64_t>>> &columns, const int64_t num_buckets,
const uint64_t hash_key)
: columns_(columns), num_buckets_(num_buckets), hash_key_(hash_key) {}
uint64_t ShiftMix(const uint64_t val) const { return val ^ (val >> 47); }
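  // Combines two 64-bit fingerprints with a Murmur-style mix; the result depends on argument order.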
uint64_t FingerprintCat64(const uint64_t fp1, const uint64_t fp2) const {
static const uint64_t kMul = 0xc6a4a7935bd1e995ULL;
uint64_t result = fp1 ^ kMul;
result ^= ShiftMix(fp2 * kMul) * kMul;
result *= kMul;
result = ShiftMix(result) * kMul;
result = ShiftMix(result);
return result;
}
int64_t Generate(const int64_t batch_index, const std::vector<int64_t> &permutation) const {
uint64_t hashed_output = hash_key_;
for (size_t i = 0; i < permutation.size(); ++i) {
uint64_t hash_i = columns_[i]->Feature(batch_index, permutation[i]);
hashed_output = FingerprintCat64(hashed_output, hash_i);
}
if (num_buckets_ > 0) {
return hashed_output % num_buckets_;
} else {
return hashed_output % std::numeric_limits<int64_t>::max();
}
}
private:
const std::vector<std::unique_ptr<ColumnInterface<int64_t>>> &columns_;
const int64_t num_buckets_;
const uint64_t hash_key_;
};
template <typename InternalType>
class ProductIterator {
public:
explicit ProductIterator(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
int64_t batch_index)
: columns_(columns), batch_index_(batch_index) {
next_permutation_.resize(columns_.size(), 0);
has_next_ = true;
for (size_t i = 0; i < columns_.size(); i++) {
if (columns_[i]->FeatureCount(batch_index_) == 0) {
has_next_ = false;
break;
}
}
}
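  // Advances a mixed-radix counter over the per-column feature counts, enumerating the Cartesian
  // product of features for this batch row.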
std::vector<int64_t> Next() {
std::vector<int64_t> permutation(next_permutation_);
bool carry = true;
for (int64_t i = next_permutation_.size() - 1; i >= 0; i--) {
if (carry) {
next_permutation_[i] = next_permutation_[i] + 1;
}
if (next_permutation_[i] == columns_[i]->FeatureCount(batch_index_)) {
next_permutation_[i] = 0;
} else {
carry = false;
break;
}
}
has_next_ = !carry;
return permutation;
}
bool HasNext() { return has_next_; }
private:
bool has_next_;
const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns_;
const int64_t batch_index_;
std::vector<int64_t> next_permutation_;
};
template <bool HASHED_OUTPUT, typename InternalType>
struct CrossTraits;
template <typename InternalType>
struct CrossTraits<false, InternalType> {
typedef StringCrosser<InternalType> Crosser;
typedef OutputUpdater<std::string> Updater;
};
template <>
struct CrossTraits<true, int64_t> {
typedef HashCrosser Crosser;
typedef OutputUpdater<int64_t> Updater;
};
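// The batch size comes from the first sparse dense-shape input when one is present, otherwise
// from the first dimension of the first dense input; with neither kind of input it is 0.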
int64_t CalculateBatchSize(const OpInputList &shapes_list_in, const OpInputList &dense_list_in) {
if (shapes_list_in.size() > 0) {
EigenTensor shapes_list_in_e(shapes_list_in[0], shapes_list_in[0]->GetData());
return shapes_list_in_e.vec<int64_t>()(0);
}
if (dense_list_in.size() > 0) {
return dense_list_in[0]->GetTensorShape()->GetDimSize(0);
}
return 0;
}
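// Scans the COO indices of every sparse column once (they are expected to be grouped by batch
// row) and records, for each batch row, how many features the column holds and where they start.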
void ExtractFeatureData(const OpInputList &indices_list_in, int64_t batch_size,
std::vector<std::vector<int64_t>> *feature_counts,
std::vector<std::vector<int64_t>> *feature_start_indices) {
std::vector<int64_t> current_row(indices_list_in.size());
for (int64_t b = 0; b < batch_size; b++) {
for (int64_t i = 0; i < indices_list_in.size(); i++) {
EigenTensor indices_list_in_e(indices_list_in[i], indices_list_in[i]->GetData());
const auto indices = indices_list_in_e.matrix<int64_t>();
int64_t feature_count = 0;
int64_t start_index = current_row[i];
while (current_row[i] < indices_list_in[i]->GetTensorShape()->GetDimSize(0) && indices(current_row[i], 0) == b) {
feature_count++;
current_row[i]++;
}
(*feature_counts)[i].push_back(feature_count);
(*feature_start_indices)[i].push_back(start_index);
}
}
}
template <typename InternalType>
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> ColumnsFromInput(const OpInputList &indices_list_in,
const OpInputList &values_list_in,
const OpInputList &shapes_list_in,
const OpInputList &dense_list_in) {
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns;
const int64_t batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
const int64_t number_of_columns = shapes_list_in.size();
std::vector<std::vector<int64_t>> feature_counts(number_of_columns, std::vector<int64_t>());
std::vector<std::vector<int64_t>> feature_start_indices(number_of_columns, std::vector<int64_t>());
ExtractFeatureData(indices_list_in, batch_size, &feature_counts, &feature_start_indices);
columns.reserve(values_list_in.size());
for (int64_t i = 0; i < values_list_in.size(); ++i) {
columns.emplace_back(
new SparseTensorColumn<InternalType>(values_list_in[i], feature_counts[i], feature_start_indices[i]));
}
for (int64_t i = 0; i < dense_list_in.size(); ++i) {
columns.emplace_back(new DenseTensorColumn<InternalType>(dense_list_in[i]));
}
return columns;
}
template <typename InternalType>
int64_t CrossCountByBatchIndex(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
int64_t batch_index) {
int64_t cross_count = 1;
for (size_t i = 0; i < columns.size(); i++) {
const auto feature_count = columns[i]->FeatureCount(batch_index);
if (feature_count == 0) {
return 0;
}
cross_count *= feature_count;
}
return cross_count;
}
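// Computes the per-batch starting offsets of the generated crosses and shapes the outputs:
// indices is [total_cross_count, 2], values is [total_cross_count], and the dense shape output
// holds {batch_size, max_cross_count}.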
template <typename InternalType>
void CreateOutputTensors(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns, int64_t batch_size,
CpuKernelContext *context, Tensor *indices_out, Tensor *values_out, Tensor *shape_out,
std::vector<int64_t> *output_start_indices) {
int64_t cross_count_total = 0;
int64_t max_cross_count = 0;
for (int64_t b = 0; b < batch_size; b++) {
(*output_start_indices)[b] = cross_count_total;
const auto cross_count = CrossCountByBatchIndex<InternalType>(columns, b);
max_cross_count = std::max(max_cross_count, cross_count);
cross_count_total += cross_count;
}
indices_out = context->Output(0);
std::vector<int64_t> indices_t;
int64_t value1 = 1;
int64_t value2 = 2;
indices_t.reserve(value2);
indices_t.push_back(cross_count_total);
indices_t.push_back(value2);
indices_out->GetTensorShape()->SetDimSizes(indices_t);
indices_out->SetDataType(DT_INT64);
values_out = context->Output(value1);
std::vector<int64_t> values_t;
values_t.reserve(value1);
values_t.push_back(cross_count_total);
values_out->GetTensorShape()->SetDimSizes(values_t);
shape_out = context->Output(value2);
std::vector<int64_t> shape_t;
shape_t.reserve(value1);
shape_t.push_back(value2);
auto shape_vec = static_cast<int64_t *>(shape_out->GetData());
shape_vec[0] = batch_size;
shape_vec[1] = max_cross_count;
shape_out->GetTensorShape()->SetDimSizes(shape_t);
}
template <bool HASHED_OUTPUT, typename InternalType>
uint32_t SparseCrossCpuKernel::SparseCrossCompute(CpuKernelContext &ctx) {
auto num_buckets_ptr = ctx.GetAttr("num_buckets");
uint32_t inputSize = ctx.GetInputsSize();
int64_t num_buckets_ = 0;
int64_t num = inputSize / 3;
uint64_t hash_key_ = ctx.GetAttr("hash_key")->GetInt();
auto num_ptr = ctx.GetAttr("N");
if (num_ptr != nullptr) {
num = num_ptr->GetInt();
} else {
if (inputSize % 3 == 0) num = num - 1;
}
if (num_buckets_ptr != nullptr) {
num_buckets_ = num_buckets_ptr->GetInt();
}
uint32_t start1 = 0;
uint32_t stop = num;
OpInputList indices_list_in(&ctx, start1, stop);
start1 = start1 + num;
stop = start1 + num;
OpInputList values_list_in(&ctx, start1, stop);
start1 = start1 + num;
stop = start1 + num;
OpInputList shapes_list_in(&ctx, start1, stop);
start1 = start1 + num;
OpInputList dense_list_in(&ctx, start1, inputSize);
const auto size = indices_list_in.size();
int64_t value = 2;
for (int64_t i = 0; i < size; i++) {
if (indices_list_in[i]->GetTensorShape()->GetDimSize(1) != value) {
KERNEL_LOG_ERROR("Expected D2 of index to be 2 got [%d], at position [%d].",
indices_list_in[i]->GetTensorShape()->GetDimSize(1), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < size; i++) {
if (indices_list_in[i]->GetTensorShape()->GetDimSize(0) != values_list_in[i]->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("Expected size of values to be [%d], but got [%d] at position [%d].",
indices_list_in[i]->GetTensorShape()->GetDimSize(0),
values_list_in[i]->GetTensorShape()->GetDimSize(0), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
for (int64_t i = 0; i < size; i++) {
EigenTensor shapes_list_in_e(shapes_list_in[i], shapes_list_in[i]->GetData());
if (shapes_list_in_e.vec<int64_t>().size() != value) {
KERNEL_LOG_ERROR("Shape input should imply a 2D tensor, but got [%lld] elements at position [%lld].",
static_cast<int64_t>(shapes_list_in_e.vec<int64_t>().size()), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < dense_list_in.size(); ++i) {
if (dense_list_in[i]->GetTensorShape()->GetDimSize(0) != batch_size) {
KERNEL_LOG_ERROR("Expected batch size [%d],got [%d].", batch_size,
dense_list_in[i]->GetTensorShape()->GetDimSize(0));
return KERNEL_STATUS_PARAM_INVALID;
}
}
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns =
ColumnsFromInput<InternalType>(indices_list_in, values_list_in, shapes_list_in, dense_list_in);
typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(columns, num_buckets_, hash_key_);
Tensor *indices_out = ctx.Output(0);
Tensor *values_out = ctx.Output(1);
Tensor *shape_out = ctx.Output(2);
std::vector<int64_t> output_start_indices(batch_size);
CreateOutputTensors(columns, batch_size, &ctx, indices_out, values_out, shape_out, &output_start_indices);
typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(output_start_indices, indices_out, values_out);
for (int64_t b = 0; b < batch_size; b++) {
ProductIterator<InternalType> product_iterator(columns, b);
int64_t cross_count = 0;
while (product_iterator.HasNext()) {
const auto permutation = product_iterator.Next();
updater.Update(b, cross_count, crosser.Generate(b, permutation));
cross_count++;
}
}
return KERNEL_STATUS_OK;
}
uint32_t SparseCrossCpuKernel::Compute(CpuKernelContext &ctx) {
bool hash_out = ctx.GetAttr("hashed_output")->GetBool();
DataType intertype = ctx.GetAttr("internal_type")->GetDataType();
if (!hash_out) {
if (intertype == 0) {
uint32_t res = SparseCrossCompute<false, string>(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
uint32_t res = SparseCrossCompute<true, int64_t>(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseCross, SparseCrossCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,121 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSECROSS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSECROSS_H_
#include <algorithm>
#include <numeric>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace swap {
#define STATIC_INLINE static inline
#define BSWAP_8(x) ((x)&0xff)
#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (BSWAP_64(x))
} // namespace swap
namespace aicpu {
class SparseCrossCpuKernel : public CpuKernel {
public:
SparseCrossCpuKernel() = default;
~SparseCrossCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <bool HASHED_OUTPUT, typename InternalType>
uint32_t SparseCrossCompute(CpuKernelContext &ctx);
int64_t num_buckets_;
uint64_t hash_key_;
};
template <typename ListType, typename ElementType>
class OpArgIterator {
public:
using iterator_category = std::forward_iterator_tag;
using value_type = ElementType;
using pointer = ElementType *;
using const_pointer = const ElementType *;
using reference = ElementType &;
using const_reference = const ElementType &;
using difference_type = ptrdiff_t;
OpArgIterator(const ListType *list, int i) : list_(list), i_(i) {}
bool operator==(const OpArgIterator &rhs) {
if (list_ == rhs.list_) {
return i_ == rhs.i_;
}
return false;
}
bool operator!=(const OpArgIterator &rhs) {
if (list_ == rhs.list_) {
return i_ != rhs.i_;
}
return true;
}
OpArgIterator operator++() { // prefix ++it
++i_;
return *this;
}
OpArgIterator operator++(int) { // postfix it++
OpArgIterator old_value = *this;
++i_;
return old_value;
}
reference operator*() { return (*list_)[i_]; }
pointer operator->() { return &(*list_)[i_]; }
const_reference operator*() const { return (*list_)[i_]; }
const_pointer operator->() const { return &(*list_)[i_]; }
private:
const ListType *const list_;
int i_;
};
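// Lightweight view over the contiguous input range [start, stop) of a kernel context, giving
// indexed access and iteration over the tensors without copying them.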
class OpInputList {
public:
using Iterator = OpArgIterator<OpInputList, const Tensor>;
OpInputList() : ctx_(nullptr), start_(0), stop_(0) {}
OpInputList(CpuKernelContext *ctx, uint32_t start, uint32_t stop) : ctx_(ctx), start_(start), stop_(stop) {}
OpInputList &operator=(const OpInputList &other) = default;
OpInputList(const OpInputList &other) = default;
Tensor *operator[](uint32_t i) const { return ctx_->Input(start_ + i); }
uint32_t size() const { return stop_ - start_; }
Iterator begin() const { return Iterator(this, 0); }
Iterator end() const { return Iterator(this, size()); }
private:
CpuKernelContext *ctx_; // not owned
uint32_t start_;
uint32_t stop_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,180 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_mean_with_num_segments.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentMeanWithNumSegments = "SparseSegmentMeanWithNumSegments";
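// Dispatches ComputeKernel<x_type, indices_type, segment_ids_type, num_segments_type>, where each
// of the three index-like inputs may independently be int32 or int64.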
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, DTYPE_3, CTX) \
case (DTYPE): \
if ((DTYPE_1) == DT_INT32) { \
if ((DTYPE_2) == DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int32_t, int32_t>(CTX); \
} else if ((DTYPE_2) == DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int32_t, int64_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int64_t, int32_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int64_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_2) == DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int32_t, int32_t>(CTX); \
} else if ((DTYPE_2) == DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int32_t, int64_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int64_t, int32_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int64_t, int64_t>(CTX); \
} \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentMeanWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentMeanWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
if (x->GetDataSize() == 0 || indices->GetDataSize() == 0 || segment_ids->GetDataSize() == 0 ||
num_segments->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank is less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensors indices and segment_ids have different numbers of elements.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(segment_ids_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (num_segments_data_type != DT_INT32 && num_segments_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(num_segments_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
default:
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseSegmentMeanWithNumSegments, SparseSegmentMeanWithNumSegmentsCpuKernel);
template <typename T1, typename T2, typename T3, typename T4>
uint32_t SparseSegmentMeanWithNumSegmentsCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
int64_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int64_t m = ctx.Input(2)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<T4 *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (int64_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < ctx.Output(0)->GetTensorShape()->NumElements(); i++) {
y_ptr[i] = (T1)0;
}
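// Accumulate the rows of x selected by indices into their segment's output row; whenever the
// segment id changes, divide the finished row by its element count so each output row ends up
// as the mean of its members.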
int oldindex = -1;
int countnum = 0;
for (int64_t i = 0; i < m; i++) {
if (oldindex == segment_ids_ptr[i]) {
countnum++;
} else if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= static_cast<T1>(countnum);
}
countnum = 1;
oldindex = segment_ids_ptr[i];
} else {
countnum = 1;
oldindex = segment_ids_ptr[i];
}
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= static_cast<T1>(countnum);
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_MEAN_WITH_NUM_SEGMENTS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_MEAN_WITH_NUM_SEGMENTS_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentMeanWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentMeanWithNumSegmentsCpuKernel() = default;
~SparseSegmentMeanWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2, typename T3, typename T4>
static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,282 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_slice.h"
#include <unistd.h>
#include <complex>
#include <string>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 5;
const char *kSparseSlice = "SparseSlice";
} // namespace
namespace aicpu {
template <typename T>
using ArraySlice = std::vector<T>;
template <typename T>
void Slice(Tensor *output_indices, Tensor *output_values, Tensor *output_dense_shape, SparseTensor *input_tensor,
const aicpu::ArraySlice<int64_t> &start, const aicpu::ArraySlice<int64_t> &size) {
auto output_shape = CpuKernelUtils::CreateTensorShape();
output_shape->SetDimSizes(input_tensor->shape());
const int dims = input_tensor->dims();
std::vector<int64_t> dimsVec(dims, 0);
for (int dim = 0; dim < dims; dim++) {
// Determine the size of the result; if the selected slice goes beyond the
// input boundary, the result will correspond to the size of the overlap
// between the input and the selected slice.
const auto input_size = output_shape->GetDimSize(dim);
const int64_t start_index = start[dim];
const int64_t slice_size = size[dim];
if (start_index + slice_size < input_size) {
dimsVec[dim] = slice_size;
} else if (start_index < input_size) {
dimsVec[dim] = input_size - start_index;
} else {
dimsVec[dim] = 0;
}
}
output_shape->SetDimSizes(dimsVec);
auto input_indices_t = input_tensor->indices().get()->matrix<int64_t>();
auto input_values_t = input_tensor->values().get()->vec<T>();
// Find the number of indices that fall inside start and size.
int count = 0;
int dim_size = input_tensor->indices()->GetTensor()->GetTensorShape()->GetDimSize(0);
for (int i = 0; i < dim_size; i++) {
// The following will check to see if an input is within the
// range specified by start and size.
// The for loop below iterates through all dimensions. In case
// the index falls outside of the start and size at any dimension,
// it will be considered as a "no hit" (hit = false). In this
// case, it will not be counted as the index that fall inside
// the range specified by start and size.
bool hit = true;
for (int dim = 0; dim < dims; dim++) {
if (!(start[dim] <= input_indices_t(i, dim) && input_indices_t(i, dim) < start[dim] + size[dim])) {
hit = false;
break;
}
}
if (!hit) {
continue;
}
count++;
}
auto eigen_tensor_indices = EigenTensor(output_indices, output_indices->GetData());
auto eigen_tensor_values = EigenTensor(output_values, output_values->GetData());
auto eigen_tensor_shape = EigenTensor(output_dense_shape, output_dense_shape->GetData());
auto output_values_t = eigen_tensor_values.vec<T>();
auto output_indices_t = eigen_tensor_indices.matrix<int64_t>();
auto output_shape_t = eigen_tensor_shape.vec<int64_t>();
// Obtain the output indices that fall inside start and size.
for (int dim = 0; dim < output_dense_shape->NumElements(); ++dim) {
const auto input_size = output_shape->GetDimSize(dim);
output_shape_t(dim) = input_size;
}
int index = 0;
for (int i = 0; i < dim_size && index < count; i++) {
// The logic here is similar as the above except that the above
// only count the number of indices while here we actually generate
// the output.
bool hit = true;
for (int dim = 0; dim < dims; dim++) {
if (!(start[dim] <= input_indices_t(i, dim) && input_indices_t(i, dim) < start[dim] + size[dim])) {
hit = false;
break;
}
}
if (!hit) {
continue;
}
output_values_t(index) = input_values_t(i);
for (int64_t dim = 0; dim < dims; dim++) {
output_indices_t(index, dim) = input_indices_t(i, dim) - start[dim];
}
index++;
}
const int num_dims = dims;
const int64_t y_nnz = index;
std::vector<int64_t> indices_dims = {y_nnz, num_dims};
auto output_indices_shape = output_indices->GetTensorShape();
output_indices_shape->SetDimSizes(indices_dims);
output_indices->SetTensorShape(output_indices_shape.get());
std::vector<int64_t> values_dims = {y_nnz};
auto output_values_shape = output_values->GetTensorShape();
output_values_shape->SetDimSizes(values_dims);
output_values->SetTensorShape(output_values_shape.get());
}
std::uint32_t SparseSliceCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *indices = ctx.Input(0);
Tensor *values = ctx.Input(1);
Tensor *shape = ctx.Input(2);
Tensor *start = ctx.Input(3);
Tensor *size = ctx.Input(4);
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "sparseslice check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseSliceParamCheck(indices, values, shape, start, size), "sparseslice check params failed.");
const int input_dims = shape->NumElements();
auto shape_shape = shape->GetTensorShape();
std::vector<int64_t> dense_shape;
std::vector<int64_t> order;
int64_t output_size = 1;
for (int32_t index = 0; index < shape_shape->GetDimSize(0); ++index) {
if (shape->GetDataType() == DT_INT32) {
int32_t *temp_dim = static_cast<int32_t *>(shape->GetData());
dense_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
} else {
int64_t *temp_dim = static_cast<int64_t *>(shape->GetData());
dense_shape.emplace_back(temp_dim[index]);
}
order.push_back(dense_shape[index]);
output_size *= dense_shape[index];
}
std::iota(order.begin(), order.end(), 0);
SparseTensor st;
if (st.CreateSparseTensor(indices, values, dense_shape, order) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Create sparse tensor failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
aicpu::ArraySlice<int64_t> slice_start(input_dims, 0);
aicpu::ArraySlice<int64_t> slice_size(input_dims, 0);
auto start_val = static_cast<int64_t *>(start->GetData());
auto size_val = static_cast<int64_t *>(size->GetData());
for (int64_t i = 0; i < input_dims; i++) {
slice_start[i] = *(start_val + i);
}
for (int64_t i = 0; i < input_dims; i++) {
slice_size[i] = *(size_val + i);
}
Tensor *output_indices = ctx.Output(0);
Tensor *output_values = ctx.Output(1);
Tensor *output_dense_shape = ctx.Output(2);
DataType values_data_type = ctx.Input(1)->GetDataType();
KERNEL_LOG_DEBUG("%s op input[a] data type is [%s].", kSparseSlice, DTypeStr(values_data_type).c_str());
switch (values_data_type) {
case DT_INT64:
Slice<int64_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT32:
Slice<int32_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_UINT16:
Slice<uint16_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT16:
Slice<int16_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_UINT8:
Slice<uint8_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT8:
Slice<int8_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_FLOAT16:
Slice<Eigen::half>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_FLOAT:
Slice<float>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_DOUBLE:
Slice<double>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_COMPLEX64:
Slice<std::complex<float>>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_COMPLEX128:
Slice<std::complex<double>>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_BOOL:
Slice<bool>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_STRING:
Slice<std::string>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
default:
KERNEL_LOG_ERROR("SparseSlice kernel data type [%s] not support.", DTypeStr(values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSliceCpuKernel::SparseSliceParamCheck(Tensor *indices, Tensor *values, Tensor *shape, Tensor *start,
Tensor *size) {
auto indices_shape = indices->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(indices_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input indices must be a matrix.");
auto values_shape = values->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(values_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input values must be a vector.");
auto shape_shape = shape->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(shape_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input shape must be a vector.");
auto start_shape = start->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(start_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input start must be a vector.");
auto size_shape = size->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(size_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, "Input size must be a vector.");
const int input_dims = shape->NumElements();
KERNEL_CHECK_FALSE((input_dims == start->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected start to be a vector of length [%d], but got length [%lld].", input_dims,
start->NumElements());
KERNEL_CHECK_FALSE((input_dims == size->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected size to be a vector of length [%d], but got length [%lld].", input_dims,
size->NumElements());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSlice, SparseSliceCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,24 @@
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_H_
#include <string>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_tensor.h"
#include "cpu_tensor_shape.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSliceCpuKernel : public CpuKernel {
public:
SparseSliceCpuKernel() = default;
~SparseSliceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t SparseSliceParamCheck(Tensor *indices, Tensor *values, Tensor *shape, Tensor *start, Tensor *size);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,145 @@
#include "sparse_slice_grad.h"
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *kSparseSliceGrad = "SparseSliceGrad";
} // namespace
namespace aicpu {
uint32_t SparseSliceGradCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *backprop_val_grad = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *start = ctx.Input(2);
Tensor *new_indices = ctx.Input(3);
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "sparseslicegrad check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseSliceGradParamCheck(backprop_val_grad, indices, start, new_indices),
"sparseslicegrad check params failed.");
DataType input0_type = ctx.Input(0)->GetDataType();
KERNEL_LOG_DEBUG("%s op input[a] data type is [%s].", kSparseSliceGrad, DTypeStr(input0_type).c_str());
switch (input0_type) {
case DT_INT8:
GradCompute<int8_t>(ctx);
break;
case DT_UINT8:
GradCompute<uint8_t>(ctx);
break;
case DT_INT16:
GradCompute<int16_t>(ctx);
break;
case DT_UINT16:
GradCompute<uint16_t>(ctx);
break;
case DT_INT32:
GradCompute<int32_t>(ctx);
break;
case DT_INT64:
GradCompute<int64_t>(ctx);
break;
case DT_FLOAT:
GradCompute<float>(ctx);
break;
case DT_FLOAT16:
GradCompute<Eigen::half>(ctx);
break;
case DT_DOUBLE:
GradCompute<double>(ctx);
break;
case DT_COMPLEX64:
GradCompute<std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
GradCompute<std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("SparseSliceGrad kernel data type [%s] not support.", DTypeStr(input0_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSliceGradCpuKernel::GradCompute(CpuKernelContext &ctx) {
Tensor *backprop_val_grad = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *start = ctx.Input(2);
Tensor *new_indices = ctx.Input(3);
auto indices_shape = indices->GetTensorShape();
const int64_t input_nnz = indices_shape->GetDimSize(0);
Tensor *y_grad = ctx.Output(0);
auto *y_grad_vec = static_cast<T *>(y_grad->GetData());
memset(y_grad_vec, 0, sizeof(T) * input_nnz);
auto *backprop_val_grad_vec = static_cast<T *>(backprop_val_grad->GetData());
const auto indices_mat = (EigenTensor(indices, indices->GetData())).matrix<int64_t>();
const auto new_indices_mat = (EigenTensor(new_indices, new_indices->GetData())).matrix<int64_t>();
EigenTensor start_ET(start, start->GetData());
const auto start_flat = start_ET.flat<int64_t>();
int64_t j = 0;
const int num_dims = indices_shape->GetDimSize(1);
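// Both index lists are ordered, so a single forward pass suffices: advance j through new_indices
// and copy its gradient whenever the current input index equals new_indices[j] + start; input
// positions that were sliced away keep their zero gradient.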
for (int64_t i = 0; i < input_nnz && j < backprop_val_grad->NumElements(); ++i) {
bool is_same = true;
for (int d = 0; d < num_dims; ++d) {
const int64_t indices_value = indices_mat(i, d);
const int64_t new_indices_value = new_indices_mat(j, d);
const int64_t offset = start_flat(d);
if (indices_value != new_indices_value + offset) {
is_same = false;
break;
}
}
if (is_same) {
y_grad_vec[i] = *(backprop_val_grad_vec + j);
++j;
}
}
KERNEL_CHECK_FALSE((backprop_val_grad->NumElements() == j), KERNEL_STATUS_PARAM_INVALID,
"Elements of backprop_val_grad aren't all propagated. "
"Num elements: [%lld], used: [%lld].",
backprop_val_grad->NumElements(), j);
return KERNEL_STATUS_OK;
}
uint32_t SparseSliceGradCpuKernel::SparseSliceGradParamCheck(Tensor *backprop_val_grad, Tensor *indices, Tensor *start,
Tensor *new_indices) {
KERNEL_CHECK_FALSE((IsVector(backprop_val_grad->GetTensorShape()->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input backprop_val_grad should be a vector but received shape: [%s].",
VectorToString(backprop_val_grad->GetTensorShape()->GetDimSizes()).c_str());
KERNEL_CHECK_FALSE(
(IsMatrix(indices->GetTensorShape()->GetDimSizes()) && IsMatrix(new_indices->GetTensorShape()->GetDimSizes())),
KERNEL_STATUS_PARAM_INVALID,
"Input and output indices should be matrices, but "
"received shapes: [%s] and [%s].",
VectorToString(indices->GetTensorShape()->GetDimSizes()).c_str(),
VectorToString(new_indices->GetTensorShape()->GetDimSizes()).c_str());
auto indices_shape = indices->GetTensorShape();
auto new_indices_shape = new_indices->GetTensorShape();
KERNEL_CHECK_FALSE((indices_shape->GetDimSize(1) == new_indices_shape->GetDimSize(1)), KERNEL_STATUS_PARAM_INVALID,
"The input and output indices should have the same ndims, got: [%lld] and [%lld].",
indices_shape->GetDimSize(1), new_indices_shape->GetDimSize(1));
KERNEL_CHECK_FALSE((new_indices_shape->GetDimSize(0) <= indices_shape->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"The number of rows of output_indices should not be greater than that of input_indices, "
"got: [%lld] and [%lld].",
new_indices_shape->GetDimSize(0), indices_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((backprop_val_grad->NumElements() == new_indices_shape->GetDimSize(0)),
KERNEL_STATUS_PARAM_INVALID,
"The number of elements of backprop_val_grad and the number of rows of new_indices should match "
"(#nnz of sum), got: [%lld] and [%lld].",
backprop_val_grad->NumElements(), new_indices_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((IsVector(start->GetTensorShape()->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"The start should be a vector but received shape [%s].",
VectorToString(start->GetTensorShape()->GetDimSizes()).c_str());
const int num_dims = indices_shape->GetDimSize(1);
KERNEL_CHECK_FALSE((num_dims == start->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected start must be a vector of length [%d] but got length [%d].", num_dims,
start->NumElements());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSliceGrad, SparseSliceGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_GRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseSliceGradCpuKernel : public CpuKernel {
public:
SparseSliceGradCpuKernel() = default;
~SparseSliceGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t SparseSliceGradParamCheck(Tensor *backprop_val_grad, Tensor *indices, Tensor *start, Tensor *new_indices);
template <typename T>
uint32_t GradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,155 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_softmax.h"
#include <securec.h>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kSparseSoftmaxInputNum = 3;
const uint32_t kSparseSoftmaxOutputNum = 1;
const uint32_t kIndex0 = 0;
const uint32_t kIndex1 = 1;
const uint32_t kIndex2 = 2;
const uint32_t kSize1 = 1;
const uint32_t kSize2 = 2;
const char *kSparseSoftmax = "SparseSoftmax";
#define SPARSESOFTMAX_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SparseSoftmaxCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseSoft kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SparseSoftmaxCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSparseSoftmaxInputNum, kSparseSoftmaxOutputNum),
"[%s] check input and output failed.", kSparseSoftmax);
// parse params
KERNEL_HANDLE_ERROR(SparseSoftmaxCheck(ctx), "[%s] check params failed.", kSparseSoftmax);
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
SPARSESOFTMAX_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSESOFTMAX_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SparseSoftmax kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSoftmaxCpuKernel::SparseSoftmaxCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_indices = ctx.Input(kIndex0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_values = ctx.Input(kIndex1)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_shape = ctx.Input(kIndex2)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_output = ctx.Output(kIndex0)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_indices.size() == kSize2), KERNEL_STATUS_PARAM_INVALID,
"Indices must be rank 2, got rank [%zu].", shape_indices.size())
KERNEL_CHECK_FALSE((shape_values.size() == kSize1), KERNEL_STATUS_PARAM_INVALID, "Values must be rank 1, got rank [%zu].",
shape_values.size())
KERNEL_CHECK_FALSE((shape_shape.size() == kSize1), KERNEL_STATUS_PARAM_INVALID, "Shape must be rank 1, got rank [%zu].",
shape_shape.size())
KERNEL_CHECK_FALSE((ctx.Input(kIndex2)->GetTensorShape()->NumElements() >= kSize2), KERNEL_STATUS_PARAM_INVALID,
"Shape must have at least 2 elements, got [%lld].", ctx.Input(kIndex2)->GetTensorShape()->NumElements())
KERNEL_CHECK_FALSE((shape_values.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
"The input shape size should be same as the output shape size")
const int64_t nnz = shape_indices[0];
const int64_t data_num = ctx.Input(kIndex1)->NumElements();
KERNEL_CHECK_FALSE((nnz == data_num), KERNEL_STATUS_PARAM_INVALID,
"The values number should be same as the indices_size(0)");
auto data_type_indices = ctx.Input(kIndex0)->GetDataType();
auto data_type_shape = ctx.Input(kIndex2)->GetDataType();
KERNEL_CHECK_FALSE((data_type_indices == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"data type of indices should be int64");
KERNEL_CHECK_FALSE((data_type_shape == DT_INT64), KERNEL_STATUS_PARAM_INVALID, "data type of shape should be int64");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSoftmaxCpuKernel::SparseSoftmaxCompute(CpuKernelContext &ctx) {
int64_t data_num = ctx.Input(kIndex1)->NumElements();
auto *indices_t = ctx.Input(kIndex0);
auto *values_t = ctx.Input(kIndex1);
auto *shape_t = ctx.Input(kIndex2);
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_indices = ctx.Input(0)->GetTensorShape()->GetDimSizes();
const int64_t nnz = shape_indices[0];
const int64_t rank = static_cast<int64_t>(shape_indices[1]);
SparseTensor st;
std::vector<int64_t> order;
std::vector<int64_t> shape_flat;
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_t->GetData());
for (int32_t index = 0; index < shape_t->GetTensorShape()->GetDimSize(0); ++index) {
shape_flat.emplace_back(temp_dim[index]);
order.push_back(shape_flat[index]);
}
std::iota(order.begin(), order.end(), 0);
if (st.CreateSparseTensor(indices_t, values_t, shape_flat, order) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Create sparse tensor failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
Eigen::Tensor<T, 1, Eigen::RowMajor> output_flat(nnz);
// { 0, ..., rank-1 }.
std::vector<int64_t> kReorderDims(rank);
std::iota(kReorderDims.begin(), kReorderDims.end(), 0);
// All but the last dim -- the class dimension to be max-reduced along.
std::vector<int64_t> kGroupByDims(rank - 1);
std::iota(kGroupByDims.begin(), kGroupByDims.end(), 0);
st.Reorder<T>(kReorderDims);
int64_t count = 0;
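// For every group sharing the leading rank-1 index coordinates, apply a numerically stable
// softmax over its values: subtract the group's max, exponentiate, and normalize by the sum.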
for (const auto &g : st.group(kGroupByDims)) {
const auto group_vals = g.values<T>();
const int group_size = group_vals.size();
Eigen::Tensor<T, 0, Eigen::RowMajor> tmp_scalar;
tmp_scalar = group_vals.maximum();
Eigen::Tensor<T, 1, Eigen::RowMajor> tmp(group_size);
tmp = (group_vals - tmp.constant(tmp_scalar())).exp();
tmp_scalar = tmp.sum().inverse();
tmp = tmp * tmp.constant(tmp_scalar());
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> output_part(output_flat.data() + count, group_size);
output_part = tmp;
count += group_size;
}
for (int64_t index = 0; index < data_num; ++index) {
output_data[index] = static_cast<T>(output_flat[index]);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSoftmax, SparseSoftmaxCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSESOFTMAX_H_
#define AICPU_KERNELS_NORMALIZED_SPARSESOFTMAX_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSoftmaxCpuKernel : public CpuKernel {
public:
SparseSoftmaxCpuKernel() = default;
~SparseSoftmaxCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SparseSoftmaxCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseSoftmaxCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,226 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_tensor_dense_add.h"
#include <float.h>
#include <securec.h>
#include <complex>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "iostream"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseTensorDenseAdd = "SparseTensorDenseAdd";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 4;
// when input data size is more than kParallelDataNum, use Parallel func
constexpr uint64_t kParallelDataNums = 256 * 1024;
} // namespace
namespace aicpu {
uint32_t SparseTensorDenseAddCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Check SparseTensorDenseAdd params failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t res = ValidateInputs(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
DataType data_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
switch (data_type) {
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename T>
uint32_t SparseTensorDenseAddCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *a_indices = ctx.Input(0);
Tensor *a_values = ctx.Input(1);
Tensor *b = ctx.Input(3);
Tensor *out = ctx.Output(0);
const int NDIMS = static_cast<int>(a_indices->GetTensorShape()->GetDimSize(1));
auto b_data = reinterpret_cast<T *>(b->GetData());
auto out_data = reinterpret_cast<T *>(out->GetData());
auto value_data = reinterpret_cast<T *>(a_values->GetData());
const auto ix_ = std::make_shared<EigenTensor>(a_indices, a_indices->GetData());
DataType dt = static_cast<DataType>(a_indices->GetDataType());
uint32_t data_num = out->NumElements();
if (data_num <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
out_data[i] = b_data[i];
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_sparsetensordenseadd = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out_data[i] = b_data[i];
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_sparsetensordenseadd);
}
const int num_nnz = static_cast<int>(a_indices->GetTensorShape()->GetDimSize(0));
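// Row-major strides of the dense input b, used to flatten each sparse (multi-dimensional) index
// into a linear offset into out_data.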
std::vector<int64_t> strides(NDIMS);
if (NDIMS > 0) {
strides[NDIMS - 1] = 1;
}
for (int d = NDIMS - 2; d >= 0; --d) {
const int64_t dimsize = b->GetTensorShape()->GetDimSize(d + 1);
strides[d] = strides[d + 1] * dimsize;
}
for (int i = 0; i < num_nnz; ++i) {
int64_t ix = 0;
for (int d = 0; d < NDIMS; ++d) {
int64_t ix_i_d = 0;
if (dt == DT_INT32) {
auto a_indices_mat = ix_->matrix<int32_t>();
ix_i_d = a_indices_mat(i, d);
} else {
auto a_indices_mat = ix_->matrix<int64_t>();
ix_i_d = a_indices_mat(i, d);
}
ix += strides[d] * ix_i_d;
}
out_data[ix] += value_data[i];
}
return KERNEL_STATUS_OK;
}
uint32_t SparseTensorDenseAddCpuKernel::ValidateInputs(CpuKernelContext &ctx) {
Tensor *a_indices_t = ctx.Input(0);
Tensor *a_values_t = ctx.Input(1);
Tensor *a_shape_t = ctx.Input(2);
Tensor *b_t = ctx.Input(3);
Tensor *out_t = ctx.Output(0);
const int a_indices_shape_dims = 2;
DataType input0_dt = a_values_t->GetDataType();
DataType input1_dt = b_t->GetDataType();
DataType input2_dt = out_t->GetDataType();
const int ndims = static_cast<int>(a_indices_t->GetTensorShape()->GetDimSize(1));
const int min_ndims = 1;
const int max_ndims = 5;
if (ndims < min_ndims || ndims > max_ndims) {
KERNEL_LOG_ERROR("Only tensors with ranks between 1 and 5 are currently supported. Tensor rank: [%d]", ndims);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
if (input0_dt != input1_dt || input1_dt != input2_dt) {
KERNEL_LOG_ERROR("x1_values data type[%s], x2 data type[%s] and y data type[%s] must be same.",
DTypeStr(input0_dt).c_str(), DTypeStr(input1_dt).c_str(), DTypeStr(input2_dt).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
int32_t IndiceType = a_indices_t->GetDataType();
int32_t ShapeType = a_shape_t->GetDataType();
bool invalid_indice_type = (IndiceType != DT_INT64 && IndiceType != DT_INT32);
bool invalid_shape_type = (ShapeType != DT_INT64 && ShapeType != DT_INT32);
if (invalid_shape_type || invalid_indice_type) {
KERNEL_LOG_ERROR(
"Indices or shape data type is invalid; it must be int32 or int64, indiceType [%d], shapeType "
"[%d].",
IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
if (IndiceType != ShapeType) {
KERNEL_LOG_ERROR(
"Indice type and shape type should be same, indiceType [%d], shapeType "
"[%d].",
IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data shape
if (a_indices_t->GetTensorShape()->GetDims() != a_indices_shape_dims) {
KERNEL_LOG_ERROR("Input a_indices should be a matrix but get dim size: [%d].",
a_indices_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (a_values_t->GetTensorShape()->GetDims() != 1 || a_shape_t->GetTensorShape()->GetDims() != 1) {
KERNEL_LOG_ERROR(
"Inputs a_values and a_shape should be vectors but received shapes: "
"[%d] and [%d]",
a_values_t->GetTensorShape()->GetDims(), a_shape_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (a_shape_t->NumElements() != b_t->GetTensorShape()->GetDims() ||
out_t->GetTensorShape()->GetDims() != b_t->GetTensorShape()->GetDims()) {
KERNEL_LOG_ERROR(
"Three operands have different ranks; received: [%lld] , [%lld] and "
"[%lld]",
a_shape_t->NumElements(), b_t->GetTensorShape()->GetDims(), out_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
std::shared_ptr<EigenTensor> a_shape_ = std::make_shared<EigenTensor>(a_shape_t, a_shape_t->GetData());
if (IndiceType == DT_INT32) {
auto a_shape_flat = a_shape_->vec<int32_t>();
for (int i = 0; i < b_t->GetTensorShape()->GetDims(); i++) {
if (out_t->GetTensorShape()->GetDimSize(i) != b_t->GetTensorShape()->GetDimSize(i) ||
a_shape_flat(i) != b_t->GetTensorShape()->GetDimSize(i)) {
KERNEL_LOG_ERROR(
"Dimension [%d] does not equal (no broadcasting is supported): y "
"side [%lld] vs x2 shape side [%lld] vs x1 shape side [%lld]",
i, out_t->GetTensorShape()->GetDimSize(i), b_t->GetTensorShape()->GetDimSize(i), a_shape_flat(i));
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
auto a_shape_flat = a_shape_->vec<int64_t>();
for (int i = 0; i < b_t->GetTensorShape()->GetDims(); i++) {
if (out_t->GetTensorShape()->GetDimSize(i) != b_t->GetTensorShape()->GetDimSize(i) ||
a_shape_flat(i) != b_t->GetTensorShape()->GetDimSize(i)) {
KERNEL_LOG_ERROR(
"Dimension [%d] does not equal (no broadcasting is supported): y "
"side [%lld] vs x2 shape side [%lld] vs x1 shape side [%lld]",
i, out_t->GetTensorShape()->GetDimSize(i), b_t->GetTensorShape()->GetDimSize(i), a_shape_flat(i));
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseTensorDenseAdd, SparseTensorDenseAddCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_ADD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_ADD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorDenseAddCpuKernel : public CpuKernel {
private:
/* data */
public:
SparseTensorDenseAddCpuKernel() = default;
~SparseTensorDenseAddCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t ValidateInputs(CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,225 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <iostream>
#include <type_traits>
#include <vector>
#include "cpu_kernel_utils.h"
#include "sparse_tensor_dense_mat_mul.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
#define COL_SHED (1024 << 1)
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 4;
const char *kSparseTensorDenseMatMul = "SparseTensorDenseMatMul";
} // namespace
namespace aicpu {
uint32_t SparseTensorDenseMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseTensorDenseMatMul check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseTensorDenseMatMulCheck(ctx), "SparseTensorDenseMatMul check params failed.");
DataType sparse_data_type = ctx.Input(1)->GetDataType();
DataType indice_data_type = ctx.Input(0)->GetDataType();
DataType dense_data_type = ctx.Input(3)->GetDataType();
DataType y_data_type = ctx.Output(0)->GetDataType();
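// Pick the regular_calculate instantiation matching the (sparse values, indices, dense, output)
// dtype combination; pairing a real operand with a complex one promotes the result to the
// complex type.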
if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT &&
y_data_type == DT_FLOAT)
regular_calculate<float, int64_t, float, float>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<float, int64_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT &&
y_data_type == DT_FLOAT)
regular_calculate<float, int32_t, float, float>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<float, int32_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT64 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_DOUBLE)
regular_calculate<double, int64_t, double, double>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<double, int64_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT32 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_DOUBLE)
regular_calculate<double, int32_t, double, double>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<double, int32_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_INT64 && indice_data_type == DT_INT64 && dense_data_type == DT_INT64 &&
y_data_type == DT_INT64)
regular_calculate<int64_t, int64_t, int64_t, int64_t>(ctx);
else if (sparse_data_type == DT_INT64 && indice_data_type == DT_INT32 && dense_data_type == DT_INT64 &&
y_data_type == DT_INT64)
regular_calculate<int64_t, int32_t, int64_t, int64_t>(ctx);
else if (sparse_data_type == DT_INT32 && indice_data_type == DT_INT64 && dense_data_type == DT_INT32 &&
y_data_type == DT_INT32)
regular_calculate<int32_t, int64_t, int32_t, int32_t>(ctx);
else if (sparse_data_type == DT_INT32 && indice_data_type == DT_INT32 && dense_data_type == DT_INT32 &&
y_data_type == DT_INT32)
regular_calculate<int32_t, int32_t, int32_t, int32_t>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int64_t, float, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int64_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int32_t, float, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int32_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT64 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int64_t, double, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int64_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT32 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int32_t, double, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int32_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_FLOAT16 && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT16 &&
y_data_type == DT_FLOAT16)
regular_calculate<Eigen::half, int64_t, Eigen::half, Eigen::half>(ctx);
else if (sparse_data_type == DT_FLOAT16 && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT16 &&
y_data_type == DT_FLOAT16)
regular_calculate<Eigen::half, int32_t, Eigen::half, Eigen::half>(ctx);
else {
KERNEL_LOG_ERROR(
"sparse_tensor_dense_mat_mul kernel wrong datatype."
"sparse_data_type [%s],"
"indices_data_type [%s],"
"dense_data_type [%s],"
"y_data_type [%s].",
DTypeStr(sparse_data_type).c_str(), DTypeStr(indice_data_type).c_str(), DTypeStr(dense_data_type).c_str(),
DTypeStr(y_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class SparseType, class IndicesType, class DenseType, class OutputType>
uint32_t SparseTensorDenseMatMulCpuKernel::regular_calculate(CpuKernelContext &ctx) {
Tensor *x1_indices = ctx.Input(0);
Tensor *x1_values = ctx.Input(1);
Tensor *x1_shape = ctx.Input(2);
Tensor *x2 = ctx.Input(3);
Tensor *y = ctx.Output(0);
auto x1_indices_shape = x1_indices->GetTensorShape();
auto x2_shape = x2->GetTensorShape();
auto y_shape = y->GetTensorShape();
int64_t *x1_shape_data = (int64_t *)x1_shape->GetData();
uint64_t x1_row = x1_shape_data[0];
uint64_t x1_col = x1_shape_data[1];
uint64_t x2_row = x2_shape->GetDimSize(0);
uint64_t x2_col = x2_shape->GetDimSize(1);
AttrValue *adjoint_a = ctx.GetAttr("adjoint_a");
AttrValue *adjoint_b = ctx.GetAttr("adjoint_b");
SparseType *x1_values_data = (SparseType *)x1_values->GetData();
DenseType *x2_data = (DenseType *)x2->GetData();
OutputType *y_data = (OutputType *)y->GetData();
uint64_t y_data_len = y->NumElements();
for (uint64_t i = 0; i < y_data_len; i++) {
y_data[i] = static_cast<OutputType>(0);
}
if (adjoint_a->GetBool()) {
swap(x1_row, x1_col);
}
if (adjoint_b->GetBool()) {
swap(x2_row, x2_col);
}
uint64_t pairs = x1_indices_shape->GetDimSize(0);
IndicesType *x1_indices_data = (IndicesType *)x1_indices->GetData();
for (uint64_t i = 0; i < pairs; i++) {
uint64_t row = x1_indices_data[i << 1], col = x1_indices_data[1 + (i << 1)];
SparseType a = x1_values_data[i];
if (adjoint_a->GetBool()) {
swap(row, col);
}
KERNEL_CHECK_FALSE(row >= 0 && row < x1_row && col >= 0 && col < x1_col, KERNEL_STATUS_PARAM_INVALID,
"sparse size invalid.")
if (x2_col < COL_SHED) {
for (uint64_t j = 0; j < x2_col; j++) {
uint64_t idx = adjoint_b->GetBool() ? (j * x2_row + col) : (col * x2_col + j);
DenseType b = x2_data[idx];
if constexpr (std::is_same<DenseType, complex<double>>::value || std::is_same<DenseType, complex<float>>::value)
if (adjoint_b->GetBool()) {
b = conj(b);
}
y_data[row * x2_col + j] += a * b;
}
continue;
}
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, x2_col);
auto fun = [&](size_t s, size_t t) {
for (uint64_t j = s; j < t; j++) {
uint64_t idx = adjoint_b->GetBool() ? (j * x2_row + col) : (col * x2_col + j);
DenseType b = x2_data[idx];
if constexpr (std::is_same<DenseType, complex<double>>::value || std::is_same<DenseType, complex<float>>::value)
if (adjoint_b->GetBool()) {
b = conj(b);
}
y_data[row * x2_col + j] += a * b;
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x2_col, x2_col / max_core, fun),
"SparseTensorDenseMatMul Compute failed.");
}
return KERNEL_STATUS_OK;
}
uint32_t SparseTensorDenseMatMulCpuKernel::SparseTensorDenseMatMulCheck(CpuKernelContext &ctx) {
Tensor *x1_indices = ctx.Input(0);
Tensor *x1_values = ctx.Input(1);
Tensor *x1_shape = ctx.Input(2);
Tensor *x2 = ctx.Input(3);
Tensor *y = ctx.Output(0);
AttrValue *adjoint_a = ctx.GetAttr("adjoint_a"), *adjoint_b = ctx.GetAttr("adjoint_b");
KERNEL_CHECK_NULLPTR(x1_indices, KERNEL_STATUS_PARAM_INVALID, "Get input 0 failed.")
KERNEL_CHECK_NULLPTR(x1_values, KERNEL_STATUS_PARAM_INVALID, "Get input 1 failed.")
KERNEL_CHECK_NULLPTR(x1_shape, KERNEL_STATUS_PARAM_INVALID, "Get input 2 failed.")
KERNEL_CHECK_NULLPTR(x2, KERNEL_STATUS_PARAM_INVALID, "Get input 3 failed.")
KERNEL_CHECK_NULLPTR(y, KERNEL_STATUS_PARAM_INVALID, "Get output 0 failed.")
KERNEL_CHECK_NULLPTR(adjoint_a, KERNEL_STATUS_PARAM_INVALID, "Get attribute adjoint_a failed.")
KERNEL_CHECK_NULLPTR(adjoint_b, KERNEL_STATUS_PARAM_INVALID, "Get attribute adjoint_b failed.")
KERNEL_CHECK_FALSE(x1_indices->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_FALSE(x1_values->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
  KERNEL_CHECK_FALSE(x1_shape->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
KERNEL_CHECK_FALSE(x2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 3 data failed.")
KERNEL_CHECK_FALSE(y->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
KERNEL_CHECK_FALSE(x1_shape->GetDataType() == DT_INT64, KERNEL_STATUS_PARAM_INVALID, "x1_shape must be DT_INT64")
KERNEL_CHECK_FALSE(x1_shape->GetTensorShape()->GetDims() == 1 && x1_shape->NumElements() == 2 &&
x1_indices->GetTensorShape()->GetDimSize(0) == x1_values->NumElements(),
KERNEL_STATUS_PARAM_INVALID, "sparse tensor x1 dimension error.")
KERNEL_CHECK_FALSE(x2->GetTensorShape()->GetDims() == 2, KERNEL_STATUS_PARAM_INVALID, "matrix x2 dimension error.")
int64_t *x1_shape_data = (int64_t *)x1_shape->GetData();
uint64_t x1_col = x1_shape_data[!adjoint_a->GetBool()];
uint64_t x2_row = x2->GetTensorShape()->GetDimSize(adjoint_b->GetBool());
KERNEL_CHECK_FALSE(x1_col == x2_row, KERNEL_STATUS_PARAM_INVALID, "can not do matrix multiplication.")
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseTensorDenseMatMul, SparseTensorDenseMatMulCpuKernel);
} // namespace aicpu
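
For the non-adjoint case, regular_calculate above accumulates, for every stored (row, col, value) triple of the sparse operand, value * x2[col, :] into y[row, :]. A minimal single-threaded sketch of that accumulation over plain row-major buffers is shown below; the names and the [nnz, 2] index layout are illustrative assumptions, not part of the kernel.

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: y[row, :] += value * x2[col, :] for every non-zero.
void SparseDenseMatMulReference(const std::vector<int64_t> &indices,  // [nnz, 2] (row, col) pairs
                                const std::vector<float> &values,     // [nnz]
                                const std::vector<float> &x2,         // [k, n], row-major
                                std::vector<float> *y,                // [m, n], row-major, pre-sized
                                int64_t n) {
  std::fill(y->begin(), y->end(), 0.0f);
  for (size_t i = 0; i < values.size(); ++i) {
    const int64_t row = indices[2 * i];
    const int64_t col = indices[2 * i + 1];
    for (int64_t j = 0; j < n; ++j) {
      (*y)[row * n + j] += values[i] * x2[col * n + j];
    }
  }
}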

View File

@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_MAT_MUL_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_MAT_MUL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorDenseMatMulCpuKernel : public CpuKernel {
public:
SparseTensorDenseMatMulCpuKernel() = default;
~SparseTensorDenseMatMulCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <class SparseType, class IndicesType, class DenseType, class OutputType>
static uint32_t regular_calculate(CpuKernelContext &ctx);
static uint32_t SparseTensorDenseMatMulCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,200 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_tensor_to_csr_sparse_matrix.h"
#include <complex>
#include <numeric>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 5;
const char *SparseTensorToCSRSparseMatrix = "SparseTensorToCSRSparseMatrix";
const int DIM2 = 2;
const int DIM3 = 3;
} // namespace
namespace aicpu {
uint32_t SparseTensorToCSRSparseMatrixCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseTensorToCSRSparseMatrix normal check failed.");
Tensor *x_indices = ctx.Input(0);
Tensor *x_values = ctx.Input(1);
Tensor *x_dense_shape = ctx.Input(2);
const int rank = x_dense_shape->NumElements();
if (rank != DIM2 && rank != DIM3) {
KERNEL_LOG_ERROR("SparseTensor must have rank 2 or 3.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_indices_shape = x_indices->GetTensorShape();
auto x_values_shape = x_values->GetTensorShape();
if (x_indices_shape->NumElements() / rank != x_values_shape->NumElements()) {
    KERNEL_LOG_ERROR("The number of indices in x_indices does not match the number of values in x_values.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_dense_shape_data_type = x_dense_shape->GetDataType();
auto x_indices_data_type = x_indices->GetDataType();
if (x_indices_data_type != DT_INT32 && x_indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel data type [%s] not support.",
DTypeStr(x_indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (x_dense_shape_data_type != x_indices_data_type) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel data type mismatch.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_values_data_type = x_values->GetDataType();
uint32_t status;
switch (x_indices_data_type) {
case DT_INT32:
switch (x_values_data_type) {
case DT_FLOAT:
status = ComputeKernel<int32_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int32_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int32_t, std::complex<float> >(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int32_t, std::complex<double> >(ctx);
break;
default:
KERNEL_LOG_ERROR(
"SparseTensorToCSRSparseMatrix kernel data type [%s] not "
"support.",
DTypeStr(x_values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (x_values_data_type) {
case DT_FLOAT:
status = ComputeKernel<int64_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int64_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int64_t, std::complex<float> >(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int64_t, std::complex<double> >(ctx);
break;
default:
KERNEL_LOG_ERROR(
"SparseTensorToCSRSparseMatrix kernel data type [%s] not "
"support.",
DTypeStr(x_values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of indices is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel compute failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseTensorToCSRSparseMatrix, SparseTensorToCSRSparseMatrixCpuKernel);
template <typename indicesT, typename dataT>
uint32_t SparseTensorToCSRSparseMatrixCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
auto x_dense_shape = ctx.Input(2);
auto x_dense_shape_ptr = static_cast<indicesT *>(x_dense_shape->GetData());
auto y_dense_shape_ptr = static_cast<indicesT *>(ctx.Output(0)->GetData());
auto x_values_ptr = static_cast<dataT *>(ctx.Input(1)->GetData());
auto y_values_ptr = static_cast<dataT *>(ctx.Output(4)->GetData());
// Copy the CSRSparseMatrix's dense_shape and values from the SparseTensor.
for (int64_t i = 0; i < x_dense_shape->GetTensorShape()->NumElements(); i++) {
y_dense_shape_ptr[i] = x_dense_shape_ptr[i];
}
for (int64_t i = 0; i < ctx.Input(1)->GetTensorShape()->NumElements(); i++) {
y_values_ptr[i] = x_values_ptr[i];
}
auto y_batch_pointers_ptr = static_cast<indicesT *>(ctx.Output(1)->GetData());
auto y_row_pointers_ptr = static_cast<indicesT *>(ctx.Output(2)->GetData());
auto y_col_indices_ptr = static_cast<indicesT *>(ctx.Output(3)->GetData());
auto x_indices_ptr = static_cast<indicesT *>(ctx.Input(0)->GetData());
const int rank = ctx.Input(2)->NumElements();
const int64_t batch_size = (rank == DIM2) ? 1 : x_dense_shape_ptr[0];
const int64_t num_rows = x_dense_shape_ptr[(rank == DIM2) ? 0 : 1];
const int64_t total_nnz = ctx.Input(1)->NumElements();
for (int64_t i = 0; i < batch_size * (num_rows + 1); i++) {
y_row_pointers_ptr[i] = 0;
}
int64_t prev_batch = -1;
if (rank == DIM2) {
// For a single batch, the batch_ptrs are {0, total_nnz}.
y_batch_pointers_ptr[0] = 0;
++prev_batch;
for (int64_t i = 0; i < total_nnz; ++i) {
// For now, the rows pointers store the corresponding row counts.
int64_t offset = i * rank;
y_row_pointers_ptr[x_indices_ptr[offset] + 1] += 1;
y_col_indices_ptr[i] = x_indices_ptr[++offset];
}
} else { // rank == 3
for (int64_t i = 0; i < total_nnz; ++i) {
int64_t offset = i * rank;
const int cur_batch = x_indices_ptr[offset];
// For now, the rows pointers store the corresponding row counts.
y_row_pointers_ptr[cur_batch * (num_rows + 1) + x_indices_ptr[++offset] + 1] += 1;
y_col_indices_ptr[i] = x_indices_ptr[++offset];
// We're at a new batch and might have skipped over empty batches.
while (prev_batch < cur_batch) {
// The previous batch ends at position i.
y_batch_pointers_ptr[prev_batch + 1] = i;
++prev_batch;
}
}
}
// Set the last element of batch_ptr and account for trailing empty batches.
while (prev_batch < batch_size) {
y_batch_pointers_ptr[prev_batch + 1] = total_nnz;
++prev_batch;
}
// Compute the cumulative row counts for each batch.
for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
auto *row_ptr_batch = y_row_pointers_ptr + batch_idx * (num_rows + 1);
std::partial_sum(row_ptr_batch, row_ptr_batch + num_rows + 1, row_ptr_batch);
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu
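
ComputeKernel above builds the CSR row pointers in two passes: it first counts the non-zeros of each row into the slot one past the row index, then turns the counts into offsets with std::partial_sum. A standalone sketch of that idea for a single batch, with a worked example, is given below; the helper name is hypothetical.

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical helper, not the kernel: CSR row pointers from the COO row indices of one batch.
std::vector<int64_t> CooRowsToCsrRowPointers(const std::vector<int64_t> &coo_rows, int64_t num_rows) {
  std::vector<int64_t> row_ptr(num_rows + 1, 0);
  for (int64_t r : coo_rows) {
    row_ptr[r + 1] += 1;  // row_ptr temporarily holds per-row non-zero counts
  }
  std::partial_sum(row_ptr.begin(), row_ptr.end(), row_ptr.begin());  // counts -> offsets
  return row_ptr;
}
// e.g. coo_rows = {0, 0, 2} with num_rows = 3 yields row_ptr = {0, 2, 2, 3}.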

View File

@ -0,0 +1,34 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_TO_CSR_SPARSE_MATRIX_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_TO_CSR_SPARSE_MATRIX_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorToCSRSparseMatrixCpuKernel : public CpuKernel {
public:
~SparseTensorToCSRSparseMatrixCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename indicesT, typename dataT>
uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,605 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sspaddmm.h"
#include <complex>
#include <iostream>
#include "utils/eigen_tensor.h"
namespace aicpu {
const char *SSPADDMM = "Sspaddmm";
#define SPADDMM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SspaddmmCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Sspaddmm kernel compute failed."); \
return result; \
} \
break; \
}
// scalar * sparse values, used for beta * input and alpha * mat1
template <typename T>
T *SspaddmmCpuKernel::ScalarSparseMul(CpuKernelContext &ctx, Tensor *vals, Tensor *scalar) {
T scalar_val;
auto scalar_val_addr = scalar->GetData();
switch (scalar->GetDataType()) {
case DT_UINT8:
scalar_val = static_cast<T>(reinterpret_cast<uint8_t *>(scalar_val_addr)[0]);
break;
case DT_UINT16:
scalar_val = static_cast<T>(reinterpret_cast<uint16_t *>(scalar_val_addr)[0]);
break;
case DT_UINT32:
scalar_val = static_cast<T>(reinterpret_cast<uint32_t *>(scalar_val_addr)[0]);
break;
case DT_UINT64:
scalar_val = static_cast<T>(reinterpret_cast<uint64_t *>(scalar_val_addr)[0]);
break;
case DT_INT8:
scalar_val = static_cast<T>(reinterpret_cast<int8_t *>(scalar_val_addr)[0]);
break;
case DT_INT16:
scalar_val = static_cast<T>(reinterpret_cast<int16_t *>(scalar_val_addr)[0]);
break;
case DT_INT32:
scalar_val = static_cast<T>(reinterpret_cast<int32_t *>(scalar_val_addr)[0]);
break;
case DT_INT64:
scalar_val = static_cast<T>(reinterpret_cast<int64_t *>(scalar_val_addr)[0]);
break;
case DT_FLOAT16:
scalar_val = static_cast<T>(reinterpret_cast<Eigen::half *>(scalar_val_addr)[0]);
break;
case DT_FLOAT:
scalar_val = static_cast<T>(reinterpret_cast<float *>(scalar_val_addr)[0]);
break;
case DT_DOUBLE:
scalar_val = static_cast<T>(reinterpret_cast<double *>(scalar_val_addr)[0]);
break;
case DT_BOOL:
scalar_val = static_cast<T>(reinterpret_cast<bool *>(scalar_val_addr)[0]);
break;
case DT_COMPLEX64:
scalar_val = static_cast<T>(reinterpret_cast<std::complex<float> *>(scalar_val_addr)[0].real());
break;
case DT_COMPLEX128:
scalar_val = static_cast<T>(reinterpret_cast<std::complex<double> *>(scalar_val_addr)[0].real());
break;
default:
      KERNEL_LOG_ERROR("For Sspaddmm, scalar dtype [%s] is not supported", DTypeStr(scalar->GetDataType()).c_str());
return nullptr;
}
T *val_addr = reinterpret_cast<T *>(vals->GetData());
uint32_t data_num = vals->GetTensorShape()->GetDimSize(0);
T *val_addr_bak = new T[data_num];
if (data_num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto multi = [&val_addr, &val_addr_bak, scalar_val](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
val_addr_bak[idx] = val_addr[idx] * scalar_val;
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, multi);
} else {
    // multiply into a separate buffer; in-place *= may not be defined for mixed types (e.g. Eigen::half and float)
for (uint32_t idx = 0; idx < data_num; idx++) {
val_addr_bak[idx] = val_addr[idx] * scalar_val;
}
}
return val_addr_bak;
}
template <typename T>
void SspaddmmCpuKernel::Clear(Tensor *tensor, CpuKernelContext &ctx) {
T *addr = reinterpret_cast<T *>(tensor->GetData());
uint32_t num = tensor->GetTensorShape()->GetDimSize(0);
if (num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > num) {
max_core_num = num;
}
auto multi = [&addr](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
addr[idx] = static_cast<T>(0);
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, num, num / max_core_num, multi);
} else {
    // serial path: zero the value buffer
for (uint32_t idx = 0; idx < num; idx++) {
addr[idx] = static_cast<T>(0);
}
}
}
template <typename T>
void SspaddmmCpuKernel::ClearIndices(Tensor *tensor, CpuKernelContext &ctx) {
T *addr = reinterpret_cast<T *>(tensor->GetData());
uint32_t num = 2 * tensor->GetTensorShape()->GetDimSize(1);
if (num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > num) {
max_core_num = num;
}
auto multi = [&addr](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
addr[idx] = static_cast<T>(0);
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, num, num / max_core_num, multi);
} else {
    // serial path: zero the indices buffer
for (uint32_t idx = 0; idx < num; idx++) {
addr[idx] = static_cast<T>(0);
}
}
}
template <typename T1>
uint32_t SspaddmmCpuKernel::BoundaryCheck(Tensor *tensor, Tensor *shape_tensor, int64_t nums, CpuKernelContext &ctx) {
int64_t row;
int64_t col;
if (shape_tensor->GetDataType() == DT_INT32) {
int32_t *in_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
row = static_cast<int64_t>(in_dim[0]);
col = static_cast<int64_t>(in_dim[1]);
} else {
int64_t *in_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
row = in_dim[0];
col = in_dim[1];
}
if (row <= 0 || col <= 0) {
    KERNEL_LOG_ERROR("For sspaddmm, sparse tensor shape should be positive, but got [%lld, %lld]", row, col);
return KERNEL_STATUS_PARAM_INVALID;
}
T1 *addr = reinterpret_cast<T1 *>(tensor->GetData());
uint32_t data_num = static_cast<uint32_t>(nums);
if (data_num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto multi = [&](uint32_t start, uint32_t end) {
      for (uint32_t i = start; i < end; i++) {
        int64_t row_tmp = static_cast<int64_t>(addr[i]);
        int64_t col_tmp = static_cast<int64_t>(addr[i + data_num]);
        if (row_tmp >= row || row_tmp < 0) {
          KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices row index [%lld] out of range [0, %lld)", row_tmp, row);
          return;
        }
        if (col_tmp >= col || col_tmp < 0) {
          KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices col index [%lld] out of range [0, %lld)", col_tmp, col);
          return;
        }
      }
    };
    max_core_num = max_core_num == 0 ? 1 : max_core_num;
    // Note: ParallelFor does not propagate shard status, so out-of-range indices are only logged in this path.
    CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, multi);
return KERNEL_STATUS_OK;
} else {
    for (uint32_t i = 0; i < data_num; i++) {
      int64_t row_tmp = static_cast<int64_t>(addr[i]);
      int64_t col_tmp = static_cast<int64_t>(addr[i + data_num]);
      if (row_tmp >= row || row_tmp < 0) {
        KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices row index [%lld] out of range [0, %lld)", row_tmp, row);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      if (col_tmp >= col || col_tmp < 0) {
        KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices col index [%lld] out of range [0, %lld)", col_tmp, col);
        return KERNEL_STATUS_PARAM_INVALID;
      }
    }
return KERNEL_STATUS_OK;
}
}
// sparse matrix multiply dense matrix
template <typename T_idx, typename T>
uint32_t SspaddmmCpuKernel::SparseMulDense(CpuKernelContext &ctx, Tensor *mat1_indices_tensor, T *mat1_val_addr,
Tensor *mat2_values_tensor, Tensor *output_indices_tensor,
Tensor *output_values_tensor, const int64_t row, const int64_t mat2_col) {
const int mat1_vals_num = mat1_indices_tensor->GetTensorShape()->GetDimSize(1);
// the result of mat1 @ mat2 will write to output directly
T_idx *mat1_idx_addr = reinterpret_cast<T_idx *>(mat1_indices_tensor->GetData());
T *mat2_val_addr = reinterpret_cast<T *>(mat2_values_tensor->GetData());
int64_t *out_idx_addr = reinterpret_cast<int64_t *>(output_indices_tensor->GetData());
T *out_val_addr = reinterpret_cast<T *>(output_values_tensor->GetData());
int out_num = output_indices_tensor->GetTensorShape()->GetDimSize(1);
std::unordered_map<T_idx, std::unordered_map<int64_t, uint32_t>> idx_map_cnt;
std::unordered_map<T_idx, std::vector<T_idx>> unrepeated;
std::unordered_map<T_idx, std::unordered_map<T_idx, std::vector<T>>> co_map_idx;
// unrepeated : [1 -> [0], 2 -> [1, 2]]
// co_map_idx : [1][0] -> 0.3
for (int64_t i = 0; i < mat1_vals_num; i++) {
T_idx _row = mat1_idx_addr[i];
T_idx _col = mat1_idx_addr[i + mat1_vals_num];
unrepeated[_row].push_back(_col);
co_map_idx[_row][_col].push_back(mat1_val_addr[i]);
for (uint32_t j = 0; j < mat2_col; j++) {
if (idx_map_cnt[_row][j] == 0) {
idx_map_cnt[_row][j] = this->cnt_;
this->cnt_++;
}
}
}
std::vector<T_idx> res;
for (auto it = unrepeated.begin(); it != unrepeated.end(); it++) {
res.push_back(it->first);
}
uint32_t n_unreapeat = unrepeated.size();
if (n_unreapeat * mat2_col > kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (n_unreapeat <= kParallelDataNumSameShape_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > n_unreapeat) {
max_core_num = n_unreapeat;
}
auto multi = [&](uint32_t start, uint32_t end) {
for (uint32_t i = start; i < end; i++) {
// get val
auto row_mat1 = res[i];
for (auto row_mat2 : unrepeated[row_mat1]) {
T val = co_map_idx[row_mat1][row_mat2].back();
co_map_idx[row_mat1][row_mat2].pop_back();
for (int64_t j = 0; j < mat2_col; j++) {
// get val
T_idx idx = idx_map_cnt[row_mat1][j];
*(out_val_addr + idx) += val * mat2_val_addr[row_mat2 * mat2_col + j];
out_idx_addr[idx] = static_cast<int64_t>(row_mat1);
out_idx_addr[idx + out_num] = j;
}
}
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, n_unreapeat, n_unreapeat / max_core_num, multi);
} else {
for (uint32_t i = 0; i < n_unreapeat; i++) {
// get val
auto row_mat1 = res[i];
for (auto row_mat2 : unrepeated[row_mat1]) {
T val = co_map_idx[row_mat1][row_mat2].back();
co_map_idx[row_mat1][row_mat2].pop_back();
for (int64_t j = 0; j < mat2_col; j++) {
// get val
T_idx idx = idx_map_cnt[row_mat1][j];
*(out_val_addr + idx) += val * mat2_val_addr[row_mat2 * mat2_col + j];
out_idx_addr[idx] = static_cast<int64_t>(row_mat1);
out_idx_addr[idx + out_num] = j;
}
}
}
}
return KERNEL_STATUS_OK;
}
// sparse matrix add sparse matrix
// input + mat1 @ mat2
template <typename T_idx, typename T>
uint32_t SspaddmmCpuKernel::SparseAddSparse(CpuKernelContext &ctx, Tensor *input_indices_tensor, T *in_val_addr,
Tensor *output_indices_tensor, Tensor *output_values_tensor) {
// to implement m1[row][col] = vals
uint32_t input_nums = input_indices_tensor->GetTensorShape()->GetDimSize(1);
this->cnt_ = input_nums;
// get output vals and index addr
T *out_val_addr = reinterpret_cast<T *>(output_values_tensor->GetData());
int64_t *out_idx_addr = reinterpret_cast<int64_t *>(output_indices_tensor->GetData());
int out_num = output_indices_tensor->GetTensorShape()->GetDimSize(1);
// if input idx not in output, will append at the end of output
T_idx *input_addr = reinterpret_cast<T_idx *>(input_indices_tensor->GetData());
if (input_nums >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (input_nums <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > input_nums) {
max_core_num = input_nums;
}
auto multi = [&](uint32_t start, uint32_t end) {
for (uint32_t i = start; i < end; i++) {
auto row = input_addr[i];
auto col = input_addr[i + input_nums];
// else append it at the end
out_val_addr[i] = in_val_addr[i];
// copy indices[0]
out_idx_addr[i] = row;
// copy indices[1]
out_idx_addr[i + out_num] = col;
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, input_nums, input_nums / max_core_num, multi);
} else {
for (uint32_t i = 0; i < input_nums; i++) {
auto row = input_addr[i];
auto col = input_addr[i + input_nums];
// else append it at the end
out_val_addr[i] = in_val_addr[i];
// copy indices[0]
out_idx_addr[i] = row;
// copy indices[1]
out_idx_addr[i + out_num] = col;
}
}
return KERNEL_STATUS_OK;
}
int64_t SspaddmmCpuKernel::GetIndicesNum(Tensor *tensor) {
if (tensor->GetDataType() == DT_INT32) {
int32_t *a = reinterpret_cast<int32_t *>(tensor->GetData());
return static_cast<int64_t>(a[1]);
}
int64_t *a = reinterpret_cast<int64_t *>(tensor->GetData());
return a[1];
}
template <typename T>
uint32_t SspaddmmCpuKernel::SspaddmmCompute(CpuKernelContext &ctx) {
Tensor *input_indices_tensor = ctx.Input(0);
Tensor *input_values_tensor = ctx.Input(1);
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *mat1_indices_tensor = ctx.Input(3);
Tensor *mat1_values_tensor = ctx.Input(4);
Tensor *mat1_shapes_tensor = ctx.Input(5);
Tensor *mat2_values_tensor = ctx.Input(6);
Tensor *alpha_tensor = ctx.Input(7);
Tensor *beta_tensor = ctx.Input(8);
Tensor *output_indices_tensor = ctx.Output(0);
Tensor *output_values_tensor = ctx.Output(1);
Clear<T>(output_values_tensor, ctx);
ClearIndices<int64_t>(output_indices_tensor, ctx);
// scalar * sparse inplace
  T *input_values_addr_bak = ScalarSparseMul<T>(ctx, input_values_tensor, beta_tensor);
  T *mat1_values_addr_bak = ScalarSparseMul<T>(ctx, mat1_values_tensor, alpha_tensor);
  KERNEL_CHECK_NULLPTR(input_values_addr_bak, KERNEL_STATUS_PARAM_INVALID, "Scale input values by beta failed.")
  KERNEL_CHECK_NULLPTR(mat1_values_addr_bak, KERNEL_STATUS_PARAM_INVALID, "Scale mat1 values by alpha failed.")
// sparse * mat write to output directly
auto row = GetIndicesNum(mat1_shapes_tensor);
auto col = GetIndicesNum(input_shapes_tensor);
// sparse + sparse
if (input_indices_tensor->GetDataType() == DT_INT32) {
SparseAddSparse<int32_t, T>(ctx, input_indices_tensor, input_values_addr_bak, output_indices_tensor,
output_values_tensor);
} else {
SparseAddSparse<int64_t, T>(ctx, input_indices_tensor, input_values_addr_bak, output_indices_tensor,
output_values_tensor);
}
if (mat1_indices_tensor->GetDataType() == DT_INT32) {
SparseMulDense<int32_t, T>(ctx, mat1_indices_tensor, mat1_values_addr_bak, mat2_values_tensor,
output_indices_tensor, output_values_tensor, row, col);
} else {
SparseMulDense<int64_t, T>(ctx, mat1_indices_tensor, mat1_values_addr_bak, mat2_values_tensor,
output_indices_tensor, output_values_tensor, row, col);
}
  delete[] input_values_addr_bak;
  delete[] mat1_values_addr_bak;
  return KERNEL_STATUS_OK;
}
uint32_t SspaddmmCpuKernel::ValidParam(CpuKernelContext &ctx) {
// valid input and output nullptr
Tensor *input_indices_tensor = ctx.Input(0);
Tensor *input_values_tensor = ctx.Input(1);
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *mat1_indices_tensor = ctx.Input(3);
Tensor *mat1_values_tensor = ctx.Input(4);
Tensor *mat1_shapes_tensor = ctx.Input(5);
Tensor *mat2_tensor = ctx.Input(6);
Tensor *alpha_tensor = ctx.Input(7);
Tensor *beta_tensor = ctx.Input(8);
Tensor *output_indices_tensor = ctx.Output(0);
Tensor *output_values_tensor = ctx.Output(1);
Tensor *output_shapes_tensor = ctx.Output(2);
// valid shape nullptr
auto mat1_values_shape = mat1_values_tensor->GetTensorShape();
auto mat1_shapes_shape = mat1_shapes_tensor->GetTensorShape();
auto mat1_indices_shape = mat1_indices_tensor->GetTensorShape();
auto mat2_shapes_shape = mat2_tensor->GetTensorShape();
auto input_values_shape = input_values_tensor->GetTensorShape();
auto input_shapes_shape = input_shapes_tensor->GetTensorShape();
auto input_indices_shape = input_indices_tensor->GetTensorShape();
auto output_values_shape = output_values_tensor->GetTensorShape();
auto output_shapes_shape = output_shapes_tensor->GetTensorShape();
auto output_indices_shape = output_indices_tensor->GetTensorShape();
auto alpha_shape = alpha_tensor->GetTensorShape();
auto beta_shape = beta_tensor->GetTensorShape();
// sparse_indices
// GetDims() will return dims number, uint32_t
if (mat1_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Mat1 sparse_indices should be 2D, got dim "
"size [%d].",
mat1_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Input sparse_indices should be 2D, got dim "
"size [%d].",
input_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (output_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Output sparse_indices should be 2D, got dim "
"size [%d].",
      output_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
int32_t mat1_IndiceType = mat1_indices_tensor->GetDataType();
int32_t input_IndiceType = input_indices_tensor->GetDataType();
int32_t output_IndiceType = output_indices_tensor->GetDataType();
int32_t mat1_ShapeType = mat1_shapes_tensor->GetDataType();
int32_t input_ShapeType = input_shapes_tensor->GetDataType();
int32_t output_ShapeType = output_shapes_tensor->GetDataType();
bool validIndiceType = ((mat1_IndiceType == DT_INT32) || (mat1_IndiceType == DT_INT64)) &&
((output_IndiceType == DT_INT32) || (output_IndiceType == DT_INT64)) &&
((input_IndiceType == DT_INT32) || (input_IndiceType == DT_INT64));
bool validShapeType = ((mat1_ShapeType == DT_INT32) || (mat1_ShapeType == DT_INT64)) &&
((output_ShapeType == DT_INT32) || (output_ShapeType == DT_INT64)) &&
((input_ShapeType == DT_INT32) || (input_ShapeType == DT_INT64));
if (!validShapeType || !validIndiceType) {
KERNEL_LOG_ERROR(
"Valid indice and shape data type failed, "
"indiceType and shapeType should be INT32 or INT64");
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_values' number check
int32_t mat1_values_dims_size = mat1_values_shape->GetDims();
int32_t input_values_dims_size = input_values_shape->GetDims();
if ((mat1_values_dims_size != 0) && (mat1_values_dims_size != 1)) {
KERNEL_LOG_ERROR(
"mat1 values_shape should be a scalar or a vector, "
"got dim size [%d].",
mat1_values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input_values_dims_size != 0) && (input_values_dims_size != 1)) {
KERNEL_LOG_ERROR(
"input values_shape should be a scalar or a vector, "
"got dim size [%d].",
input_values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t mat1_elems_num = mat1_indices_shape->GetDims() > 0 ? mat1_indices_shape->GetDimSize(1) : 1;
int64_t input_elems_num = input_indices_shape->GetDims() > 0 ? input_indices_shape->GetDimSize(1) : 1;
if ((mat1_values_dims_size == 1) && (mat1_values_tensor->NumElements() != mat1_elems_num)) {
KERNEL_LOG_ERROR(
"mat1 values_shape has incorrect number of elements [%lld], "
"should be [%lld]",
mat1_values_tensor->NumElements(), mat1_elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input_values_dims_size == 1) && (input_values_tensor->NumElements() != input_elems_num)) {
KERNEL_LOG_ERROR(
"input values_shape has incorrect number of elements [%lld], "
"should be [%lld]",
input_values_tensor->NumElements(), input_elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
if (alpha_shape->GetDims() > 1) {
    KERNEL_LOG_ERROR("alpha should be a scalar or a vector, but got dim num [%d]", alpha_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (beta_shape->GetDims() > 1) {
    KERNEL_LOG_ERROR("beta should be a scalar or a vector, but got dim num [%d]", beta_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t status = KERNEL_STATUS_OK;
if (input_indices_tensor->GetDataType() == DT_INT32) {
status = BoundaryCheck<int32_t>(input_indices_tensor, input_shapes_tensor, input_values_tensor->NumElements(), ctx);
} else {
status = BoundaryCheck<int64_t>(input_indices_tensor, input_shapes_tensor, input_values_tensor->NumElements(), ctx);
}
if (status != KERNEL_STATUS_OK) {
return status;
}
if (mat1_indices_tensor->GetDataType() == DT_INT32) {
status = BoundaryCheck<int32_t>(mat1_indices_tensor, mat1_shapes_tensor, mat1_values_tensor->NumElements(), ctx);
} else {
status = BoundaryCheck<int64_t>(mat1_indices_tensor, mat1_shapes_tensor, mat1_values_tensor->NumElements(), ctx);
}
return status;
}
uint32_t SspaddmmCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, this->kInputNum, this->kOutputNum),
"Sspaddmm check input and output number failed.");
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("Sspaddmm check params failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *output_values_tensor = ctx.Output(1);
Tensor *output_shapes_tensor = ctx.Output(2);
int64_t *ou_dim = reinterpret_cast<int64_t *>(output_shapes_tensor->GetData());
if (input_shapes_tensor->GetDataType() == DT_INT32) {
int32_t *in_dim = reinterpret_cast<int32_t *>(input_shapes_tensor->GetData());
for (int32_t index = 0; index < 2; ++index) {
ou_dim[index] = in_dim[index];
}
} else {
int64_t *in_dim = reinterpret_cast<int64_t *>(input_shapes_tensor->GetData());
for (int32_t index = 0; index < 2; ++index) {
ou_dim[index] = in_dim[index];
}
}
auto output_dtype = output_values_tensor->GetDataType();
switch (output_dtype) {
SPADDMM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    SPADDMM_COMPUTE_CASE(DT_FLOAT, float, ctx)
    SPADDMM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Sspaddmm kernel data type [%s] not support.", DTypeStr(output_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SSPADDMM, SspaddmmCpuKernel);
} // namespace aicpu
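
Sspaddmm computes beta * input + alpha * (mat1 @ mat2), where input and mat1 are COO sparse tensors and mat2 is dense, and emits the result in sparse form. As a way to sanity-check the arithmetic, a fully dense reference sketch is shown below; it is an illustration under assumed [nnz, 2] index pairs and row-major layout, not the kernel's algorithm.

#include <cstdint>
#include <vector>

// Hypothetical dense reference, not the kernel: out = beta * input + alpha * (mat1 @ mat2).
std::vector<float> SspaddmmDenseReference(const std::vector<int64_t> &in_idx, const std::vector<float> &in_val,
                                          const std::vector<int64_t> &m1_idx, const std::vector<float> &m1_val,
                                          const std::vector<float> &mat2,  // [k, n], row-major
                                          int64_t m, int64_t n, float alpha, float beta) {
  std::vector<float> out(m * n, 0.0f);
  for (size_t i = 0; i < in_val.size(); ++i) {  // beta * input, scattered from (row, col) pairs
    out[in_idx[2 * i] * n + in_idx[2 * i + 1]] += beta * in_val[i];
  }
  for (size_t i = 0; i < m1_val.size(); ++i) {  // alpha * (mat1 @ mat2), one scaled row of mat2 per non-zero
    const int64_t row = m1_idx[2 * i];
    const int64_t col = m1_idx[2 * i + 1];
    for (int64_t j = 0; j < n; ++j) {
      out[row * n + j] += alpha * m1_val[i] * mat2[col * n + j];
    }
  }
  return out;
}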

View File

@ -0,0 +1,66 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SSPADDMM_H_
#define AICPU_KERNELS_NORMALIZED_SSPADDMM_H_
#include <deque>
#include <set>
#include <unordered_map>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
namespace aicpu {
class SspaddmmCpuKernel : public CpuKernel {
public:
SspaddmmCpuKernel() = default;
uint32_t ValidParam(CpuKernelContext &ctx);
virtual ~SspaddmmCpuKernel() = default;
uint32_t cnt_ = 0;
const uint32_t kInputNum = 9;
const uint32_t kOutputNum = 3;
const int64_t kParallelDataNumSameShape_ = 14 * 1024;
const int64_t kParallelDataNumSameShapeMid_ = 7 * 1024;
template <typename T>
void Clear(Tensor *tensor, CpuKernelContext &ctx);
template <typename T>
void ClearIndices(Tensor *tensor, CpuKernelContext &ctx);
template <typename T1>
uint32_t BoundaryCheck(Tensor *, Tensor *, int64_t, CpuKernelContext &);
template <typename T>
uint32_t SspaddmmCompute(CpuKernelContext &ctx);
template <typename T_idx, typename T>
uint32_t SparseAddSparse(CpuKernelContext &ctx, Tensor *input_indices_tensor, T *in_val_addr,
Tensor *output_indices_tensor, Tensor *output_values_tensor);
template <typename T_idx, typename T>
uint32_t SparseMulDense(CpuKernelContext &ctx, Tensor *mat1_indices_tensor, T *mat1_val_addr,
Tensor *mat2_values_tensor, Tensor *output_indices_tensor, Tensor *output_values_tensor,
const int64_t row, const int64_t col);
template <typename T>
T *ScalarSparseMul(CpuKernelContext &ctx, Tensor *vals, Tensor *scalar);
int64_t GetIndicesNum(Tensor *tensor);
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif

View File

@ -0,0 +1,98 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "trace.h"
#include "cpu_kernel_utils.h"
#include <cstring>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t InputShapeDim = 2;
const uint32_t OutputShapeDim = 1;
const uint64_t OutputShapeDimSize = 1;
const char *kTrace = "Trace";
#define TRACE_COMPUTE_CASE(DTYPE, INPUT, OUTPUT, CTX, TYPE) \
case (DTYPE): { \
uint32_t result = TraceCompute<TYPE>(INPUT, OUTPUT, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Trace kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TraceCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Trace check input and output number failed.");
Tensor *input_tensor = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Trace get input data failed.")
KERNEL_CHECK_NULLPTR(input_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Trace get input shape failed")
if (input_tensor->GetTensorShape()->GetDims() != InputShapeDim) {
KERNEL_LOG_ERROR("Trace input dim must be 2!");
return KERNEL_STATUS_PARAM_INVALID;
}
// check output tensor
Tensor *output_tensor = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_tensor, KERNEL_STATUS_PARAM_INVALID, "Trace get output failed.")
KERNEL_CHECK_NULLPTR(output_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Trace get output data failed.")
KERNEL_CHECK_NULLPTR(output_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Trace get output shape failed")
auto input_dtype = input_tensor->GetDataType();
switch (input_dtype) {
TRACE_COMPUTE_CASE(DT_INT8, input_tensor, output_tensor, ctx, int8_t)
TRACE_COMPUTE_CASE(DT_UINT8, input_tensor, output_tensor, ctx, uint8_t)
TRACE_COMPUTE_CASE(DT_INT16, input_tensor, output_tensor, ctx, int16_t)
TRACE_COMPUTE_CASE(DT_UINT16, input_tensor, output_tensor, ctx, uint16_t)
TRACE_COMPUTE_CASE(DT_INT32, input_tensor, output_tensor, ctx, int32_t)
TRACE_COMPUTE_CASE(DT_UINT32, input_tensor, output_tensor, ctx, uint32_t)
TRACE_COMPUTE_CASE(DT_INT64, input_tensor, output_tensor, ctx, int64_t)
TRACE_COMPUTE_CASE(DT_UINT64, input_tensor, output_tensor, ctx, uint64_t)
TRACE_COMPUTE_CASE(DT_FLOAT16, input_tensor, output_tensor, ctx, Eigen::half)
TRACE_COMPUTE_CASE(DT_FLOAT, input_tensor, output_tensor, ctx, float)
TRACE_COMPUTE_CASE(DT_DOUBLE, input_tensor, output_tensor, ctx, double)
default:
      KERNEL_LOG_ERROR("Trace kernel data type [%s] not support.", DTypeStr(input_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TraceCpuKernel::TraceCompute(Tensor *input, Tensor *output, CpuKernelContext &ctx) {
auto inputDataAddr = reinterpret_cast<T *>(input->GetData());
auto outputDataAddr = reinterpret_cast<T *>(output->GetData());
auto input_shape = ctx.Input(0)->GetTensorShape();
int64_t inputLine = input_shape->GetDimSize(0), inputCol = input_shape->GetDimSize(1);
auto min_shape = std::min(inputLine, inputCol);
memset(outputDataAddr, 0, sizeof(T));
for (int64_t i = 0; i < min_shape; i++) {
*(outputDataAddr) += *(inputDataAddr + i * inputCol + i);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTrace, TraceCpuKernel);
} // namespace aicpu
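
TraceCompute above sums the main-diagonal elements of the rows x cols input, visiting min(rows, cols) entries of the flat row-major buffer. A tiny standalone reference with a worked example is sketched below; the function name is illustrative only.

#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: sum of the main diagonal of a rows x cols matrix.
double TraceReference(const std::vector<double> &mat, int64_t rows, int64_t cols) {
  double acc = 0.0;
  for (int64_t i = 0; i < rows && i < cols; ++i) {
    acc += mat[i * cols + i];
  }
  return acc;
}
// e.g. mat = {1, 2, 3, 4, 5, 6} with rows = 2, cols = 3 gives 1 + 5 = 6.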

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRACE_H_
#define AICPU_KERNELS_NORMALIZED_TRACE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TraceCpuKernel : public CpuKernel {
public:
TraceCpuKernel() = default;
~TraceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t TraceCompute(Tensor *input, Tensor *output, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,177 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "trace_grad.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include "Eigen/Core"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTraceGrad = "TraceGrad";
} // namespace
// Define the aicpu namespace.
namespace aicpu {
// Implement the Compute function of the custom operator class.
uint32_t TraceGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Tracegrad check input and output number failed.");
Tensor *y_grad = ctx.Input(0);
Tensor *x_shape = ctx.Input(1);
Tensor *x_grad = ctx.Output(0);
KERNEL_CHECK_NULLPTR(y_grad->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.");
KERNEL_CHECK_NULLPTR(x_shape->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.");
KERNEL_CHECK_NULLPTR(x_grad->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed");
  KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 2, KERNEL_STATUS_PARAM_INVALID,
                     "Expected x_shape to have 2 elements, got [%lld].", ctx.Input(1)->NumElements());
KERNEL_LOG_DEBUG(
"TraceGradCpuKernel[%s], y_grad: size[%llu];"
"x_shape: size[%llu], x_grad: size[%llu].",
ctx.GetOpType().c_str(), y_grad->GetDataSize(), x_shape->GetDataSize(), x_grad->GetDataSize());
DataType data_type = ctx.Input(0)->GetDataType();
DataType shape_type = ctx.Input(1)->GetDataType();
switch (data_type) {
case DT_INT8:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int8_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int8_t, int64_t>(ctx);
default:
break;
}
case DT_INT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int16_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int16_t, int64_t>(ctx);
default:
break;
}
case DT_INT32:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int32_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int32_t, int64_t>(ctx);
default:
break;
}
case DT_INT64:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int64_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int64_t, int64_t>(ctx);
default:
break;
}
case DT_UINT8:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint8_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint8_t, int64_t>(ctx);
default:
break;
}
case DT_UINT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint16_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint16_t, int64_t>(ctx);
default:
break;
}
case DT_UINT32:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint32_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint32_t, int64_t>(ctx);
default:
break;
}
case DT_UINT64:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint64_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint64_t, int64_t>(ctx);
default:
break;
}
case DT_FLOAT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<Eigen::half, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<Eigen::half, int64_t>(ctx);
default:
break;
}
case DT_FLOAT:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<float, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<float, int64_t>(ctx);
default:
break;
}
case DT_DOUBLE:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<double, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<double, int64_t>(ctx);
default:
break;
}
    default:
      // Unsupported shape_type values also fall through the inner switches to this default.
      KERNEL_LOG_ERROR("TraceGrad kernel data type [%s] or shape type [%s] not support.", DTypeStr(data_type).c_str(),
                       DTypeStr(shape_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t TraceGradCpuKernel::TraceGradCompute(CpuKernelContext &ctx) {
auto input_x1 = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto input_x2 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto output_x = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
T2 m = *(input_x2);
T2 n = *(input_x2 + 1);
  // Write directly to the output: zero everything, then place y_grad on the main diagonal.
  for (T2 i = 0; i < m * n; i++) *(output_x + i) = static_cast<T1>(0);
  for (T2 i = 0; i < m && i < n; i++) *(output_x + i * n + i) = *(input_x1);
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTraceGrad, TraceGradCpuKernel);
} // namespace aicpu
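
Because d(trace(X))/dX is the identity pattern, TraceGradCompute above only has to broadcast the scalar y_grad onto the main diagonal of an m x n zero matrix. A minimal reference sketch with an example follows; the name and float element type are assumptions for illustration.

#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: scatter the scalar y_grad onto the main diagonal.
std::vector<float> TraceGradReference(float y_grad, int64_t m, int64_t n) {
  std::vector<float> x_grad(m * n, 0.0f);
  for (int64_t i = 0; i < m && i < n; ++i) {
    x_grad[i * n + i] = y_grad;  // gradient of trace flows only to diagonal entries
  }
  return x_grad;
}
// e.g. TraceGradReference(2.5f, 2, 3) returns {2.5, 0, 0, 0, 2.5, 0}.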

View File

@ -0,0 +1,21 @@
#ifndef AICPU_KERNELS_NORMALIZED_TRACEGRAD_H_
#define AICPU_KERNELS_NORMALIZED_TRACEGRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TraceGradCpuKernel : public CpuKernel {
public:
TraceGradCpuKernel() = default;
~TraceGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t TraceGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,414 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tridiagonal_solve.h"
#include <iostream>
#include "Eigen/Core"
#include "Eigen/LU"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
using namespace Eigen;
using namespace std;
namespace {
const char *TRIDIAGONALSOLVE = "TridiagonalSolve";
// Threshold that decides whether multi-threading is enabled
const int64_t kParallelDataNumSameShape = 8 * 1024;
} // namespace
// Define the aicpu namespace.
namespace aicpu {
// Read the inputs and outputs and validate the parameters.
uint32_t TridiagonalSolveCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
  // get input and output tensor pointers
diags_tensor_ = ctx.Input(0);
rhs_tensor_ = ctx.Input(1);
output_tensor_ = ctx.Output(0);
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, 2, 1), "TridiagonalSolve check input and output number failed.");
  // get shape pointers
std::shared_ptr<TensorShape> diags_shape = diags_tensor_->GetTensorShape();
KERNEL_CHECK_NULLPTR(diags_shape, KERNEL_STATUS_PARAM_INVALID, "Get shape of input[0], diags failed");
std::shared_ptr<TensorShape> rhs_shape = rhs_tensor_->GetTensorShape();
KERNEL_CHECK_NULLPTR(rhs_shape, KERNEL_STATUS_PARAM_INVALID, "Get shape of input[1], rhs failed");
  // get input ranks
int32_t diags_rank = diags_shape->GetDims();
int32_t rhs_rank = rhs_shape->GetDims();
  // get the sizes of the diags and rhs matrices
rhs_size =
rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1) * rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 2);
diags_size = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1) *
diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 2);
  // get the dim-size vectors of the shapes
std::vector<int64_t> diags_dimsize = diags_shape->GetDimSizes();
std::vector<int64_t> rhs_dimsize = rhs_shape->GetDimSizes();
// get partial_pivoting
partial_pivoting = ctx.GetAttr("partial_pivoting");
  // get diags_type_ and rhs_type_
diags_type_ = static_cast<DataType>(diags_tensor_->GetDataType());
rhs_type_ = static_cast<DataType>(rhs_tensor_->GetDataType());
// get data_type_
data_type_ = rhs_type_;
  // parameter validation
  // 1) rank of diags is less than 2
if (diags_rank < 2) {
KERNEL_LOG_ERROR("Expected diags to have rank at least 2, got[%d]", diags_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 2) ranks of diags and rhs mismatch
if (rhs_rank != diags_rank) {
KERNEL_LOG_ERROR("Expected the rank of rhs to be [%d] or [%d], got [%d]", diags_rank - 1, diags_rank, rhs_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 3) diags does not have 3 rows
DimSize0 = diags_shape->GetDimSize(diags_rank - 2);
if (DimSize0 != 3) {
KERNEL_LOG_ERROR("Expected 3 diagonals got [%d]", DimSize0);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 4) batch shapes mismatch
for (int i = 0; i < diags_rank - 2; i++) {
if (diags_dimsize[i] != rhs_dimsize[i]) {
KERNEL_LOG_ERROR("Batch shapes of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
}
  // 5) data types of diags and rhs mismatch
if (diags_type_ != rhs_type_) {
KERNEL_LOG_ERROR("The type of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 6) empty input
if (diags_dimsize.size() == 0 || rhs_dimsize.size() == 0) {
KERNEL_LOG_ERROR("The input is null");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 7) lengths of diags and rhs mismatch
int DimSize1 = diags_shape->GetDimSize(diags_rank - 1);
int RhsSize0 = rhs_shape->GetDimSize(rhs_rank - 2);
if (DimSize1 != RhsSize0) {
KERNEL_LOG_ERROR("The length of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 8) unsupported input data type
if (diags_type_ != DT_FLOAT && diags_type_ != DT_DOUBLE && diags_type_ != DT_COMPLEX64 &&
diags_type_ != DT_COMPLEX128) {
KERNEL_LOG_ERROR("The type of inputs are invalid");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
// Dispatch the template function by data type and choose the computation path according to partial_pivoting.
uint32_t TridiagonalSolveCpuKernel::choosedatatype_(CpuKernelContext &ctx, size_t nth_batch, int i) {
if (partial_pivoting->GetBool()) {
switch (data_type_) {
case DT_FLOAT: {
res = DoCompute1<float>(ctx, nth_batch, i);
break;
}
case DT_DOUBLE: {
res = DoCompute1<double>(ctx, nth_batch, i);
break;
}
case DT_COMPLEX64: {
res = DoCompute1<std::complex<float>>(ctx, nth_batch, i);
break;
}
case DT_COMPLEX128: {
res = DoCompute1<std::complex<double>>(ctx, nth_batch, i);
break;
}
default: {
KERNEL_LOG_ERROR(
"Tridiagonal-solve op support input tensor type: float、double、complex64、complex128,should not be tensor "
"type [%s]",
data_type_);
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
switch (data_type_) {
case DT_FLOAT:
res = DoCompute2<float>(ctx, nth_batch, i);
break;
case DT_DOUBLE:
res = DoCompute2<double>(ctx, nth_batch, i);
break;
case DT_COMPLEX64:
res = DoCompute2<std::complex<float>>(ctx, nth_batch, i);
break;
case DT_COMPLEX128:
res = DoCompute2<std::complex<double>>(ctx, nth_batch, i);
break;
default: {
KERNEL_LOG_ERROR(
"Tridiagonal-solve op support input tensor type: float、double、complex64、complex128,should not be tensor "
"type [%s]",
data_type_);
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
return res;
}
// Compute path used when partial_pivoting is true
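// Sketch of the method implemented below: at step i the pivot |u(i, 0)| is compared with |subdiag(i + 1)|.
// Without a row interchange,
//   factor = subdiag(i + 1) / u(i, 0),
//   u(i + 1, 0) = diag(i + 1) - factor * u(i, 1),
//   x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
// otherwise rows i and i + 1 are swapped first so that the larger entry becomes the pivot. The banded upper
// factor kept in u (diagonal plus up to two superdiagonals) is then solved by back substitution.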
template <typename T>
uint32_t TridiagonalSolveCpuKernel::DoCompute1(CpuKernelContext &ctx, size_t nth_batch, int i) {
// Compute the problem sizes
int rhs_rank = rhs_tensor_->GetTensorShape()->GetDims();
int diags_rank = diags_tensor_->GetTensorShape()->GetDims();
const int batch = rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1);
const int n = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1);
// Compute the start addresses of this batch slice
auto a = static_cast<T *>(diags_tensor_->GetData());
auto b = static_cast<T *>(rhs_tensor_->GetData());
auto value = reinterpret_cast<T *>(output_tensor_->GetData());
if (i == -1) {
a += nth_batch * diags_size;
b += nth_batch * rhs_size;
value += nth_batch * rhs_size;
} else {
a += i * diags_size;
b += i * rhs_size;
value += i * rhs_size;
}
const T zero = 0;
// Intermediate workspace for the elimination
Array<T, Dynamic, 3> u(n, 3);
// Inputs: superdiag, diag and subdiag
Array<T, Dynamic, 1> superdiag(n);
Array<T, Dynamic, 1> diag(n);
Array<T, Dynamic, 1> subdiag(n);
// Input rhs
Array<T, Dynamic, Dynamic> rhs(n, batch);
// Result x
Array<T, Dynamic, Dynamic> x(n, batch);
// Load the input data into the local arrays
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
rhs(i, j) = *(b + i * batch + j);
}
}
for (int i = 0; i < n; i++) {
superdiag(i) = *(a + i);
diag(i) = *(a + n + i);
subdiag(i) = *(a + 2 * n + i);
}
// Forward elimination
u(0, 0) = diag(0);
u(0, 1) = superdiag(0);
x.row(0) = rhs.row(0);
for (int i = 0; i < n - 1; ++i) {
if (abs(u(i, 0)) >= abs(subdiag(i + 1))) {
// No row interchange.
if (u(i, 0) == zero) {
KERNEL_LOG_ERROR("The first element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
const T factor = subdiag(i + 1) / u(i, 0);
u(i + 1, 0) = diag(i + 1) - factor * u(i, 1);
x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
if (i != n - 2) {
u(i + 1, 1) = superdiag(i + 1);
u(i, 2) = 0;
}
} else {
// Interchange rows i and i + 1.
const T factor = u(i, 0) / subdiag(i + 1);
u(i, 0) = subdiag(i + 1);
u(i + 1, 0) = u(i, 1) - factor * diag(i + 1);
u(i, 1) = diag(i + 1);
x.row(i + 1) = x.row(i) - factor * rhs.row(i + 1);
x.row(i) = rhs.row(i + 1);
if (i != n - 2) {
u(i, 2) = superdiag(i + 1);
u(i + 1, 1) = -factor * superdiag(i + 1);
}
}
}
if (u(n - 1, 0) == zero) {
KERNEL_LOG_ERROR("The last element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
// Back substitution: compute the final result and store it at the corresponding output addresses
x.row(n - 1) /= u(n - 1, 0);
for (int j = 0; j < batch; j++) {
*(value + (n - 1) * batch + j) = x(n - 1, j);
}
x.row(n - 2) = (x.row(n - 2) - u(n - 2, 1) * x.row(n - 1)) / u(n - 2, 0);
for (int j = 0; j < batch; j++) {
*(value + (n - 2) * batch + j) = x(n - 2, j);
}
for (int i = n - 3; i >= 0; --i) {
x.row(i) = (x.row(i) - u(i, 1) * x.row(i + 1) - u(i, 2) * x.row(i + 2)) / u(i, 0);
for (int j = 0; j < batch; j++) {
*(value + i * batch + j) = x(i, j);
}
}
// KERNEL_LOG_INFO("TridiagonalSolveCpuKernel::DoCompute end! ");
return KERNEL_STATUS_OK;
}
// Compute path used when partial_pivoting is false
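// Sketch of the Thomas algorithm used below, with a = subdiag, b = diag, c = superdiag, d = rhs:
//   forward sweep:  u(0) = c(0) / b(0),  x(0) = d(0) / b(0),
//                   u(i) = c(i) / (b(i) - a(i) * u(i - 1)),
//                   x(i) = (d(i) - a(i) * x(i - 1)) / (b(i) - a(i) * u(i - 1));
//   back substitution:  x(i) -= u(i) * x(i + 1), for i = n - 2 .. 0.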
template <typename T>
uint32_t TridiagonalSolveCpuKernel::DoCompute2(CpuKernelContext &ctx, size_t nth_batch, int i) {
// Compute the problem sizes
int rhs_rank = rhs_tensor_->GetTensorShape()->GetDims();
int diags_rank = diags_tensor_->GetTensorShape()->GetDims();
const int batch = rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1);
const int n = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1);
// Compute the start addresses of this batch slice
auto a = static_cast<T *>(diags_tensor_->GetData());
auto b = static_cast<T *>(rhs_tensor_->GetData());
auto value = reinterpret_cast<T *>(output_tensor_->GetData());
if (i == -1) {
a += nth_batch * diags_size;
b += nth_batch * rhs_size;
value += nth_batch * rhs_size;
} else {
a += i * diags_size;
b += i * rhs_size;
value += i * rhs_size;
}
// Intermediate workspace
Array<T, Dynamic, 3> u(n, 3);
// Inputs: superdiag, diag and subdiag
Array<T, Dynamic, 1> superdiag(n);
Array<T, Dynamic, 1> diag(n);
Array<T, Dynamic, 1> subdiag(n);
// Input rhs
Array<T, Dynamic, Dynamic> rhs(n, batch);
// Result x
Array<T, Dynamic, Dynamic> x(n, batch);
const T zero = 0;
// Load the inputs and run the forward sweep
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
rhs(i, j) = *(b + i * batch + j);
}
}
for (int i = 0; i < n; i++) {
superdiag(i) = *(a + i);
diag(i) = *(a + n + i);
subdiag(i) = *(a + 2 * n + i);
}
if (diag(0) == zero) {
KERNEL_LOG_ERROR("The first element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
u(0) = superdiag(0) / diag(0);
x.row(0) = rhs.row(0) / diag(0);
for (int i = 1; i < n; ++i) {
auto denom = diag(i) - subdiag(i) * u(i - 1);
if (denom == zero) {
KERNEL_LOG_ERROR("The diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
u(i) = superdiag(i) / denom;
x.row(i) = (rhs.row(i) - subdiag(i) * x.row(i - 1)) / denom;
}
for (int i = n - 2; i >= 0; --i) {
x.row(i) -= u(i) * x.row(i + 1);
}
// Store the final result at the corresponding output addresses
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
*(value + i * batch + j) = x(i, j);
}
}
KERNEL_LOG_INFO("TridiagonalSolveCpuKernel::DoCompute end! ");
return KERNEL_STATUS_OK;
}
// Kernel entry point
uint32_t TridiagonalSolveCpuKernel::Compute(CpuKernelContext &ctx) {
res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
data_size = ctx.Input(0)->NumElements();
matrix_num = ctx.Input(0)->NumElements() / diags_size;
// Decide whether to use multi-threading
if (data_size >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
// Use CpuKernelUtils::GetCPUNum to obtain the number of AI CPU cores
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
// If the number of cores exceeds the number of matrices, cap max_core_num at matrix_num
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
// Lambda executed by each worker thread
auto shared_tridiagonalsolve = [&](size_t start, size_t end) {
for (size_t nth_batch = start; nth_batch < end; nth_batch++) res = choosedatatype_(ctx, nth_batch, -1);
};
CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shared_tridiagonalsolve);
} else {
// If the data size is less than 8K elements, do not slice; compute on a single AI CPU core.
for (size_t nth_batch = 0; nth_batch < matrix_num; nth_batch++) res = choosedatatype_(ctx, -1, nth_batch);
}
return res;
}
// Register the kernel implementation
REGISTER_CPU_KERNEL(TRIDIAGONALSOLVE, TridiagonalSolveCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,46 @@
#ifndef AICPU_KERNELS_NORMALIZED_TRIDIAGONAL_SOLVE_H
#define AICPU_KERNELS_NORMALIZED_TRIDIAGONAL_SOLVE_H
#include "cpu_ops_kernel.h"
#include <vector>
namespace aicpu {
class TridiagonalSolveCpuKernel : public CpuKernel {
public:
TridiagonalSolveCpuKernel() = default;
~TridiagonalSolveCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
Tensor *diags_tensor_ = nullptr;
Tensor *rhs_tensor_ = nullptr;
AttrValue *partial_pivoting = nullptr;
Tensor *output_tensor_ = nullptr;
size_t matrix_num;
int64_t data_size;
uint32_t res;
int32_t diags_rank;
int32_t rhs_rank;
int32_t diags_size;
int32_t rhs_size;
std::shared_ptr<TensorShape> diags_shape;
std::shared_ptr<TensorShape> rhs_shape;
std::vector<int64_t> diags_dimsize;
std::vector<int64_t> rhs_dimsize;
DataType diags_type_ = DT_DOUBLE;
DataType rhs_type_ = DT_DOUBLE;
DataType data_type_ = DT_DOUBLE;
int DimSize0 = 0;
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
uint32_t choosedatatype_(CpuKernelContext &ctx, size_t nth_batch, int i);
template <typename T>
uint32_t DoCompute1(CpuKernelContext &ctx, size_t nth_batch, int i);
template <typename T>
uint32_t DoCompute2(CpuKernelContext &ctx, size_t nth_batch, int i);
};
}  // namespace aicpu
#endif

View File

@ -0,0 +1,132 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "truncated_normal.h"
#include <cmath>
#include <ctime>
#include <iostream>
#include <random>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t kInputDims = 1;
const uint32_t kInputSizes = 2;
const char *kTruncatedNormal = "TruncatedNormal";
} // namespace
namespace aicpu {
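// Output values are produced by rejection sampling: draw from a standard normal distribution and keep only
// samples that fall within two standard deviations of the mean, i.e. inside [-2, 2].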
template <typename T>
uint32_t TruncatedNormalCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto output_nums = output->NumElements();
AttrValue *seed_ptr = ctx.GetAttr("seed");
auto seed_base1 = (seed_ptr == nullptr) ? 0 : (seed_ptr->GetInt());
AttrValue *seed2_ptr = ctx.GetAttr("seed2");
auto seed_base2 = (seed2_ptr == nullptr) ? 0 : (seed2_ptr->GetInt());
auto output_type = output->GetDataType();
auto input_data_nums = input->NumElements();
auto input_data = reinterpret_cast<T *>(input->GetData());
std::vector<int64_t> out_put_dims;
for (auto i = 0; i < input_data_nums; ++i) {
if (*(input_data + i) <= 0) {
KERNEL_LOG_ERROR("Shape elements must be > 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
out_put_dims.push_back(input_data[i]);
}
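// Seed priority: the attribute "seed2" wins if it is non-zero, then "seed", otherwise a value drawn from
// std::random_device is used.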
std::random_device rd;
size_t seedc = seed_base2 != 0 ? seed_base2 : (seed_base1 != 0 ? seed_base1 : rd());
std::default_random_engine final_seed(seedc);
if (output_type == DT_FLOAT16) {
auto output_data = reinterpret_cast<Eigen::half *>(output->GetData());
std::normal_distribution<float> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = static_cast<Eigen::half>(data);
++j;
}
}
} else if (output_type == DT_FLOAT) {
auto output_data = reinterpret_cast<float_t *>(output->GetData());
std::normal_distribution<float> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = data;
++j;
}
}
} else {
auto output_data = reinterpret_cast<double_t *>(output->GetData());
std::normal_distribution<double> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = data;
++j;
}
}
}
output->GetTensorShape()->SetDimSizes(out_put_dims);
return KERNEL_STATUS_OK;
}
uint32_t TruncatedNormalCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto input_data_nums = input->NumElements();
KERNEL_CHECK_FALSE((input_data_nums >= kInputSizes), KERNEL_STATUS_PARAM_INVALID, "Input data elements must be >= 2.");
KERNEL_CHECK_FALSE((input->GetTensorShape()->GetDimSizes().size() == kInputDims), KERNEL_STATUS_PARAM_INVALID,
"Input tensor must be a 1-D tensor.");
auto input_datatype = input->GetDataType();
auto output_datatype = output->GetDataType();
KERNEL_CHECK_FALSE((input_datatype == DT_INT32 || input_datatype == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"Input type must be int64 or int32.");
KERNEL_CHECK_FALSE((output_datatype == DT_FLOAT16 || output_datatype == DT_FLOAT || output_datatype == DT_DOUBLE),
KERNEL_STATUS_PARAM_INVALID, "Out put type must be one of float16, float32 or double.");
return KERNEL_STATUS_OK;
}
uint32_t TruncatedNormalCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check params failed.", kTruncatedNormal);
KERNEL_HANDLE_ERROR(DataAndTypeCheck(ctx), " TruncatedNormal input elements value check failed.");
auto input_datatype = ctx.Input(0)->GetDataType();
uint32_t ret;
switch (input_datatype) {
case DT_INT32:
ret = DoCompute<int32_t>(ctx);
break;
case DT_INT64:
ret = DoCompute<int64_t>(ctx);
break;
default: {
KERNEL_LOG_WARN("TruncatedNormal kernel data type [%s] not support.", DTypeStr(input_datatype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), ret, "Compute failed.");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTruncatedNormal, TruncatedNormalCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRUNCATEDNORMAL_H_
#define AICPU_KERNELS_NORMALIZED_TRUNCATEDNORMAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TruncatedNormalCpuKernel : public CpuKernel {
public:
TruncatedNormalCpuKernel() = default;
~TruncatedNormalCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -36,6 +36,66 @@ const T SubtleMustCopy(const T &x) {
}
} // namespace aicpu
namespace aicpu {
class DimComparator {
public:
DimComparator(const TTypes<int64_t>::Matrix &ix, const std::vector<int64_t> &order, const std::vector<int64_t> &shape)
: ix_(ix), order_(order), dims_(shape.size()) {}
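// Returns true when index row i of ix_ sorts before row j, comparing the dimensions lexicographically in
// the order given by order_.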
inline bool operator()(const int64_t i, const int64_t j) const {
for (int di = 0; di < dims_; ++di) {
const int64_t d = order_[di];
if (ix_(i, d) < ix_(j, d)) return true;
if (ix_(i, d) > ix_(j, d)) return false;
}
return false;
}
// Compares two indices taken from corresponding index matrices, using the
// standard, row-major (or lexicographic) order. Useful for cases that need
// to distinguish between all three orderings (<, ==, >).
inline static int cmp(const TTypes<int64_t>::ConstMatrix &a_idx, const TTypes<int64_t>::ConstMatrix &b_idx,
const int64_t a_row, const int64_t b_row, const int dims) {
for (int d = 0; d < dims; ++d) {
const int64_t a = a_idx(a_row, d);
const int64_t b = b_idx(b_row, d);
if (a < b) {
return -1;
} else if (a > b) {
return 1;
}
}
return 0;
}
protected:
const TTypes<int64_t>::Matrix ix_;
const std::vector<int64_t> order_;
const int64_t dims_;
const std::vector<int64_t> *ix_order_;
};
template <int ORDER_DIM>
class FixedDimComparator : DimComparator {
public:
FixedDimComparator(const TTypes<int64_t>::Matrix &ix, const std::vector<int64_t> &order,
const std::vector<int64_t> &shape)
: DimComparator(ix, order, shape) {}
inline bool operator()(const int64_t i, const int64_t j) const {
bool value = false;
for (int di = 0; di < ORDER_DIM; ++di) {
const int64_t d = order_[di];
if (ix_(i, d) < ix_(j, d)) {
value = true;
break;
}
if (ix_(i, d) > ix_(j, d)) break;
}
return value;
}
};
} // namespace aicpu
namespace aicpu {
class SparseTensor {
public:
@ -58,6 +118,61 @@ class SparseTensor {
*/
uint32_t IndicesValid(CpuKernelContext &ctx) const;
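/*
 * reorder indices and values in place
 * Sorts the stored indices (and their matching values) so that they follow the given dimension order,
 * e.g. row-major order when order is 0, 1, ..., dims - 1.
 * @param order: dimension order to sort by; its length must equal the SparseTensor rank
 */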
template <typename T>
void Reorder(const std::vector<int64_t> &order) {
int32_t order_size = static_cast<int32_t>(order.size());
if (order_size != dims_) {
KERNEL_LOG_ERROR("Order length must be SparseTensor rank");
}
auto ix_t = ix_->matrix<int64_t>();
auto vals_t = vals_->vec<T>();
std::vector<int64_t> reorder(dims_);
std::iota(reorder.begin(), reorder.end(), 0);
// Sort to get order of indices
switch (order.size()) {
#define CASE_SORT(ORDER_SIZE) \
case (ORDER_SIZE): { \
FixedDimComparator<(ORDER_SIZE)> sorter(ix_t, order, shape_); \
std::sort(reorder.begin(), reorder.end(), sorter); \
break; \
}
CASE_SORT(0);
CASE_SORT(1);
CASE_SORT(2);
CASE_SORT(3);
CASE_SORT(4);
CASE_SORT(5);
#undef CASE_SORT
default: {
DimComparator sorter(ix_t, order, shape_);
std::sort(reorder.begin(), reorder.end(), sorter);
}
}
// We have a forward reordering, but what we'll need is a
// permutation (the inverse). This can be calculated with O(1) additional
// space and O(n) time (INVPERM), but we just do the simple thing here.
std::vector<size_t> permutation(reorder.size());
for (std::size_t n = 0; n < reorder.size(); ++n) {
permutation[reorder[n]] = n;
}
// Update indices & values by converting the permutations to
// a product of transpositions. Iterate over the cycles in the
// permutation, and convert each of those into a product of
// transpositions (swaps):
// https://en.wikipedia.org/wiki/Cyclic_permutation
// This is N swaps, 2*N comparisons.
for (std::size_t n = 0; n + 1 < permutation.size(); ++n) {
while (n != permutation[n]) {
std::size_t r = permutation[n];
std::swap_ranges(&(ix_t(n, 0)), &(ix_t(n + 1, 0)), &(ix_t(r, 0)));
std::swap(vals_t(n), vals_t(r));
std::swap(permutation[n], permutation[r]);
}
}
order_.assign(order.begin(), order.end());
}
/*
* group sparse tensor
* @return GroupIterable