merge canndev code to mindspore
parent aacab0ca60
commit 6920953b56
@ -135,4 +135,5 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/indentation_namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/line_length"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/semicolon"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/nolint"
@ -350,3 +350,8 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_bicubic_grad.cc:aicpu::ResizeBicubicGrad
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc:aicpu::SegmentMaxCpuKernel::SegmentMaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc:aicpu::ExtractGlimpseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_grad_with_argmax.cc:aicpu::MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_power.cc:aicpu::MatrixPowerCpuKernel::ComputeKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d_grad.cc:aicpu::MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_with_argmax.cc:aicpu::MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/layer_norm_grad_grad.cc:aicpu::LayerNormGradGradCpuKernel::LayerNormGradGradCompute
@ -0,0 +1,226 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "layer_norm_grad_grad.h"

#include <cmath>
#include <numeric>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;

namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 8;
const char *kLayerNormGradGrad = "LayerNormGradGrad";

#define LAYERNORMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX, NUM)        \
  case (DTYPE): {                                                    \
    uint32_t result = LayerNormGradGradCompute<TYPE>(CTX, NUM);      \
    if (result != KERNEL_STATUS_OK) {                                \
      KERNEL_LOG_ERROR("LayerNormGradGrad kernel compute failed.");  \
      return result;                                                 \
    }                                                                \
    break;                                                           \
  }

#define SWITCH_PARALLEL(SHARD, data_num, thread_num)                              \
  if (data_num <= ParallelDataNums) {                                             \
    for (size_t i = 0; i < thread_num; i++) {                                     \
      SHARD(i, i + 1);                                                            \
    }                                                                             \
  } else {                                                                        \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, thread_num, 1, SHARD),   \
                        "LayerNormGradGrad ParallelFor Compute failed.");         \
  }
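// SWITCH_PARALLEL dispatches the shard either serially or through the aicpu
// thread pool: when data_num is at or below ParallelDataNums, every slice
// (i, i + 1) runs inline on the calling thread to avoid scheduling overhead;
// otherwise CpuKernelUtils::ParallelFor spreads the thread_num slices across cores.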
Eigen::half sqrt(Eigen::half &data) { return Eigen::half_impl::sqrt(data); }
}  // namespace

namespace aicpu {
uint32_t LayerNormGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
                      "LayerNormGradGrad check input and output number failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx, 512)
    LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx, 4 * 1024)
    default:
      KERNEL_LOG_ERROR("LayerNormGradGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LayerNormGradGradCpuKernel::LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_dy = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto input_var = reinterpret_cast<T *>(ctx.Input(2)->GetData());
  auto input_mean = reinterpret_cast<T *>(ctx.Input(3)->GetData());
  auto input_gamma = reinterpret_cast<T *>(ctx.Input(4)->GetData());
  auto input_d_dx = reinterpret_cast<T *>(ctx.Input(5)->GetData());
  auto input_d_dg = reinterpret_cast<T *>(ctx.Input(6)->GetData());
  auto input_d_db = reinterpret_cast<T *>(ctx.Input(7)->GetData());

  auto output_sopd_x = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto output_sopd_dy = reinterpret_cast<T *>(ctx.Output(1)->GetData());
  auto output_sopd_g = reinterpret_cast<T *>(ctx.Output(2)->GetData());

  size_t num = static_cast<size_t>(ctx.Input(0)->NumElements());
  size_t g_num = static_cast<size_t>(ctx.Input(4)->NumElements());
  size_t mean_num = static_cast<size_t>(ctx.Input(3)->NumElements());

  KERNEL_CHECK_FALSE((g_num > 0), KERNEL_STATUS_PARAM_INVALID, "gamma should not be empty");

  T *inv_std = new T[mean_num];
  for (size_t i = 0; i < mean_num; i++) {
    if (input_var[i] <= T(0)) {
      KERNEL_LOG_ERROR("variance must be greater than zero");
      delete[] inv_std;
      return KERNEL_STATUS_PARAM_INVALID;
    }
    inv_std[i] = T(1) / sqrt(input_var[i]);
  }

  T *x_hat = new T[num];
  T *dy_gamma = new T[num];
  T *sum1 = new T[mean_num];
  std::fill_n(sum1, mean_num, T(0));
  T *sum2 = new T[mean_num];
  std::fill_n(sum2, mean_num, T(0));
  T *sum3 = new T[mean_num];
  std::fill_n(sum3, mean_num, T(0));
  T *sum4 = new T[mean_num];
  std::fill_n(sum4, mean_num, T(0));

  auto shard_inner_mean = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx = i / g_num;
        sum1[sum_idx] -= inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
        T cur_x_hat = (input_x[i] - input_mean[sum_idx]) * inv_std[sum_idx];
        x_hat[i] = cur_x_hat;
        sum2[sum_idx] -= cur_x_hat * inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
        T cur_dy_gamma = input_dy[i] * input_gamma[g_idx];
        dy_gamma[i] = cur_dy_gamma;
        sum3[sum_idx] += cur_dy_gamma / static_cast<T>(g_num);
        sum4[sum_idx] += cur_dy_gamma * cur_x_hat / static_cast<T>(g_num);
      }
    }
  };
  SWITCH_PARALLEL(shard_inner_mean, num, mean_num);
  T *sum5 = new T[mean_num];
  std::fill_n(sum5, mean_num, T(0));
  T *sum6 = new T[mean_num];
  std::fill_n(sum6, mean_num, T(0));
  T *sum7 = new T[mean_num];
  std::fill_n(sum7, mean_num, T(0));
  T *part3 = new T[num];

  auto shard_outer_mean = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T part_sum1 = dy_gamma[i] - sum3[sum_idx] - x_hat[i] * sum4[sum_idx];
        T part_sum2 = dy_gamma[i] * sum2[sum_idx] - sum4[sum_idx] * input_d_dx[i] * inv_std[sum_idx] +
                      input_dy[i] * input_d_dg[g_idx];
        sum5[sum_idx] += input_d_dx[i] * part_sum1 / static_cast<T>(g_num);
        sum6[sum_idx] += (input_x[i] - input_mean[sum_idx]) * part_sum2 / static_cast<T>(g_num);
        T cur_part3 = inv_std[sum_idx] * part_sum2;
        part3[i] = cur_part3;
        sum7[sum_idx] -= cur_part3 / static_cast<T>(g_num);
      }
    }
  };
  SWITCH_PARALLEL(shard_outer_mean, num, mean_num);
  delete[] sum3;
  delete[] sum4;
  delete[] dy_gamma;

  auto shard_input_prop = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T cur_part4 = -x_hat[i] * inv_std[sum_idx] * inv_std[sum_idx] * (sum5[sum_idx] + sum6[sum_idx]);
        output_sopd_x[i] = part3[i] + cur_part4 + sum7[sum_idx];
        T cur_part5 = input_gamma[g_idx] * input_d_dx[i] * inv_std[sum_idx];
        T cur_part6 = input_gamma[g_idx] * sum1[sum_idx];
        T cur_part7 = input_gamma[g_idx] * x_hat[i] * sum2[sum_idx];
        T cur_part8 = x_hat[i] * input_d_dg[g_idx];
        output_sopd_dy[i] = cur_part5 + cur_part6 + cur_part7 + cur_part8 + input_d_db[g_idx];
      }
    }
  };
  SWITCH_PARALLEL(shard_input_prop, num, mean_num);
  delete[] sum5;
  delete[] sum6;
  delete[] sum7;
  std::fill_n(output_sopd_g, g_num, T(0));

  auto shard_param_prop = [&](size_t start, size_t end) {
    for (size_t g_idx = start; g_idx < end; g_idx++) {
      for (size_t sum_idx = 0; sum_idx < mean_num; sum_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T cur_part9 = input_dy[i] * x_hat[i] * sum2[sum_idx];
        T cur_part10 = input_dy[i] * sum1[sum_idx];
        T cur_part11 = input_dy[i] * input_d_dx[i] * inv_std[sum_idx];
        output_sopd_g[g_idx] += cur_part9 + cur_part10 + cur_part11;
      }
    }
  };
  SWITCH_PARALLEL(shard_param_prop, num, g_num);

  delete[] sum1;
  delete[] sum2;
  delete[] inv_std;
  delete[] x_hat;
  delete[] part3;
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLayerNormGradGrad, LayerNormGradGradCpuKernel);
}  // namespace aicpu
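The compute path above manages its scratch buffers with raw new[]/delete[], which is why every early error return has to free whatever was already allocated. A hedged alternative sketch (not the kernel's actual code) using std::unique_ptr, so the buffers release themselves on any exit path:

#include <memory>

// Hypothetical scratch holder: the arrays free themselves when the enclosing
// function returns, including on early KERNEL_STATUS_PARAM_INVALID returns.
template <typename T>
struct LayerNormScratch {
  std::unique_ptr<T[]> inv_std;
  std::unique_ptr<T[]> x_hat;
  LayerNormScratch(size_t mean_num, size_t num) : inv_std(new T[mean_num]), x_hat(new T[num]) {}
};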
@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_
#define AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
#include "utils/eigen_tensor.h"

namespace aicpu {

class LayerNormGradGradCpuKernel : public CpuKernel {
 public:
  LayerNormGradGradCpuKernel() = default;
  ~LayerNormGradGradCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t LayerNormGradGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums);
};
}  // namespace aicpu

#endif
@ -0,0 +1,256 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "log.h"

#include <cmath>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kLog = "Log";

#define LOG_COMPUTE_CASE(DTYPE, TYPE, CTX)             \
  case (DTYPE): {                                      \
    uint32_t result = LogCompute<TYPE>(CTX);           \
    if (result != KERNEL_STATUS_OK) {                  \
      KERNEL_LOG_ERROR("Log kernel compute failed.");  \
      return result;                                   \
    }                                                  \
    break;                                             \
  }

#define LOG_COMPUTE_CASE2(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                      \
    uint32_t result = LogCompute2(CTX);                \
    if (result != KERNEL_STATUS_OK) {                  \
      KERNEL_LOG_ERROR("Log kernel compute failed.");  \
      return result;                                   \
    }                                                  \
    break;                                             \
  }

#define LOG_COMPUTE_CASE3(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                      \
    uint32_t result = LogCompute3<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                  \
      KERNEL_LOG_ERROR("Log kernel compute failed.");  \
      return result;                                   \
    }                                                  \
    break;                                             \
  }
}  // namespace

namespace aicpu {
uint32_t LogCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog);
  KERNEL_HANDLE_ERROR(LogCheck(ctx), "[%s] check params failed.", kLog);
  DataType data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LOG_COMPUTE_CASE2(DT_FLOAT16, Eigen::half, ctx)
    LOG_COMPUTE_CASE(DT_FLOAT, float, ctx)
    LOG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    LOG_COMPUTE_CASE3(DT_COMPLEX64, std::complex<float>, ctx)
    LOG_COMPUTE_CASE3(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("Log kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogCpuKernel::LogCheck(CpuKernelContext &ctx) {
  auto input_0 = ctx.Input(0);
  auto output_0 = ctx.Output(0);
  KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
  KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
  KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
  std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
  size_t shape_size = shape_x.size();
  KERNEL_CHECK_FALSE((shape_size > 0), KERNEL_STATUS_PARAM_INVALID, "Input must be at least rank 1, got [%zu].",
                     shape_x.size())
  KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
                     "Input last dimension must be at least 1.")
  AttrValue *base_ptr = ctx.GetAttr("base");
  KERNEL_CHECK_NULLPTR(base_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr base failed.");
  float base_ = base_ptr->GetFloat();
  KERNEL_CHECK_FALSE(((base_ > 0 && base_ != 1.0) || base_ == -1.0), KERNEL_STATUS_PARAM_INVALID,
                     "Attr base must be -1.0, or greater than 0 and not equal to 1, but got attr base[%f]", base_);
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  KERNEL_CHECK_NULLPTR(scale_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr scale failed.");
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  KERNEL_CHECK_NULLPTR(shift_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr shift failed.");
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogCpuKernel::LogCompute(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  AttrValue *base_ptr = ctx.GetAttr("base");
  T base_ = static_cast<T>(base_ptr->GetFloat());
  if (base_ == static_cast<T>(-1.0)) {
    base_ = static_cast<T>(std::exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  T scale_ = static_cast<T>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  T shift_ = static_cast<T>(shift_ptr->GetFloat());

  size_t data_num = ctx.Input(0)->NumElements();
  if (data_num <= 4 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      if (*(input_x + i) <= static_cast<T>(0)) {
        KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        if (*(input_x + i) <= static_cast<T>(0)) {
          KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
          return KERNEL_STATUS_PARAM_INVALID;
        }
        *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
      }
      return KERNEL_STATUS_OK;
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogCpuKernel::LogCompute2(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<Eigen::half *>(ctx.Output(0)->GetData());
  size_t data_num = ctx.Input(0)->NumElements();
  for (size_t i = 0; i < data_num; i++) {
    if (*(input_x + i) <= static_cast<Eigen::half>(0)) {
      KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  AttrValue *base_ptr = ctx.GetAttr("base");
  Eigen::half base_ = static_cast<Eigen::half>(base_ptr->GetFloat());
  if (base_ == static_cast<Eigen::half>(-1.0)) {
    base_ = static_cast<Eigen::half>(std::exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  Eigen::half scale_ = static_cast<Eigen::half>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  Eigen::half shift_ = static_cast<Eigen::half>(shift_ptr->GetFloat());

  typedef Eigen::Array<Eigen::half, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
  ArrayxXd array_x(1, data_num);
  ArrayxXd array_y(1, data_num);
  ArrayxXd array_z(1, 1);
  for (size_t i = 0; i < data_num; i++) {
    array_x(0, i) = *(input_x + i);
  }
  array_x = array_x * scale_;
  array_x = array_x + shift_;
  array_y = array_x.log();
  array_z(0, 0) = base_;
  array_z = array_z.log();
  if (data_num <= 8 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      *(output_y + i) = array_y(0, i) / array_z(0, 0);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        *(output_y + i) = array_y(0, i) / array_z(0, 0);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogCpuKernel::LogCompute3(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  size_t data_num = ctx.Input(0)->NumElements();
  AttrValue *base_ptr = ctx.GetAttr("base");
  T base_ = static_cast<T>(base_ptr->GetFloat());
  if (base_ == static_cast<T>(-1.0)) {
    base_ = static_cast<T>(std::exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  T scale_ = static_cast<T>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  T shift_ = static_cast<T>(shift_ptr->GetFloat());

  if (data_num <= 4 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      if (*(input_x + i) == static_cast<T>(0)) {
        KERNEL_LOG_ERROR("Input[%llu] must not be 0.", i);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        if (*(input_x + i) == static_cast<T>(0)) {
          KERNEL_LOG_ERROR("Input[%llu] must not be 0.", i);
          return KERNEL_STATUS_PARAM_INVALID;
        }
        *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
      }
      return KERNEL_STATUS_OK;
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLog, LogCpuKernel);
}  // namespace aicpu
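All three Log compute paths evaluate the same expression, y = log_base(scale * x + shift), via the change-of-base identity log_b(v) = ln(v) / ln(b), with base == -1.0 standing for the natural base e. A minimal standalone sketch of the scalar math (plain C++, independent of the kernel framework):

#include <cmath>

// log_base(scale * x + shift) via change of base; base == -1.0 means e,
// matching the kernel's attribute convention.
double ScaledLog(double x, double base, double scale, double shift) {
  if (base == -1.0) base = std::exp(1.0);
  return std::log(x * scale + shift) / std::log(base);
}
// Example: ScaledLog(100.0, 10.0, 1.0, 0.0) == 2.0, i.e. log10(100).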
@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOG_H_
#define AICPU_KERNELS_NORMALIZED_LOG_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LogCpuKernel : public CpuKernel {
 public:
  LogCpuKernel() = default;
  ~LogCpuKernel() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t LogCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogCompute(CpuKernelContext &ctx);

  uint32_t LogCompute2(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogCompute3(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@ -0,0 +1,160 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "logspace.h"

#include <cmath>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
constexpr uint32_t kLogSpaceInputNum = 2;
constexpr uint32_t kLogSpaceOutputNum = 1;
const char *kLogSpace = "LogSpace";

#define LOGSPACE_COMPUTE_CASE(DTYPE, TYPE, CTX)             \
  case (DTYPE): {                                           \
    uint32_t result = LogSpaceCompute<TYPE>(CTX);           \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("LogSpace kernel compute failed.");  \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
}  // namespace

namespace aicpu {
uint32_t LogSpaceCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLogSpaceInputNum, kLogSpaceOutputNum), "[%s] check input and output failed.",
                      kLogSpace);
  KERNEL_HANDLE_ERROR(LogSpaceCheck(ctx), "[%s] check params failed.", kLogSpace);
  DataType data_type = ctx.Output(0)->GetDataType();
  switch (data_type) {
    LOGSPACE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    LOGSPACE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    default:
      KERNEL_LOG_ERROR("LogSpace kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogSpaceCpuKernel::LogSpaceCheck(CpuKernelContext &ctx) {
  // get Attr steps_attr
  AttrValue *steps_attr_ptr = ctx.GetAttr("steps");
  if (steps_attr_ptr) {
    int64_t steps_data = steps_attr_ptr->GetInt();
    KERNEL_CHECK_FALSE((steps_data >= 0), KERNEL_STATUS_PARAM_INVALID,
                       "Attr [steps] data has to be greater than or equal to 0.");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogSpaceCpuKernel::LogSpaceCompute(CpuKernelContext &ctx) {
  DataType data_type_in = ctx.Input(0)->GetDataType();
  DataType data_type = ctx.Output(0)->GetDataType();
  if (data_type_in == data_type) {
    auto *input_start_ = reinterpret_cast<T *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<T *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      double input_start_value = input_start;
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start_value) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  } else if (data_type_in == DT_FLOAT) {
    auto *input_start_ = reinterpret_cast<float *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<float *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      double input_start_value = input_start;
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start_value) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  } else if (data_type_in == DT_FLOAT16) {
    auto *input_start_ = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<Eigen::half *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLogSpace, LogSpaceCpuKernel);
}  // namespace aicpu
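LogSpaceCompute fills the output with a geometric progression: with step b = (end - start) / (steps - 1) and ratio q = base^b, element i is base^start * q^i = base^(start + i*b); steps == 1 degenerates to the single value base^start. A minimal sketch of that recurrence (a hypothetical standalone helper, not the kernel API):

#include <cmath>
#include <cstdint>
#include <vector>

// base^(start + i * b) for i in [0, steps), using the kernel's q = base^b trick.
std::vector<double> LogSpaceValues(double start, double end, int64_t steps, double base) {
  if (steps == 1) return {std::pow(base, start)};
  std::vector<double> out(steps);
  double b = (end - start) / (steps - 1);
  double q = std::pow(base, b);
  for (int64_t i = 0; i < steps; i++) {
    out[i] = std::pow(base, start) * std::pow(q, i);
  }
  return out;
}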
@ -0,0 +1,37 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOGSPACE_H_
#define AICPU_KERNELS_NORMALIZED_LOGSPACE_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LogSpaceCpuKernel : public CpuKernel {
 public:
  LogSpaceCpuKernel() = default;
  ~LogSpaceCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t LogSpaceCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogSpaceCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@ -0,0 +1,126 @@
/**
 * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_inverse.h"
#include <complex>
#include <vector>
#include "Eigen/Core"
#include "Eigen/LU"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kMatrixInverse = "MatrixInverse";
// if the data size is larger than the value, call ParallelFor() func
constexpr int64_t kParallelDataNums = 1 * 1024;

#define MATRIXINVERSE_COMPUTE_CASE(DTYPE, TYPE, CTX)             \
  case (DTYPE): {                                                \
    uint32_t result = MatrixInverseCompute<TYPE>(CTX);           \
    if (result != KERNEL_STATUS_OK) {                            \
      KERNEL_LOG_ERROR("MatrixInverse kernel compute failed.");  \
      return result;                                             \
    }                                                            \
    break;                                                       \
  }
}  // namespace

namespace aicpu {
uint32_t MatrixInverseCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixInverse check input and output number failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    MATRIXINVERSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("MatrixInverse kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MatrixInverseCpuKernel::MatrixInverseCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  T *input_ptr = reinterpret_cast<T *>(input->GetData());
  Tensor *output = ctx.Output(0);
  T *output_ptr = reinterpret_cast<T *>(output->GetData());
  // Check that the input has rank >= 2 and square trailing dimensions
  auto shape = input->GetTensorShape();
  uint64_t data_size = input->GetDataSize();
  std::vector<int64_t> dims = shape->GetDimSizes();
  KERNEL_CHECK_FALSE((dims.size() >= 2 && (*(dims.end() - 1) == *(dims.end() - 2))), KERNEL_STATUS_PARAM_INVALID,
                     "Input must have rank >= 2 with equal last two dimensions");
  auto last_dimsize = *(dims.end() - 1);
  // Output length
  auto input_num = input->NumElements();
  size_t matrix_size = last_dimsize * last_dimsize;
  // Number of matrices
  size_t matrix_num = input_num / matrix_size;
  // Store two-dimensional array of data for slicing
  std::vector<std::vector<T>> temp(matrix_num, std::vector<T>(matrix_size));
  for (size_t i = 0; i < matrix_num; i++) {
    for (size_t j = 0; j < matrix_size; j++) {
      temp[i][j] = *(input_ptr + i * matrix_size + j);
    }
  }
  // Get the value of the attribute adjoint
  AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
  bool adjoint = adjoint_attr->GetBool();
  if (data_size <= kParallelDataNums) {
    for (size_t i = 0; i < matrix_num; i++) {
      Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
                                                                               last_dimsize);
      Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
                                                                                last_dimsize, last_dimsize);
      if (adjoint) {
        eigen_input = eigen_input.adjoint().eval();
      }
      Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
      eigen_output = lu.inverse();
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > matrix_num) {
      max_core_num = matrix_num;
    }
    auto sharedcompute = [&](size_t start, size_t end) {
      for (auto i = start; i < end; i++) {
        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
                                                                                 last_dimsize);
        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
                                                                                  last_dimsize, last_dimsize);
        if (adjoint) {
          eigen_input = eigen_input.adjoint().eval();
        }
        Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
        eigen_output = lu.inverse();
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharedcompute),
                        "Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMatrixInverse, MatrixInverseCpuKernel);
}  // namespace aicpu
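Each batch slice is inverted independently: the flattened slice is mapped into an Eigen matrix, conjugate-transposed first when adjoint is set, then inverted through a full-pivot LU decomposition. A minimal Eigen sketch of the per-slice step (assumes Eigen headers; an illustration, not the kernel entry point):

#include <Eigen/Core>
#include <Eigen/LU>

// Invert one square slice the way the kernel does per matrix.
Eigen::MatrixXd InvertSlice(Eigen::MatrixXd m, bool adjoint) {
  if (adjoint) m = m.adjoint().eval();      // conjugate transpose first if requested
  Eigen::FullPivLU<Eigen::MatrixXd> lu(m);  // full pivoting is rank-revealing
  return lu.inverse();
}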
@ -0,0 +1,37 @@
/**
 * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MatrixInverseCpuKernel : public CpuKernel {
 public:
  MatrixInverseCpuKernel() = default;
  ~MatrixInverseCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  static uint32_t MatrixInverseCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@ -0,0 +1,198 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_power.h"

#include <Eigen/Dense>
#include <algorithm>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const char *kMatrixPower = "MatrixPower";
const int64_t kParallelDataNum = 4 * 1024;
}  // namespace

namespace aicpu {
uint32_t MatrixPowerCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixPower normal check failed.");
  auto x_type = ctx.Input(0)->GetDataType();
  if (x_type == DT_FLOAT) {
    return ComputeKernel<float>(ctx);
  } else if (x_type == DT_FLOAT16) {
    return ComputeKernel<Eigen::half>(ctx);
  }
  KERNEL_LOG_ERROR("MatrixPower kernel data type [%s] not support.", DTypeStr(x_type).c_str());
  return KERNEL_STATUS_PARAM_INVALID;
}

template <typename T>
uint32_t MatrixPowerCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
  Tensor *input_x = ctx.Input(0);
  Tensor *output_y = ctx.Output(0);
  AttrValue *power = ctx.GetAttr("n");
  int64_t powervalue = power->GetInt();
  auto x_shape = input_x->GetTensorShape();
  size_t batch = x_shape->GetDimSize(0);
  size_t dim = x_shape->GetDimSize(1);
  auto x_ptr = reinterpret_cast<T *>(input_x->GetData());
  auto y_ptr = reinterpret_cast<T *>(output_y->GetData());
  int64_t data_num = ctx.Input(0)->NumElements() * sizeof(T);

  if (powervalue < 0) {
    powervalue = -powervalue;
    if (data_num >= kParallelDataNum) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (max_core_num > batch) {
        max_core_num = batch;
      }
      if (max_core_num == 0) {
        max_core_num = 1;
      }
      int64_t NotInvertible = -1;
      auto shard_matrix_power = [&](size_t start, size_t end) {
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
        for (size_t i = start; i < end; i++) {
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
            }
          }
          Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
          if (!(LU.isInvertible())) {
            NotInvertible = i;
          }
          A = LU.inverse();
          B.setIdentity();
          int64_t n = powervalue;
          while (n > 0) {
            if (n % 2 == 1) {
              B = B * A;
            }
            n = n / 2;
            A = A * A;
          }
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
            }
          }
        }
      };
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
      KERNEL_CHECK_FALSE((NotInvertible < 0), KERNEL_STATUS_PARAM_INVALID,
                         "The %lld-th matrix of input tensor is singular, but got n is negative.", NotInvertible)
    } else {
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
      for (size_t i = 0; i < batch; i++) {
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
          }
        }
        Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
        KERNEL_CHECK_FALSE((LU.isInvertible()), KERNEL_STATUS_PARAM_INVALID,
                           "The %zu-th matrix of input tensor is singular, but got n is negative.", i)
        A = LU.inverse();
        B.setIdentity();
        int64_t n = powervalue;
        while (n > 0) {
          if (n % 2 == 1) {
            B = B * A;
          }
          n = n / 2;
          A = A * A;
        }
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
          }
        }
      }
    }
  } else {
    if (data_num >= kParallelDataNum) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (max_core_num > batch) {
        max_core_num = batch;
      }
      if (max_core_num == 0) {
        max_core_num = 1;
      }
      auto shard_matrix_power = [&](size_t start, size_t end) {
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
        for (size_t i = start; i < end; i++) {
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
            }
          }
          B.setIdentity();
          int64_t n = powervalue;
          while (n > 0) {
            if (n % 2 == 1) {
              B = B * A;
            }
            n = n / 2;
            A = A * A;
          }
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
            }
          }
        }
      };
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
    } else {
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
      for (size_t i = 0; i < batch; i++) {
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
          }
        }
        B.setIdentity();
        int64_t n = powervalue;
        while (n > 0) {
          if (n % 2 == 1) {
            B = B * A;
          }
          n = n / 2;
          A = A * A;
        }
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
          }
        }
      }
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMatrixPower, MatrixPowerCpuKernel);
}  // namespace aicpu
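The while-loop in ComputeKernel is binary exponentiation: A is squared each round and multiplied into the accumulator B whenever the current low bit of n is set, so A^n takes O(log n) matrix products; a negative n is handled by inverting A first. The same loop on scalars, as a minimal sketch:

#include <cstdint>

// Exponentiation by squaring, the scalar analogue of the kernel's
// "B = B * A when n is odd; A = A * A; n /= 2" loop (assumes n >= 0).
int64_t IntPower(int64_t a, int64_t n) {
  int64_t result = 1;  // plays the role of B.setIdentity()
  while (n > 0) {
    if (n % 2 == 1) result *= a;
    n /= 2;
    a *= a;
  }
  return result;
}
// Example: IntPower(3, 5) == 243; the squarings for bits 0 and 2 of n multiply in.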
@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MatrixPowerCpuKernel : public CpuKernel {
 public:
  MatrixPowerCpuKernel() = default;
  ~MatrixPowerCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@ -0,0 +1,160 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_solve.h"

#include <complex>
#include "Eigen/Core"
#include "Eigen/LU"
#include "unsupported/Eigen/CXX11/Tensor"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kMatrixSolve = "MatrixSolve";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 8 * 1024;
const int64_t kParallelDataNumSameShapeMid = 128 * 1024;
}  // namespace

namespace aicpu {
uint32_t MatrixSolveCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolve check input and output number failed.");
  KERNEL_HANDLE_ERROR(MatrixSolveDataAndTypeCheck(ctx), "MatrixSolve check input and output params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    case DT_FLOAT:
      return MatrixSolveCompute<float>(ctx);
    case DT_DOUBLE:
      return MatrixSolveCompute<double>(ctx);
    case DT_COMPLEX64:
      return MatrixSolveCompute<std::complex<float>>(ctx);
    case DT_COMPLEX128:
      return MatrixSolveCompute<std::complex<double>>(ctx);
    default:
      KERNEL_LOG_ERROR("MatrixSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MatrixSolveCpuKernel::MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx) {
  DataType matrix_type = ctx.Input(0)->GetDataType();
  DataType rhs_type = ctx.Input(1)->GetDataType();
  KERNEL_CHECK_FALSE((matrix_type == rhs_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] must be the same as input1 [%s].",
                     DTypeStr(matrix_type).c_str(), DTypeStr(rhs_type).c_str())

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MatrixSolveCpuKernel::MatrixSolveCompute(CpuKernelContext &ctx) {
  auto input0_tensor = ctx.Input(0);
  auto input0_tensor_shape = input0_tensor->GetTensorShape();
  auto input1_tensor = ctx.Input(1);
  auto input1_tensor_shape = input1_tensor->GetTensorShape();
  auto input0_data = reinterpret_cast<T *>(input0_tensor->GetData());
  auto input1_data = reinterpret_cast<T *>(input1_tensor->GetData());
  auto input0_shape = input0_tensor_shape->GetDimSizes();
  int32_t input0_dims = input0_tensor_shape->GetDims();
  int32_t input1_dims = input1_tensor_shape->GetDims();
  int64_t m = input0_shape[input0_dims - 1];
  int64_t size_mm = m * m;

  KERNEL_CHECK_FALSE((input0_shape[input0_dims - 1] == input0_shape[input0_dims - 2]), KERNEL_STATUS_PARAM_INVALID,
                     "Input[matrix] must be a square matrix")
  KERNEL_CHECK_FALSE((input1_dims >= 2), KERNEL_STATUS_PARAM_INVALID, "Input[rhs] must be a matrix")
  KERNEL_CHECK_FALSE(
    (input0_tensor_shape->GetDimSize(input0_dims - 1) == input1_tensor_shape->GetDimSize(input1_dims - 2)),
    KERNEL_STATUS_PARAM_INVALID, "Input matrix and rhs are incompatible")

  typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  auto adjoint = ctx.GetAttr("adjoint")->GetBool();
  auto input1_shape = input1_tensor_shape->GetDimSizes();
  int64_t k = input1_shape[input1_dims - 1];
  auto output_tensor = ctx.Output(0);
  auto output_data = reinterpret_cast<T *>(output_tensor->GetData());

  if (size_mm > 0) {
    size_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
    int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
    if (data_size >= kParallelDataNumSameShape) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (data_size <= kParallelDataNumSameShapeMid) {
        max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
      }
      // If the AI CPU has more cores than there are matrices, cap max_core_num at the matrix count
      if (max_core_num > matrix_num) {
        max_core_num = matrix_num;
      }
      auto sharder_matrix_solve = [&](size_t start, size_t end) {
        for (size_t i = start; i < end; i++) {
          Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
          Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
          Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
          if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
            return KERNEL_STATUS_PARAM_INVALID;
          }
          Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
          if (adjoint) {
            lu_decomposition.compute(input0.adjoint());
          } else {
            lu_decomposition.compute(input0);
          }
          using RealScalar = typename Eigen::NumTraits<T>::Real;
          RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
          KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");
          output.noalias() = lu_decomposition.solve(input1);
        }
        return KERNEL_STATUS_OK;
      };
      KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder_matrix_solve),
                          "Matrix Solve Compute failed");
    } else {
      for (size_t i = 0; i < matrix_num; i++) {
        Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
        Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
        Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
        if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
          return KERNEL_STATUS_PARAM_INVALID;
        }
        Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
        if (adjoint) {
          lu_decomposition.compute(input0.adjoint());
        } else {
          lu_decomposition.compute(input0);
        }
        using RealScalar = typename Eigen::NumTraits<T>::Real;
        RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
        KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");

        output.noalias() = lu_decomposition.solve(input1);
      }
    }
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMatrixSolve, MatrixSolveCpuKernel);
}  // namespace aicpu
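MatrixSolveCompute never forms an explicit inverse: each system A x = rhs goes through a partial-pivot LU factorization, and the smallest absolute diagonal entry of the packed LU matrix serves as a cheap singularity test, since a zero pivot means A (or its adjoint) is not invertible. A minimal Eigen sketch of that pattern (an illustrative helper, not the kernel API):

#include <Eigen/Core>
#include <Eigen/LU>
#include <stdexcept>

// Solve A x = rhs via PartialPivLU, rejecting singular A with a pivot check,
// mirroring the kernel's per-matrix logic.
Eigen::MatrixXd SolveSlice(const Eigen::MatrixXd &a, const Eigen::MatrixXd &rhs) {
  Eigen::PartialPivLU<Eigen::MatrixXd> lu(a);
  double min_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
  if (min_pivot <= 0.0) throw std::runtime_error("input matrix is not invertible");
  return lu.solve(rhs);
}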
@ -0,0 +1,37 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MatrixSolveCpuKernel : public CpuKernel {
 public:
  MatrixSolveCpuKernel() = default;
  ~MatrixSolveCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t MatrixSolveCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,315 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "max_pool_3d_grad_with_argmax.h"

#include <algorithm>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kMaxPool3DGradWithArgmax = "MaxPool3DGradWithArgmax";

#define MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, ARGTYPE, CTX) \
  case (DTYPE): {                                                             \
    uint32_t result = MaxPool3DGradWithArgmaxCompute<INTYPE, ARGTYPE>(CTX);   \
    if (result != KERNEL_STATUS_OK) {                                         \
      KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel compute failed.");     \
      return result;                                                          \
    }                                                                         \
    break;                                                                    \
  }
}  // namespace

namespace aicpu {
uint32_t MaxPool3DGradWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  std::vector<std::string> attr_names = {"ksize", "strides", "pads", "dilation"};
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
                      "MaxPool3DGradWithArgmax check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxPool3DGradWithArgmaxParamCheck(ctx), "MaxPool3DGradWithArgmax check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto argmax_type = ctx.Input(2)->GetDataType();
  if (argmax_type == DT_INT32) {
    switch (data_type) {
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
      default:
        KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel input data type [%s] not support.",
                         DTypeStr(data_type).c_str());
        return KERNEL_STATUS_PARAM_INVALID;
    }
  } else if (argmax_type == DT_INT64) {
    switch (data_type) {
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
      MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
      default:
        KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel input data type [%s] not support.",
                         DTypeStr(data_type).c_str());
        return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    KERNEL_LOG_ERROR(
      "MaxPool3DGradWithArgmax kernel input_argmax data type [%s] not "
      "support.",
      DTypeStr(argmax_type).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx) {
  auto input_x_info = ctx.Input(0);
  auto input_grads_info = ctx.Input(1);
  auto input_argmax_info = ctx.Input(2);
  auto output_y_info = ctx.Output(0);
  DataType input_x_type = input_x_info->GetDataType();
  DataType input_grads_type = input_grads_info->GetDataType();
  DataType out_type = output_y_info->GetDataType();
  KERNEL_CHECK_FALSE((input_x_type == input_grads_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input x [%s] need be same with "
                     "input grads [%s].",
                     DTypeStr(input_x_type).c_str(), DTypeStr(input_grads_type).c_str())
  KERNEL_CHECK_FALSE((input_x_type == out_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input x [%s] need be same with "
                     "output [%s].",
                     DTypeStr(input_x_type).c_str(), DTypeStr(out_type).c_str())
  DataType input_argmax_type = input_argmax_info->GetDataType();
  KERNEL_CHECK_FALSE((input_argmax_type == DT_INT32) || (input_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input argmax [%s] should be int32 or int64.",
                     DTypeStr(input_argmax_type).c_str())

  std::vector<int64_t> dim_vec = input_x_info->GetTensorShape()->GetDimSizes();
  int64_t dimsize = dim_vec.size();
  KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%d] should be 5.", dimsize)

  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE3 = 3;
  const size_t DIM_SIZE5 = 5;
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of ksize:[%d] should be 1 or 3.", ksizeList.size())
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of strides:[%d] should be 1 or 3.", stridesList.size())
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of pads:[%d] should be 1 or 3.", padsList.size())
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> dilationList = attr_dilation->GetListInt();
  KERNEL_CHECK_FALSE(
    dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
    KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%d] should be 1, 3 or 5.", dilationList.size())
  KERNEL_LOG_DEBUG(
    "MaxPool3DGradWithArgmaxCpuKernel[%s], input x: size[%llu];"
    "input grads: size[%llu], input argmax: size[%llu], output y: "
    "size[%llu].",
    ctx.GetOpType().c_str(), input_x_info->GetDataSize(), input_grads_info->GetDataSize(),
    input_argmax_info->GetDataSize(), output_y_info->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T, typename S>
void MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxSingleCompute(
  T *input_grad, S *input_argmax, T *output_y, int64_t iD, int64_t iH, int64_t iW, int64_t oD, int64_t oH, int64_t oW,
  int64_t kD, int64_t kH, int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
  int64_t dD, int64_t dH, int64_t dW) {
  T *in_grad = input_grad;
  T *out_y = output_y;
  S *argmax = input_argmax;

  /* calculate max points */
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t ti, i, j;
  for (ti = 0; ti < oD; ti++) {
    for (i = 0; i < oH; i++) {
      for (j = 0; j < oW; j++) {
        /* retrieve position of max */
        int64_t index = ti * oH * oW + i * oW + j;
        int64_t maxp = argmax[index];

        if (maxp != -1) {
          /* update gradient */
          out_y[maxp] += in_grad[index];
        }
      }
    }
  }
}

template <typename T, typename S>
uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx) {
  auto input_x_info = ctx.Input(0);
  auto input_grads_info = ctx.Input(1);
  auto input_argmax_info = ctx.Input(2);
  auto output_y_info = ctx.Output(0);
  auto input_grads = reinterpret_cast<T *>(input_grads_info->GetData());
  auto input_argmax = reinterpret_cast<S *>(input_argmax_info->GetData());
  auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();

  auto input_shape_vec = input_x_info->GetTensorShape()->GetDimSizes();
  auto output_shape_vec = input_grads_info->GetTensorShape()->GetDimSizes();
  const int64_t in_width = input_shape_vec[4];
  const int64_t in_height = input_shape_vec[3];
  const int64_t in_depth = input_shape_vec[2];
  const int64_t in_channel = input_shape_vec[1];
  const int64_t in_batch = input_shape_vec[0];
  const int64_t out_width = output_shape_vec[4];
  const int64_t out_height = output_shape_vec[3];
  const int64_t out_depth = output_shape_vec[2];
  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE5 = 5;
  std::vector<int64_t> ksizeTempList;
  if (ksizeList.size() == DIM_SIZE1) {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
  } else {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[1]);
    ksizeTempList.push_back(ksizeList[2]);
  }
  std::vector<int64_t> stridesTempList;
  if (stridesList.size() == DIM_SIZE1) {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
  } else {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[1]);
    stridesTempList.push_back(stridesList[2]);
  }
  std::vector<int64_t> padsTempList;
  if (padsList.size() == DIM_SIZE1) {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
  } else {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[1]);
    padsTempList.push_back(padsList[2]);
  }
  std::vector<int64_t> dilationTempList;
  if (dilationList.size() == DIM_SIZE1) {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
  } else if (dilationList.size() == DIM_SIZE5) {
    dilationTempList.push_back(dilationList[2]);
    dilationTempList.push_back(dilationList[3]);
    dilationTempList.push_back(dilationList[4]);
  } else {
    // 3-element dilation: take all three entries (indices 1..3 would read
    // past the end of the list).
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[1]);
    dilationTempList.push_back(dilationList[2]);
  }
  const int64_t k_width = ksizeTempList[2];
  const int64_t k_height = ksizeTempList[1];
  const int64_t k_depth = ksizeTempList[0];
  const int64_t s_width = stridesTempList[2];
  const int64_t s_height = stridesTempList[1];
  const int64_t s_depth = stridesTempList[0];
  const int64_t p_width = padsTempList[2];
  const int64_t p_height = padsTempList[1];
  const int64_t p_depth = padsTempList[0];
  const int64_t d_width = dilationTempList[2];
  const int64_t d_height = dilationTempList[1];
  const int64_t d_depth = dilationTempList[0];
  KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
                     KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");

  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t batch = in_batch * in_channel;
  const int64_t in_stride = in_width * in_height * in_depth;
  const int64_t out_stride = out_width * out_height * out_depth;
  const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
  const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
  const float ZERO = 0.f;
  int64_t output_num = ctx.Output(0)->NumElements();
  for (int64_t i = 0; i < output_num; i++) {
    output_y[i] = static_cast<T>(ZERO);
  }
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    auto sharder_max_pool3d_grad_with_argmax = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
                                             output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
                                             out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
                                             s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_grad_with_argmax),
      "MaxPool3DGradWithArgmax Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
                                           output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
                                           out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
                                           s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxPool3DGradWithArgmax, MaxPool3DGradWithArgmaxCpuKernel);
}  // namespace aicpu
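Reviewer note: the grad kernel above is, at its core, a scatter-add keyed by the saved argmax, run once per (N, C) slice. A minimal self-contained sketch of that inner loop, with hypothetical sizes and no framework types:

#include <cstdint>
#include <iostream>
#include <vector>

// dy holds one gradient per pooled output cell; argmax holds the flat index
// of the input element that won the max (or -1). dx accumulates: every
// output gradient is routed back to its max position, same as
// out_y[maxp] += in_grad[index] in the kernel.
int main() {
  std::vector<float> dy = {1.0f, 2.0f, 3.0f};
  std::vector<int64_t> argmax = {0, 4, 4};  // two windows share one max
  std::vector<float> dx(6, 0.0f);
  for (size_t i = 0; i < dy.size(); ++i) {
    if (argmax[i] != -1) {
      dx[argmax[i]] += dy[i];  // scatter-add
    }
  }
  for (float v : dx) std::cout << v << ' ';  // 1 0 0 0 5 0
  std::cout << '\n';
  return 0;
}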
@@ -0,0 +1,45 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MaxPool3DGradWithArgmaxCpuKernel : public CpuKernel {
 public:
  MaxPool3DGradWithArgmaxCpuKernel() = default;
  ~MaxPool3DGradWithArgmaxCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx);

  template <typename T, typename S>
  uint32_t MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx);

  template <typename T, typename S>
  void MaxPool3DGradWithArgmaxSingleCompute(T *input_x, S *input_argmax, T *output_y, int64_t iD, int64_t iH,
                                            int64_t iW, int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH,
                                            int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH,
                                            int64_t pW, int64_t dD, int64_t dH, int64_t dW);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
@@ -0,0 +1,342 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "max_pool_3d_with_argmax.h"

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 1;
const char *kMaxPool3DWithArgmax = "MaxPool3DWithArgmax";

#define MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, OUTTYPE, CTX) \
  case (DTYPE): {                                                        \
    uint32_t result = MaxPool3DWithArgmaxCompute<INTYPE, OUTTYPE>(CTX);  \
    if (result != KERNEL_STATUS_OK) {                                    \
      KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel compute failed.");    \
      return result;                                                     \
    }                                                                    \
    break;                                                               \
  }
}  // namespace

namespace aicpu {
uint32_t MaxPool3DWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  std::vector<std::string> attr_names = {"ksize", "strides", "pads"};
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
                      "MaxPool3DWithArgmax check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxPool3DWithArgmaxParamCheck(ctx), "MaxPool3DWithArgmax check params failed.");
  auto in_data_type = ctx.Input(0)->GetDataType();
  auto out_data_type = ctx.Output(1)->GetDataType();
  std::string argmax_type =
    (ctx.GetAttr("argmax_type") == nullptr) ? "bitmask" : ctx.GetAttr("argmax_type")->GetString();
  if (argmax_type == "bitmask") {
    KERNEL_LOG_ERROR("Bitmask is not supported now.");
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    if (out_data_type == DT_INT32) {
      switch (in_data_type) {
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
        default:
          KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
                           DTypeStr(in_data_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
    } else if (out_data_type == DT_INT64) {
      switch (in_data_type) {
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
        default:
          KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
                           DTypeStr(in_data_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
    } else {
      KERNEL_LOG_ERROR(
        "MaxPool3DWithArgmax kernel output_argmax data type [%s] not "
        "support.",
        DTypeStr(out_data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return KERNEL_STATUS_OK;
  }
}

uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx) {
  auto input_info = ctx.Input(0);
  auto output_y_info = ctx.Output(0);
  auto output_argmax_info = ctx.Output(1);
  DataType input_type = input_info->GetDataType();
  DataType output_y_type = output_y_info->GetDataType();
  KERNEL_CHECK_FALSE((input_type == output_y_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input x [%s] need be same with "
                     "output y [%s].",
                     DTypeStr(input_type).c_str(), DTypeStr(output_y_type).c_str())
  DataType output_argmax_type = output_argmax_info->GetDataType();
  KERNEL_CHECK_FALSE((output_argmax_type == DT_INT32) || (output_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output argmax [%s] should be int32 or int64.",
                     DTypeStr(output_argmax_type).c_str())

  std::vector<int64_t> dim_vec = input_info->GetTensorShape()->GetDimSizes();
  int64_t dimsize = dim_vec.size();
  KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%d] should be 5.", dimsize)

  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE3 = 3;
  const size_t DIM_SIZE5 = 5;
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of ksize:[%d] should be 1 or 3.", ksizeList.size())
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of strides:[%d] should be 1 or 3.", stridesList.size())
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of pads:[%d] should be 1 or 3.", padsList.size())
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();
  KERNEL_CHECK_FALSE(
    dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
    KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%d] should be 1, 3 or 5.", dilationList.size())
  KERNEL_LOG_DEBUG(
    "MaxPool3DWithArgmaxCpuKernel[%s], input x: size[%llu];"
    "output y: size[%llu], output argmax: size[%llu].",
    ctx.GetOpType().c_str(), input_info->GetDataSize(), output_y_info->GetDataSize(),
    output_argmax_info->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T, typename S>
void MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax, int64_t iD,
                                                                    int64_t iH, int64_t iW, int64_t oD, int64_t oH,
                                                                    int64_t oW, int64_t kD, int64_t kH, int64_t kW,
                                                                    int64_t sD, int64_t sH, int64_t sW, int64_t pD,
                                                                    int64_t pH, int64_t pW, int64_t dD, int64_t dH,
                                                                    int64_t dW) {
  int64_t i, j, ti;
  T *ip = input;
  for (ti = 0; ti < oD; ti++) {
    for (i = 0; i < oH; i++) {
      for (j = 0; j < oW; j++) {
        int64_t start_t = ti * sD - pD;
        int64_t start_h = i * sH - pH;
        int64_t start_w = j * sW - pW;

        int64_t end_t = std::min(start_t + (kD - 1) * dD + 1, iD);
        int64_t end_h = std::min(start_h + (kH - 1) * dH + 1, iH);
        int64_t end_w = std::min(start_w + (kW - 1) * dW + 1, iW);

        while (start_t < 0) {
          start_t += dD;
        }
        while (start_h < 0) {
          start_h += dH;
        }
        while (start_w < 0) {
          start_w += dW;
        }

        T *op = output_y + ti * oW * oH + i * oW + j;
        S *indzp = output_argmax + ti * oW * oH + i * oW + j;

        S maxindex = start_t * iH * iW + start_h * iW + start_w;
        T maxval = -std::numeric_limits<T>::infinity();

        for (int64_t z = start_t; z < end_t; z += dD) {
          for (int64_t y = start_h; y < end_h; y += dH) {
            for (int64_t x = start_w; x < end_w; x += dW) {
              S index = z * iH * iW + y * iW + x;
              T val = ip[index];
              if ((val > maxval) || std::isnan(static_cast<double>(val))) {
                maxval = static_cast<T>(val);
                maxindex = index;
              }
            }
          }
        }

        // store location of max
        *indzp = maxindex;

        /* set output to local max */
        *op = maxval;
      }
    }
  }
}

template <typename T, typename S>
uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx) {
  auto input_info = ctx.Input(0);
  auto output_y_info = ctx.Output(0);
  auto output_argmax_info = ctx.Output(1);
  auto input_x = reinterpret_cast<T *>(input_info->GetData());
  auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
  auto output_argmax = reinterpret_cast<S *>(output_argmax_info->GetData());
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();

  auto input_shape_vec = input_info->GetTensorShape()->GetDimSizes();
  auto output_shape_vec = output_y_info->GetTensorShape()->GetDimSizes();
  const int64_t in_width = input_shape_vec[4];
  const int64_t in_height = input_shape_vec[3];
  const int64_t in_depth = input_shape_vec[2];
  const int64_t in_channel = input_shape_vec[1];
  const int64_t in_batch = input_shape_vec[0];
  const int64_t out_width = output_shape_vec[4];
  const int64_t out_height = output_shape_vec[3];
  const int64_t out_depth = output_shape_vec[2];
  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE5 = 5;
  std::vector<int64_t> ksizeTempList;
  if (ksizeList.size() == DIM_SIZE1) {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
  } else {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[1]);
    ksizeTempList.push_back(ksizeList[2]);
  }
  std::vector<int64_t> stridesTempList;
  if (stridesList.size() == DIM_SIZE1) {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
  } else {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[1]);
    stridesTempList.push_back(stridesList[2]);
  }
  std::vector<int64_t> padsTempList;
  if (padsList.size() == DIM_SIZE1) {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
  } else {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[1]);
    padsTempList.push_back(padsList[2]);
  }
  std::vector<int64_t> dilationTempList;
  if (dilationList.size() == DIM_SIZE1) {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
  } else if (dilationList.size() == DIM_SIZE5) {
    dilationTempList.push_back(dilationList[2]);
    dilationTempList.push_back(dilationList[3]);
    dilationTempList.push_back(dilationList[4]);
  } else {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[1]);
    dilationTempList.push_back(dilationList[2]);
  }
  const int64_t k_width = ksizeTempList[2];
  const int64_t k_height = ksizeTempList[1];
  const int64_t k_depth = ksizeTempList[0];
  const int64_t s_width = stridesTempList[2];
  const int64_t s_height = stridesTempList[1];
  const int64_t s_depth = stridesTempList[0];
  const int64_t p_width = padsTempList[2];
  const int64_t p_height = padsTempList[1];
  const int64_t p_depth = padsTempList[0];
  const int64_t d_width = dilationTempList[2];
  const int64_t d_height = dilationTempList[1];
  const int64_t d_depth = dilationTempList[0];
  KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
                     KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");

  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t batch = in_batch * in_channel;
  const int64_t in_stride = in_width * in_height * in_depth;
  const int64_t out_stride = out_width * out_height * out_depth;
  const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
  const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    auto sharder_max_pool3d_with_argmax = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
                                         output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
                                         out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
                                         p_depth, p_height, p_width, d_depth, d_height, d_width);
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_with_argmax),
                        "MaxPool3DWithArgmax Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
                                       output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
                                       out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
                                       p_depth, p_height, p_width, d_depth, d_height, d_width);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxPool3DWithArgmax, MaxPool3DWithArgmaxCpuKernel);
}  // namespace aicpu
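Reviewer note: the flat argmax the forward kernel stores is z * iH * iW + y * iW + x within one (N, C) slice, with negative window starts advanced past the padding in dilation-sized steps. A minimal sketch of one window along the depth axis only (all sizes illustrative):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // One 1-D slice of a 3-D pool: input depth iD = 6, kernel kD = 3,
  // stride sD = 2, pad pD = 1, dilation dD = 1.
  const int64_t iD = 6, kD = 3, sD = 2, pD = 1, dD = 1;
  const float in[iD] = {0.1f, 0.9f, 0.3f, 0.7f, 0.2f, 0.8f};
  const int64_t ti = 0;                                     // first output cell
  int64_t start_t = ti * sD - pD;                           // may be negative
  int64_t end_t = std::min(start_t + (kD - 1) * dD + 1, iD);
  while (start_t < 0) start_t += dD;                        // skip the padding
  int64_t maxindex = start_t;
  float maxval = in[start_t];
  for (int64_t z = start_t; z < end_t; z += dD) {
    if (in[z] > maxval) { maxval = in[z]; maxindex = z; }
  }
  std::cout << maxval << " at " << maxindex << '\n';  // 0.9 at 1
  return 0;
}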
@@ -0,0 +1,46 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MaxPool3DWithArgmaxCpuKernel : public CpuKernel {
 public:
  MaxPool3DWithArgmaxCpuKernel() = default;
  ~MaxPool3DWithArgmaxCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx);

  template <typename T, typename S>
  uint32_t MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx);

  template <typename T, typename S>
  void MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax, int64_t iD, int64_t iH, int64_t iW,
                                        int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH, int64_t kW,
                                        int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
                                        int64_t dD, int64_t dH, int64_t dW);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
@@ -0,0 +1,235 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_2d.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool2D = "MaxUnpool2D";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                                                        \
  if (end_num <= kParallelDataNums) {                                                                               \
    for (size_t i = 0; i < size_t(end_num); i++) {                                                                  \
      SHARD(i, i + 1);                                                                                              \
    }                                                                                                               \
  } else {                                                                                                          \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), "MaxUnpool2D #SHARD Compute failed."); \
  }
}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Compute by indices_type
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool2DCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool2DCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool2DCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2D check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool2DCheck(ctx), "MaxUnpool2D check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(1)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool2D_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool2D_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool2D_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool2D_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool2D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool2D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool2D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool2D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool2D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool2D_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool2D_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool2D kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] need be same with "
                     "input0 [%d].",
                     outputType, input0Type)

  KERNEL_LOG_INFO(
    "MaxUnpool2DCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  Tensor *indices = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NHWC") {
    NIndex = 0;
    CIndex = 3;
    HIndex = 1;
    WIndex = 2;
    auto inputShape = input->GetTensorShape();
    int64_t numBatch = inputShape->GetDimSize(NIndex);
    int64_t inputHeight = inputShape->GetDimSize(HIndex);
    int64_t inputWidth = inputShape->GetDimSize(WIndex);
    int64_t numChannels = inputShape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);

    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
      rawOutput[s] = static_cast<DATA_T>(0);
    }

    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *input_p_k = rawInput + nInputOffset;
        INDICES_T *ind_p_k = rawIndices + nInputOffset;

        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t i = 0; i < inputHeight; i++) {
            for (int64_t j = 0; j < inputWidth; j++) {
              maxp = ind_p_k[i * inputWidth * numChannels + j * numChannels + k];
              if (maxp < 0 || maxp >= owidth * oheight) {
                error = true;
                KERNEL_LOG_ERROR(
                  "MaxUnpool2D: output_size H_out * W_out "
                  "should be bigger than argmax, now H_out is [%ld], "
                  "and W_out is [%ld], but one of the values in argmax is "
                  "[%ld].",
                  oheight, owidth, maxp);
              } else {
                output_p_k[maxp * numChannels + k] = input_p_k[i * inputWidth * numChannels + j * numChannels + k];
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    HIndex = 2;
    WIndex = 3;
    auto inputShape = input->GetTensorShape();
    int64_t numBatch = inputShape->GetDimSize(NIndex);
    int64_t inputHeight = inputShape->GetDimSize(HIndex);
    int64_t inputWidth = inputShape->GetDimSize(WIndex);
    int64_t numChannels = inputShape->GetDimSize(CIndex);

    auto output_shape = output->GetTensorShape();
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);
    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());

    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
      rawOutput[s] = static_cast<DATA_T>(0);
    }
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
        int64_t k = 0;
        for (k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
          int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *input_p_k = rawInput + finalInputOffset;
          INDICES_T *ind_p_k = rawIndices + finalInputOffset;

          int64_t maxp;
          for (int64_t i = 0; i < inputHeight; i++) {
            for (int64_t j = 0; j < inputWidth; j++) {
              maxp = ind_p_k[i * inputWidth + j];
              if (maxp < 0 || maxp >= owidth * oheight) {
                error = true;
                KERNEL_LOG_ERROR(
                  "MaxUnpool2D: output_size H_out * W_out "
                  "should be bigger than argmax, now H_out is [%ld], "
                  "and W_out is [%ld], but one of the values in argmax is "
                  "[%ld].",
                  oheight, owidth, maxp);
              } else {
                output_p_k[maxp] = input_p_k[i * inputWidth + j];
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }

  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxUnpool2D, MaxUnpool2DCpuKernel);
}  // namespace aicpu
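Reviewer note: MaxUnpool2D is the inverse scatter of the pooling argmax: zero the output plane, then place each input value at its recorded flat position. A minimal single-slice NCHW sketch with illustrative sizes:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // One channel slice: a 2x2 pooled input goes back into a 4x4 output plane.
  std::vector<float> input = {5.0f, 6.0f, 7.0f, 8.0f};
  std::vector<int64_t> indices = {0, 3, 9, 15};  // flat h * W + w positions
  const int64_t oheight = 4, owidth = 4;
  std::vector<float> output(oheight * owidth, 0.0f);
  for (size_t i = 0; i < input.size(); ++i) {
    int64_t maxp = indices[i];
    if (maxp < 0 || maxp >= oheight * owidth) return 1;  // same bound check
    output[maxp] = input[i];                             // scatter
  }
  for (int64_t h = 0; h < oheight; ++h) {
    for (int64_t w = 0; w < owidth; ++w) std::cout << output[h * owidth + w] << ' ';
    std::cout << '\n';
  }
  return 0;
}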
@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool2DCpuKernel : public CpuKernel {
 public:
  MaxUnpool2DCpuKernel() = default;
  ~MaxUnpool2DCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool2DCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool2DCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_
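Reviewer note: MaxUnpool2D above and MaxUnpool2DGrad below gate parallelism through identical SWITCH_PARALLEL macros: batches at or below kParallelDataNums (1024) run serially, larger ones go through ParallelFor with a shard size of 1. A framework-free sketch of that dispatch shape; the thread pool and splitting here are illustrative stand-ins for ParallelFor, only the cutoff mirrors the kernels:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Run shard(i, i + 1) serially for small workloads; split [0, end_num)
// across hardware threads otherwise.
void RunSharded(int64_t end_num, const std::function<void(int64_t, int64_t)> &shard) {
  constexpr int64_t kParallelDataNums = 1024;  // same cutoff as the kernels
  if (end_num <= kParallelDataNums) {
    for (int64_t i = 0; i < end_num; ++i) shard(i, i + 1);
    return;
  }
  const int64_t workers = std::max<int64_t>(1, std::thread::hardware_concurrency());
  const int64_t step = (end_num + workers - 1) / workers;
  std::vector<std::thread> pool;
  for (int64_t s = 0; s < end_num; s += step) {
    pool.emplace_back(shard, s, std::min(s + step, end_num));
  }
  for (auto &t : pool) t.join();
}

The serial path keeps tiny batches off the thread pool, where dispatch overhead would dominate the per-batch copy work.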
@ -0,0 +1,247 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "max_unpool_2d_grad.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
constexpr int64_t kParallelDataNums = 1024;
|
||||
const char *kMaxUnpool2DGrad = "MaxUnpool2DGrad";
|
||||
|
||||
#define SWITCH_PARALLEL(SHARD, end_num, ctx) \
|
||||
if (end_num <= kParallelDataNums) { \
|
||||
for (size_t i = 0; i < size_t(end_num); i++) { \
|
||||
SHARD(i, i + 1); \
|
||||
} \
|
||||
} else { \
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
|
||||
"MaxUnpool2DGrad #SHARD Compute failed."); \
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename DATA_T>
|
||||
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
|
||||
// Compute by indices_type
|
||||
switch (indices_type) {
|
||||
case DT_INT32:
|
||||
return MaxUnpool2DGradCompute<DATA_T, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return MaxUnpool2DGradCompute<DATA_T, int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t MaxUnpool2DGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2DGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MaxUnpool2DGradCheck(ctx), "MaxUnpool2DGrad check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
auto indices_type = ctx.Input(2)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_INT8:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
|
||||
case DT_INT16:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
|
||||
case DT_INT32:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
|
||||
case DT_INT64:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
|
||||
case DT_UINT8:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
|
||||
case DT_UINT16:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
|
||||
case DT_UINT32:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
|
||||
case DT_UINT64:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
|
||||
case DT_FLOAT16:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
|
||||
case DT_FLOAT:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<float>(ctx, indices_type);
|
||||
case DT_DOUBLE:
|
||||
return MaxUnpool2DGrad_COMPUTE_CASE<double>(ctx, indices_type);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MaxUnpool2DGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCheck(CpuKernelContext &ctx) {
|
||||
DataType input0Type = ctx.Input(0)->GetDataType();
|
||||
DataType input1Type = ctx.Input(1)->GetDataType();
|
||||
DataType outputType = ctx.Output(0)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input1Type [%d] need be same with "
|
||||
"input0 [%d].",
|
||||
input1Type, input0Type)
|
||||
|
||||
KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of output [%d] need be same with "
|
||||
"input0 [%d].",
|
||||
outputType, input0Type)
|
||||
|
||||
auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data size of x [%d] need be same with "
|
||||
"input argmax [%d].",
|
||||
Input0_size, Input2_size)
|
||||
|
||||
KERNEL_LOG_INFO(
|
||||
"MaxUnpool2DGradCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
|
||||
ctx.Output(0)->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename DATA_T, typename INDICES_T>
|
||||
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *grads = ctx.Input(1);
|
||||
Tensor *indices = ctx.Input(2);
|
||||
Tensor *output = ctx.Output(0);
|
||||
std::string dataFormat = "NCHW";
|
||||
if (ctx.GetAttr("data_format") != nullptr) {
|
||||
dataFormat = ctx.GetAttr("data_format")->GetString();
|
||||
}
|
||||
int32_t NIndex, CIndex, HIndex, WIndex;
|
||||
bool error = false;
|
||||
if (dataFormat == "NHWC") {
|
||||
NIndex = 0;
|
||||
CIndex = 3;
|
||||
HIndex = 1;
|
||||
WIndex = 2;
|
||||
auto grads_out_shape = grads->GetTensorShape();
|
||||
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
|
||||
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
|
||||
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
|
||||
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
|
||||
auto output_shape = output->GetTensorShape();
|
||||
int64_t iheight = output_shape->GetDimSize(HIndex);
|
||||
int64_t iwidth = output_shape->GetDimSize(WIndex);
|
||||
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
|
||||
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
|
||||
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
|
||||
for (int s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
|
||||
rawOutput[s] = (DATA_T)0;
|
||||
}
|
||||
|
||||
auto shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t n = start; n < end; n++) {
|
||||
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
|
||||
int64_t nGradsOffset = n * numChannels * owidth * oheight;
|
||||
DATA_T *output_p_k = rawOutput + nOutputOffset;
|
||||
DATA_T *grads_p_k = rawGrads + nGradsOffset;
|
||||
INDICES_T *ind_p_k = rawIndices + nOutputOffset;
|
||||
int64_t maxp;
|
||||
for (int64_t k = 0; k < numChannels; k++) {
|
||||
for (int64_t i = 0; i < iheight; i++) {
|
||||
for (int64_t j = 0; j < iwidth; j++) {
|
||||
maxp = ind_p_k[i * iwidth * numChannels + j * numChannels + k];
|
||||
if (maxp < 0 || maxp >= owidth * oheight) {
|
||||
error = true;
|
||||
KERNEL_LOG_ERROR(
|
||||
"MaxUnpool2DGrad: output_size H_out * W_out "
|
||||
"should be bigger than argmax, now H_out is [%ld], "
|
||||
"and W_out is [%ld], but one of the values in argmax is "
|
||||
"[%ld].",
|
||||
oheight, owidth, maxp);
|
||||
} else {
|
||||
output_p_k[i * iwidth * numChannels + j * numChannels + k] = grads_p_k[maxp * numChannels + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard, numBatch, ctx);
|
||||
} else {
|
||||
NIndex = 0;
|
||||
CIndex = 1;
|
||||
HIndex = 2;
|
||||
WIndex = 3;
|
||||
auto grads_out_shape = grads->GetTensorShape();
|
||||
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
|
||||
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
|
||||
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
|
||||
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
|
||||
auto output_shape = output->GetTensorShape();
|
||||
int64_t iheight = output_shape->GetDimSize(HIndex);
|
||||
int64_t iwidth = output_shape->GetDimSize(WIndex);
|
||||
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
|
||||
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
|
||||
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
|
||||
for (int s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
|
||||
rawOutput[s] = (DATA_T)0;
|
||||
}
|
||||
|
||||
auto shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t n = start; n < end; n++) {
|
||||
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
|
||||
int64_t nGradsOffset = n * numChannels * owidth * oheight;
|
||||
int64_t k = 0;
|
||||
for (k = 0; k < numChannels; k++) {
|
||||
int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight;
|
||||
int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight;
|
||||
DATA_T *output_p_k = rawOutput + finalOutputOffset;
|
||||
DATA_T *grads_p_k = rawGrads + finalGradsOffset;
|
||||
INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
|
||||
int64_t maxp;
|
||||
for (int64_t i = 0; i < iheight; i++) {
|
||||
for (int64_t j = 0; j < iwidth; j++) {
|
||||
maxp = ind_p_k[i * iwidth + j];
|
||||
if (maxp < 0 || maxp >= owidth * oheight) {
|
||||
error = true;
|
||||
KERNEL_LOG_ERROR(
|
||||
"MaxUnpool2DGrad: output_size H_out * W_out "
|
||||
"should be bigger than argmax, now H_out is [%ld], "
|
||||
"and W_out is [%ld], but one of the values in argmax is "
|
||||
"[%ld].",
|
||||
oheight, owidth, maxp);
|
||||
} else {
|
||||
output_p_k[i * iwidth + j] = grads_p_k[maxp];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }

  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    return KERNEL_STATUS_OK;
  }
}

REGISTER_CPU_KERNEL(kMaxUnpool2DGrad, MaxUnpool2DGradCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool2DGradCpuKernel : public CpuKernel {
 public:
  MaxUnpool2DGradCpuKernel() = default;
  ~MaxUnpool2DGradCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool2DGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool2DGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
@@ -0,0 +1,247 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_3d.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3D = "MaxUnpool3D";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                                                        \
  if (end_num <= kParallelDataNums) {                                                                               \
    for (size_t i = 0; i < size_t(end_num); i++) {                                                                  \
      SHARD(i, i + 1);                                                                                              \
    }                                                                                                               \
  } else {                                                                                                          \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), "MaxUnpool3D #SHARD Compute failed."); \
  }
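// SWITCH_PARALLEL falls back to a plain serial loop when the shard count is
// at most kParallelDataNums (1024), since the scheduling overhead of
// ParallelFor would dominate for small workloads; larger counts are split
// across threads with a per-shard granularity of 1.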

}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Compute by indices_type
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool3DCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool3DCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}
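// Dispatch is two-level: Compute() below switches on the data dtype and this
// helper switches on the indices dtype, so each supported (DATA_T, INDICES_T)
// pair instantiates one MaxUnpool3DCompute specialization.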

uint32_t MaxUnpool3DCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3D check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool3DCheck(ctx), "MaxUnpool3D check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(1)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool3D_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool3D_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool3D_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool3D_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool3D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool3D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool3D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool3D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool3D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool3D_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool3D_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool3D kernel data type [%s] is not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] must be the same as that of "
                     "input0 [%d].",
                     outputType, input0Type)

  KERNEL_LOG_INFO(
    "MaxUnpool3DCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  Tensor *indices = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCDHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NDHWC") {
    NIndex = 0;
    CIndex = 4;
    DIndex = 1;
    HIndex = 2;
    WIndex = 3;
    auto input_shape = input->GetTensorShape();
    int64_t numBatch = input_shape->GetDimSize(NIndex);
    int64_t inputDepth = input_shape->GetDimSize(DIndex);
    int64_t inputHeight = input_shape->GetDimSize(HIndex);
    int64_t inputWidth = input_shape->GetDimSize(WIndex);
    int64_t numChannels = input_shape->GetDimSize(CIndex);

    auto output_shape = output->GetTensorShape();
    int64_t odepth = output_shape->GetDimSize(DIndex);
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);

    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
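
    // Unpooling scatters: each input element is written to the flat output
    // position recorded for it in argmax, so the output buffer is
    // zero-initialized first and positions that are never referenced stay
    // zero.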
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *input_p_k = rawInput + nInputOffset;
        INDICES_T *ind_p_k = rawIndices + nInputOffset;

        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t t = 0; t < inputDepth; t++) {
            for (int64_t i = 0; i < inputHeight; i++) {
              for (int64_t j = 0; j < inputWidth; j++) {
                maxp = ind_p_k[t * inputHeight * inputWidth * numChannels + i * inputWidth * numChannels +
                               j * numChannels + k];
                if (maxp < 0 || maxp >= odepth * owidth * oheight) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3D: every value in argmax must lie in "
                    "[0, D_out * H_out * W_out); D_out is [%ld], H_out is "
                    "[%ld], and W_out is [%ld], but argmax contains [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[maxp * numChannels + k] = input_p_k[t * inputHeight * inputWidth * numChannels +
                                                                 i * inputWidth * numChannels + j * numChannels + k];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    DIndex = 2;
    HIndex = 3;
    WIndex = 4;

    auto input_shape = input->GetTensorShape();
    int64_t numBatch = input_shape->GetDimSize(NIndex);
    int64_t inputDepth = input_shape->GetDimSize(DIndex);
    int64_t inputHeight = input_shape->GetDimSize(HIndex);
    int64_t inputWidth = input_shape->GetDimSize(WIndex);
    int64_t numChannels = input_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t odepth = output_shape->GetDimSize(DIndex);
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);
    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }

    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
        int64_t k = 0;
        for (k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * odepth * owidth * oheight;
          int64_t finalInputOffset = nInputOffset + k * inputDepth * inputWidth * inputHeight;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *input_p_k = rawInput + finalInputOffset;
          INDICES_T *ind_p_k = rawIndices + finalInputOffset;
          int64_t maxp;
          for (int64_t t = 0; t < inputDepth; t++) {
            for (int64_t i = 0; i < inputHeight; i++) {
              for (int64_t j = 0; j < inputWidth; j++) {
                maxp = ind_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
                if (maxp < 0 || maxp >= odepth * owidth * oheight) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3D: every value in argmax must lie in "
                    "[0, D_out * H_out * W_out); D_out is [%ld], H_out is "
                    "[%ld], and W_out is [%ld], but argmax contains [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[maxp] = input_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }
  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    return KERNEL_STATUS_OK;
  }
}

REGISTER_CPU_KERNEL(kMaxUnpool3D, MaxUnpool3DCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DCpuKernel : public CpuKernel {
 public:
  MaxUnpool3DCpuKernel() = default;
  ~MaxUnpool3DCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool3DCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool3DCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
@@ -0,0 +1,258 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_3d_grad.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3DGrad = "MaxUnpool3DGrad";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                              \
  if (end_num <= kParallelDataNums) {                                     \
    for (size_t i = 0; i < size_t(end_num); i++) {                        \
      SHARD(i, i + 1);                                                    \
    }                                                                     \
  } else {                                                                \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
                        "MaxUnpool3DGrad #SHARD Compute failed.");        \
  }

}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Compute by indices_type
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool3DGradCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool3DGradCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool3DGradCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3DGrad check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool3DGradCheck(ctx), "MaxUnpool3DGrad check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(2)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool3DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool3DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool3DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool3DGrad_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool3DGrad_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool3DGrad kernel data type [%s] is not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType input1Type = ctx.Input(1)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input1 [%d] must be the same as that of "
                     "input0 [%d].",
                     input1Type, input0Type)

  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] must be the same as that of "
                     "input0 [%d].",
                     outputType, input0Type)
  auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();

  KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
                     "The shape of x must be the same as the shape of the input argmax.")

  KERNEL_LOG_INFO(
    "MaxUnpool3DGradCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], input2: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
    ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute(CpuKernelContext &ctx) {
  Tensor *grads = ctx.Input(1);
  Tensor *indices = ctx.Input(2);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCDHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NDHWC") {
    NIndex = 0;
    CIndex = 4;
    DIndex = 1;
    HIndex = 2;
    WIndex = 3;

    auto grads_out_shape = grads->GetTensorShape();
    int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
    int64_t odepth = grads_out_shape->GetDimSize(DIndex);
    int64_t oheight = grads_out_shape->GetDimSize(HIndex);
    int64_t owidth = grads_out_shape->GetDimSize(WIndex);
    int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t idepth = output_shape->GetDimSize(DIndex);
    int64_t iheight = output_shape->GetDimSize(HIndex);
    int64_t iwidth = output_shape->GetDimSize(WIndex);
    auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());

    for (int s = 0; s < numBatch * iheight * iwidth * idepth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
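    // MaxUnpool3DGrad is the mirror of the forward scatter: for each output
    // (dx) position, the gradient is gathered from the grads element at the
    // flat index recorded in argmax, so positions that did not win the max
    // keep the zero written above.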
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
        int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *grads_p_k = rawGrads + nGradsOffset;
        INDICES_T *ind_p_k = rawIndices + nOutputOffset;
        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t t = 0; t < idepth; t++) {
            for (int64_t i = 0; i < iheight; i++) {
              for (int64_t j = 0; j < iwidth; j++) {
                maxp = ind_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k];
                if (maxp < 0 || maxp >= owidth * oheight * odepth) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3DGrad: every value in argmax must lie in "
                    "[0, D_out * H_out * W_out); D_out is [%ld], H_out is "
                    "[%ld], and W_out is [%ld], but argmax contains [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k] =
                    grads_p_k[maxp * numChannels + k];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    DIndex = 2;
    HIndex = 3;
    WIndex = 4;

    auto grads_out_shape = grads->GetTensorShape();
    int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
    int64_t odepth = grads_out_shape->GetDimSize(DIndex);
    int64_t oheight = grads_out_shape->GetDimSize(HIndex);
    int64_t owidth = grads_out_shape->GetDimSize(WIndex);
    int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t idepth = output_shape->GetDimSize(DIndex);
    int64_t iheight = output_shape->GetDimSize(HIndex);
    int64_t iwidth = output_shape->GetDimSize(WIndex);
    auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * idepth * iheight * iwidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
        int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
        int64_t k = 0;
        for (k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight * idepth;
          int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight * odepth;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *grads_p_k = rawGrads + finalGradsOffset;
          INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
          int64_t maxp;
          for (int64_t t = 0; t < idepth; t++) {
            for (int64_t i = 0; i < iheight; i++) {
              for (int64_t j = 0; j < iwidth; j++) {
                maxp = ind_p_k[t * iheight * iwidth + i * iwidth + j];
                if (maxp < 0 || maxp >= owidth * oheight * odepth) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3DGrad: every value in argmax must lie in "
                    "[0, D_out * H_out * W_out); D_out is [%ld], H_out is "
                    "[%ld], and W_out is [%ld], but argmax contains [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[t * iheight * iwidth + i * iwidth + j] = grads_p_k[maxp];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }
  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    return KERNEL_STATUS_OK;
  }
}

REGISTER_CPU_KERNEL(kMaxUnpool3DGrad, MaxUnpool3DGradCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DGradCpuKernel : public CpuKernel {
 public:
  MaxUnpool3DGradCpuKernel() = default;
  ~MaxUnpool3DGradCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool3DGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool3DGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
@@ -0,0 +1,429 @@
/**
 * Copyright 2021 Harbin Institute of Technology
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "maxpool_grad.h"

#include <Eigen/Dense>
#include <string>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kMaxPoolGrad = "MaxPoolGrad";
constexpr uint32_t kInvalidMaxPoolingIndex = -1;
constexpr uint32_t kMaxPoolGradInputNum = 3;
constexpr uint32_t kMaxPoolGradOutputNum = 1;
constexpr int64_t kParallelNum_7K = 7 * 1024;
constexpr int64_t kParallelNum_16K = 16 * 1024;
constexpr int64_t kParallelNum_128K = 128 * 1024;
constexpr uint32_t kThirdInputIndex = 2;
struct PoolParams {
  int depth;

  int tensor_cols;
  int tensor_rows;
  int tensor_batch;

  int ksize_rows;
  int ksize_cols;
  int ksize_depth;

  int strides_rows;
  int strides_cols;
  int strides_depth;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;
};
}  // namespace
namespace aicpu {
template <typename T, typename Targmax>
uint32_t SpatialMaxPoolWithArgMaxHelper(CpuKernelContext &ctx, const PoolParams &params) {
  bool include_batch_in_index = true;

  Tensor *tensor_in = ctx.Input(kFirstInputIndex);
  EigenTensor input_eigen_tensor(tensor_in, tensor_in->GetData());
  Tensor *tensor_out = ctx.Input(kSecondInputIndex);
  EigenTensor output_eigen_tensor(tensor_out, tensor_out->GetData());
  Tensor *tensor_out_backprop = ctx.Input(2);
  EigenTensor out_backprop(tensor_out_backprop, tensor_out_backprop->GetData());
  Tensor *tensor_output_dup = ctx.Output(kFirstOutputIndex);
  EigenTensor input_backprop(tensor_output_dup, tensor_output_dup->GetData());

  // create a new aicpu::Tensor
  auto tensor_out_arg_max_tmp = CpuKernelUtils::CreateTensor();
  Targmax *arg_max = new Targmax[tensor_output_dup->NumElements()];

  TensorShape out_dup_ts = *(tensor_output_dup->GetTensorShape());
  tensor_out_arg_max_tmp->SetDataType(DT_INT64);
  tensor_out_arg_max_tmp->SetData(static_cast<void *>(arg_max));
  tensor_out_arg_max_tmp->SetDataSize(tensor_output_dup->GetDataSize());

  auto out_arg_max_ts = tensor_out_arg_max_tmp->GetTensorShape();
  out_arg_max_ts->SetFormat(out_dup_ts.GetFormat());
  out_arg_max_ts->SetUnknownRank(out_dup_ts.GetUnknownRank());
  out_arg_max_ts->SetDimSizes(out_dup_ts.GetDimSizes());

  auto tensor_out_arg_max = tensor_out_arg_max_tmp.get();
  EigenTensor output_arg_max(tensor_out_arg_max, tensor_out_arg_max->GetData());

  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;

  ConstEigenMatrixMap in_mat(input_eigen_tensor.flat<T>().data(), params.depth,
                             params.tensor_cols * params.tensor_rows * params.tensor_batch);
  EigenMatrixMap out_mat(output_eigen_tensor.flat<T>().data(), params.depth,
                         params.out_width * params.out_height * params.tensor_batch);
  EigenIndexMatrixMap out_arg_max_mat(output_arg_max.flat<Targmax>().data(), params.depth,
                                      params.out_width * params.out_height * params.tensor_batch);

  input_backprop.flat<T>().setZero();
  auto orig_input_ptr = static_cast<T *>(tensor_in->GetData());
  auto orig_output_ptr = static_cast<T *>(tensor_out->GetData());
  auto grad_ptr = static_cast<T *>(tensor_out_backprop->GetData());
  auto output_ptr = static_cast<T *>(tensor_output_dup->GetData());
  // shard_NCHW's limit is params.tensor_batch * params.depth
  auto shard_NCHW = [&params, &orig_input_ptr, &orig_output_ptr, &grad_ptr, &output_ptr](int64_t start, int64_t limit) {
    typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
    typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
    const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
    const int64_t Y_W = params.out_width, Y_H = params.out_height;
    const int64_t batch_size = limit;
    const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
    const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
    const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
                  stride_w = static_cast<int64_t>(params.strides_cols);
    const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
    const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
                  kernel_w = static_cast<int64_t>(params.ksize_cols);
    const T *dy_ptr = grad_ptr + start * Y_stride;
    const T *x_ptr = orig_input_ptr + start * X_stride;
    const T *y_ptr = orig_output_ptr + start * Y_stride;
    T *dx_ptr = output_ptr + start * X_stride;
    for (int64_t i = start; i < batch_size; i++) {
      ConstEigenArrayMap dy_arr(dy_ptr, Y_W, Y_H);
      ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
      ConstEigenArrayMap y_arr(y_ptr, Y_W, Y_H);
      EigenArrayMap dx_arr(dx_ptr, X_W, X_H);
      for (int64_t h = 0; h < Y_H; ++h) {
        const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
        const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
        for (int64_t w = 0; w < Y_W; ++w) {
          const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
          const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
          const int64_t y = h * Y_W + w;
          auto some_max_block = (x_arr.block(l, t, r - l, b - t) == y_arr(y)).template cast<T>();
          int64_t first_max_x_rel = 0, first_max_y_rel = 0;
          bool max_found = false;
          for (int64_t by = 0; by < b - t; ++by) {
            for (int64_t bx = 0; bx < r - l; ++bx) {
              if (some_max_block(bx, by) == static_cast<T>(1)) {
                first_max_x_rel = bx, first_max_y_rel = by, max_found = true;
                break;
              }
            }
            if (max_found) {
              break;
            }
          }
          const int64_t fact_index_h = t + first_max_y_rel, fact_index_w = l + first_max_x_rel;
          *(dx_ptr + fact_index_h * X_W + fact_index_w) += static_cast<T>(1) * dy_arr(y);
        }
      }
      dy_ptr += Y_stride;
      x_ptr += X_stride;
      y_ptr += Y_stride;
      dx_ptr += X_stride;
    }
  };
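  // The NHWC shard below first re-runs the forward max pool on its batch
  // slice to rebuild the argmax map (out_arg_max_mat), then scatters
  // out_backprop through that map into input_backprop.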
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop,
                &tensor_out_backprop, include_batch_in_index](int64_t start, int64_t limit) {
    const int32_t depth = params.depth;
    const int32_t in_rows = params.tensor_rows;
    const int32_t in_cols = params.tensor_cols;
    const int32_t pad_top = params.pad_top;
    const int32_t pad_left = params.pad_left;
    const int32_t window_rows = params.ksize_rows;
    const int32_t window_cols = params.ksize_cols;
    const int32_t row_stride = params.strides_rows;
    const int32_t col_stride = params.strides_cols;
    const int32_t out_height = params.out_height;
    const int32_t out_width = params.out_width;
    {
      const int32_t output_image_size = out_height * out_width * depth;
      EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1, (limit - start) * output_image_size);
      out_shard.setConstant(Eigen::NumTraits<T>::lowest());
      EigenIndexMatrixMap out_arg_max_shard(out_arg_max_mat.data() + start * output_image_size, 1,
                                            (limit - start) * output_image_size);
      out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    }

    for (int64_t b = start; b < limit; ++b) {
      for (int h = 0; h < in_rows; ++h) {
        for (int w = 0; w < in_cols; ++w) {
          const int hpad = h + pad_top;
          const int wpad = w + pad_left;
          const int h_start = (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
          const int h_end = std::min(hpad / row_stride + 1, out_height);
          const int w_start = (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
          const int w_end = std::min(wpad / col_stride + 1, out_width);
          const int64_t in_index = (b * in_rows + h) * in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            const int64_t out_index_base = (b * out_height + ph) * out_width;
            for (int pw = w_start; pw < w_end; ++pw) {
              const int64_t out_index = out_index_base + pw;
              for (int d = 0; d < depth; ++d) {
                const T &input_ref = in_mat.coeffRef(d, in_index);
                T &output_ref = out_mat.coeffRef(d, out_index);
                Targmax &out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
                  if (include_batch_in_index) {
                    out_arg_max_ref = in_index * depth + d;
                  } else {
                    out_arg_max_ref = (h * in_cols + w) * depth + d;
                  }
                }
              }
            }
          }
        }
      }
    }
    if (include_batch_in_index) {
      auto input_backprop_flat = input_backprop.flat<T>();
      auto out_arg_max_flat = output_arg_max.flat<int64_t>();
      auto out_backprop_flat = out_backprop.flat<T>();
      const int64_t in_size = in_rows * in_cols * depth;
      const int64_t in_start = start * in_size;
      const int64_t in_end = limit * in_size;
      EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1, in_end - in_start);
      in_shard.setConstant(T(0));

      // Backpropagate.
      const int out_size = out_height * out_width * depth;
      const int out_start = start * out_size;
      const int out_end = limit * out_size;
      for (int index = out_start; index < out_end; ++index) {
        int input_backprop_index = out_arg_max_flat(index);
        // BoundsCheck
        if (input_backprop_index - in_start >= 0 && input_backprop_index - in_end < 0) {
          if (index < (tensor_out_backprop->NumElements())) {
            input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
          }
        } else {
          KERNEL_LOG_ERROR("[MaxPoolGrad] Backpropagate bounds check failed.");
          return KERNEL_STATUS_PARAM_INVALID;
        }
      }
    }
    return KERNEL_STATUS_OK;
  };

  const int64_t total_elements = params.tensor_batch * params.tensor_rows * params.tensor_cols * params.depth;
  if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
    const int64_t total_images = params.tensor_batch * params.depth;
    if (total_elements <= kParallelNum_16K) {
      shard_NCHW(0, total_images);
      return KERNEL_STATUS_OK;
    } else {
      return CpuKernelUtils::ParallelFor(ctx, total_images, 1, shard_NCHW);
    }
  }
  uint32_t tensor_batch = params.tensor_batch;
  if (total_elements <= kParallelNum_7K) {
    shard(0, params.tensor_batch);
    return KERNEL_STATUS_OK;
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (total_elements <= kParallelNum_16K) {
      max_core_num = std::min(max_core_num, 4U);
    }
    if (total_elements >= kParallelNum_128K || max_core_num > tensor_batch) {
      max_core_num = params.tensor_batch;
    }
    return CpuKernelUtils::ParallelFor(ctx, params.tensor_batch, params.tensor_batch / max_core_num, shard);
  }
}
uint32_t CheckMaxPoolGrad(CpuKernelContext &ctx) {
  Tensor *tensor_in = ctx.Input(kFirstInputIndex);
  Tensor *tensor_out = ctx.Input(kSecondInputIndex);
  Tensor *out_backprop = ctx.Input(kThirdInputIndex);
  const std::vector<std::string> attr = {"ksize", "strides", "padding"};

  KERNEL_CHECK_FALSE(NormalCheck(ctx, kMaxPoolGradInputNum, kMaxPoolGradOutputNum, attr) == KERNEL_STATUS_OK,
                     KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] NormalCheck input and output failed.");
  // check tensor_in dims
  Tensor &input0 = *(tensor_in);
  auto input_shape_ptr = input0.GetTensorShape();
  KERNEL_CHECK_FALSE(input_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(0).");
  // check tensor_out dims
  Tensor &input1 = *(tensor_out);
  auto output_shape_ptr = input1.GetTensorShape();
  KERNEL_CHECK_FALSE(output_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(1).");
  // check out_backprop dims
  Tensor &input2 = *(out_backprop);
  auto grad_shape_ptr = input2.GetTensorShape();
  KERNEL_CHECK_FALSE(grad_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(2).");
  // check output data
  KERNEL_LOG_DEBUG("[MaxPoolGrad] Parameters check passed.");
  return KERNEL_STATUS_OK;
}
uint32_t GetOutputSizeGrad(int input_size, int kernel_size, int stride, const std::string &padding,
                           int64_t *output_size, int64_t *padding_before, int64_t *padding_after) {
  KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] Stride must be positive.");
  std::string same("SAME"), valid("VALID");
  if (valid == padding) {
    *output_size = (input_size - kernel_size + stride) / stride;
    *padding_before = 0;
    *padding_after = 0;
  } else if (same == padding) {
    *output_size = (input_size + stride - 1) / stride;
    const int64_t padding_need =
      std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
    *padding_before = padding_need / 2;
    *padding_after = padding_need - *padding_before;
  } else {
    KERNEL_LOG_ERROR("[MaxPoolGrad] Padding is invalid.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (*output_size < 0) {
    KERNEL_LOG_ERROR("[MaxPoolGrad] Computed output size is negative.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
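// Worked example: input_size = 5, kernel_size = 3, stride = 2.
// VALID: output = (5 - 3 + 2) / 2 = 2, with no padding.
// SAME:  output = (5 + 2 - 1) / 2 = 3, padding_need = max(0, (3 - 1) * 2 + 3 - 5) = 2,
//        so padding_before = 1 and padding_after = 1.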
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams &params) {
  Format format = data_format.GetFormat();
  KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Format is not NHWC or NCHW.");
  std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
  std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
  std::string padding = ctx.GetAttr("padding")->GetString();
  std::string data_format_str = "";
  if (ctx.GetAttr("data_format") == nullptr) {
    KERNEL_LOG_INFO("[MaxPoolGrad] Attr data_format is empty, using default value NHWC.");
    format = FORMAT_NHWC;
  } else {
    std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
    data_format_str = ctx.GetAttr("data_format")->GetString();

    KERNEL_CHECK_FALSE(format_str_to_enum_map.find(data_format_str) != format_str_to_enum_map.end(),
                       KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] data_format string is invalid.");
    format = format_str_to_enum_map[data_format_str];
  }
  switch (format) {
    case FORMAT_NHWC:
      params.depth = tensor_in_shapes[kFormatNHWCIndexC];
      params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
      params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
      params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
      params.ksize_rows = ksize[kFormatNHWCIndexH];
      params.ksize_cols = ksize[kFormatNHWCIndexW];
      params.ksize_depth = ksize[kFormatNHWCIndexC];
      params.strides_rows = strides[kFormatNHWCIndexH];
      params.strides_cols = strides[kFormatNHWCIndexW];
      params.strides_depth = strides[kFormatNHWCIndexC];
      break;
    case FORMAT_NCHW:
      params.depth = tensor_in_shapes[kFormatNCHWIndexC];
      params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
      params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
      params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
      params.ksize_rows = ksize[kFormatNCHWIndexH];
      params.ksize_cols = ksize[kFormatNCHWIndexW];
      params.ksize_depth = ksize[kFormatNCHWIndexC];
      params.strides_rows = strides[kFormatNCHWIndexH];
      params.strides_cols = strides[kFormatNCHWIndexW];
      params.strides_depth = strides[kFormatNCHWIndexC];
      break;
    default:
      KERNEL_LOG_ERROR("[MaxPoolGrad] Format is not NHWC or NCHW, current is [%d].", format);
      return KERNEL_STATUS_PARAM_INVALID;
  }
  // Only one type of pooling is supported: 2D pooling over height/width;
  // pooling over the channel (depth) dimension is not supported.
  KERNEL_CHECK_FALSE(params.ksize_depth == 1, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Only pooling on width/height is supported.");
  // Padding calc
  if (params.ksize_depth == 1) {
    uint32_t ret1 = GetOutputSizeGrad(params.tensor_rows, params.ksize_rows, params.strides_rows, padding,
                                      &params.out_height, &params.pad_top, &params.pad_bottom);
    uint32_t ret2 = GetOutputSizeGrad(params.tensor_cols, params.ksize_cols, params.strides_cols, padding,
                                      &params.out_width, &params.pad_left, &params.pad_right);
    KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                       "[MaxPoolGrad] An error occurred while calculating output size.");
    params.out_depth = params.depth;
  }
  return KERNEL_STATUS_OK;
}
template <class T>
uint32_t ComputeMaxPoolGradImpl(CpuKernelContext &ctx) {
  TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
  PoolParams params;
  KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Parameters construct failed.")
  return SpatialMaxPoolWithArgMaxHelper<T, int64_t>(ctx, params);
}
uint32_t MaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_CHECK_FALSE(CheckMaxPoolGrad(ctx) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Parameters check failed.");
  DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
  switch (input_type) {
    case DT_FLOAT16:
      return ComputeMaxPoolGradImpl<Eigen::half>(ctx);
    case DT_FLOAT:
      return ComputeMaxPoolGradImpl<float>(ctx);
    case DT_DOUBLE:
      return ComputeMaxPoolGradImpl<double>(ctx);
    case DT_INT8:
      return ComputeMaxPoolGradImpl<int8_t>(ctx);
    case DT_INT16:
      return ComputeMaxPoolGradImpl<int16_t>(ctx);
    case DT_INT32:
      return ComputeMaxPoolGradImpl<int32_t>(ctx);
    case DT_INT64:
      return ComputeMaxPoolGradImpl<int64_t>(ctx);
    case DT_UINT8:
      return ComputeMaxPoolGradImpl<uint8_t>(ctx);
    case DT_UINT16:
      return ComputeMaxPoolGradImpl<uint16_t>(ctx);
    case DT_UINT32:
      return ComputeMaxPoolGradImpl<uint32_t>(ctx);
    case DT_UINT64:
      return ComputeMaxPoolGradImpl<uint64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("[MaxPoolGrad] Input data type [%s] is not supported.", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxPoolGrad, MaxPoolGradCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,30 @@
/**
 * Copyright 2021 Harbin Institute of Technology
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"

namespace aicpu {
class MaxPoolGradCpuKernel : public CpuKernel {
 public:
  ~MaxPoolGradCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_
@@ -0,0 +1,253 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "mirror_pad.h"

#include "Eigen/Core"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/equal_util.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMirrorPadInputNum = 2;
constexpr uint32_t kMirrorPadOutputNum = 1;
const char *kMirrorPad = "MirrorPad";
constexpr int kMinDims = 0;
constexpr int kMaxDims = 5;
constexpr int kTwo = 2;
std::vector<std::string> attr_names;
std::vector<int64_t> input_dim_shape;
std::vector<int64_t> output_dim_shape;
std::vector<std::pair<int64_t, int64_t>> padding_;
std::vector<uint64_t> input_strides_;
std::vector<uint64_t> output_strides_;
int64_t input_num_elements;
int64_t output_num_elements;
int32_t dims_;
int64_t offset_;
}  // namespace
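// Note: these anonymous-namespace globals hold per-invocation state that
// CheckAndInitParams fills in before DoCompute runs; the kernel therefore
// assumes a single MirrorPad invocation at a time on this library instance.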

namespace aicpu {
template <typename T>
uint32_t MirrorPadCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
  // check params
  attr_names.emplace_back("mode");
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMirrorPadInputNum, kMirrorPadOutputNum, attr_names),
                      "[%s] check params failed.", kMirrorPad);
  // get Attr mode
  AttrValue *mode_ptr = ctx.GetAttr("mode");
  auto mode = mode_ptr->GetString();
  KERNEL_CHECK_FALSE((mode == "SYMMETRIC" || mode == "REFLECT"), KERNEL_STATUS_PARAM_INVALID,
                     "Attr mode must be either REFLECT or SYMMETRIC, but got attr mode[%s]", mode.c_str());
  if (mode == "SYMMETRIC") {
    offset_ = 0;
  } else if (mode == "REFLECT") {
    offset_ = 1;
  }
  // get input x
  Tensor *x_ptr = ctx.Input(0);
  data_type_ = x_ptr->GetDataType();
  auto x_shape_ptr = x_ptr->GetTensorShape();
  auto dims = x_shape_ptr->GetDims();
  dims_ = x_shape_ptr->GetDims();
  KERNEL_CHECK_FALSE((kMinDims <= dims && dims <= kMaxDims), KERNEL_STATUS_PARAM_INVALID,
                     "inputs rank not in [%d, %d]: %d", kMinDims, kMaxDims, dims);
  // get input paddings
  Tensor *paddings_ptr = ctx.Input(1);
  auto paddings_shape_ptr = paddings_ptr->GetTensorShape();
  KERNEL_CHECK_FALSE((paddings_ptr->GetDataType() == DT_INT32 || paddings_ptr->GetDataType() == DT_INT64),
                     KERNEL_STATUS_PARAM_INVALID,
                     "Input paddings data type must be DT_INT32 or DT_INT64, "
                     "but got data type[%s]",
                     DTypeStr(paddings_ptr->GetDataType()).c_str());
  KERNEL_CHECK_FALSE(IsMatrix(paddings_shape_ptr->GetDimSizes()) && paddings_shape_ptr->GetDimSize(1) == kTwo,
                     KERNEL_STATUS_PARAM_INVALID, "paddings must be a matrix with 2 columns, but got [%lld] columns.",
                     paddings_shape_ptr->GetDimSize(1));
  KERNEL_CHECK_FALSE(dims == paddings_shape_ptr->GetDimSize(0), KERNEL_STATUS_PARAM_INVALID,
                     "The first dimension of paddings [%lld] must be the rank "
                     "of inputs [%d].",
                     paddings_shape_ptr->GetDimSize(0), dims);
  // Compute the shape of the output tensor, and allocate it.
  auto size_pads_data = reinterpret_cast<T *>(paddings_ptr->GetData());
  input_num_elements = 1;
  output_num_elements = 1;
  for (int d = 0; d < dims_; ++d) {
    int64_t before = *(size_pads_data + d * 2);
    int64_t after = *(size_pads_data + d * 2 + 1);
    padding_.push_back(std::make_pair(before, after));
    KERNEL_CHECK_FALSE(before >= 0 && after >= 0, KERNEL_STATUS_PARAM_INVALID,
                       "paddings must be non-negative: [%lld] [%lld]", before, after);
    if (offset_ == 0) {
      KERNEL_CHECK_FALSE(before <= x_shape_ptr->GetDimSize(d) && after <= x_shape_ptr->GetDimSize(d),
                         KERNEL_STATUS_PARAM_INVALID,
                         "paddings must be no greater "
                         "than the dimension size: [%lld] , [%lld] greater than [%lld] ",
                         before, after, x_shape_ptr->GetDimSize(d));
    } else if (offset_ == 1) {
      KERNEL_CHECK_FALSE(before < x_shape_ptr->GetDimSize(d) && after < x_shape_ptr->GetDimSize(d),
                         KERNEL_STATUS_PARAM_INVALID,
                         "paddings must be less "
                         "than the dimension size: [%lld] , [%lld] not less than [%lld] ",
                         before, after, x_shape_ptr->GetDimSize(d));
    }
    input_dim_shape.push_back(x_shape_ptr->GetDimSize(d));
    int64_t dimi = after + x_shape_ptr->GetDimSize(d) + before;
    input_num_elements *= x_shape_ptr->GetDimSize(d);
    output_num_elements *= dimi;
    output_dim_shape.push_back(dimi);
  }
  return KERNEL_STATUS_OK;
}
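// Example of the two modes on x = [1, 2, 3, 4] with paddings = [[1, 1]]:
// REFLECT   (offset_ = 1) excludes the border element: [2, 1, 2, 3, 4, 3],
//   so each pad amount must be strictly less than the dimension size;
// SYMMETRIC (offset_ = 0) includes it:                 [1, 1, 2, 3, 4, 4],
//   so pads up to the dimension size are allowed.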

template <typename T>
uint32_t MirrorPadCpuKernel::DoCompute(CpuKernelContext &ctx) {
  auto input_data_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  if (output_num_elements == ctx.Input(0)->NumElements() || dims_ == 0) {
    uint64_t copy_size = ctx.Input(0)->GetDataSize();
    auto mem_ret = memcpy_s(output_data, copy_size, input_data_ptr, copy_size);
    KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
                       "Memcpy size[%zu] from input value to output failed.", copy_size);
  } else {
    KERNEL_CHECK_FALSE((MirrorPadCompute<T>(input_data_ptr, output_data) == KERNEL_STATUS_OK),
                       KERNEL_STATUS_PARAM_INVALID, "MirrorPadCompute failed.");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MirrorPadCpuKernel::MirrorPadCompute(T *input_data_ptr, T *output_data_ptr) {
  input_strides_.resize(dims_);
  output_strides_.resize(dims_);
  input_strides_[dims_ - 1] = 1;
  output_strides_[dims_ - 1] = 1;
  for (int i = dims_ - 1; i > 0; --i) {
    input_strides_[i - 1] = input_strides_[i] * input_dim_shape[i];
    output_strides_[i - 1] = output_strides_[i] * output_dim_shape[i];
  }
  std::vector<std::pair<int64_t, int64_t>> index;
  index.resize(dims_);
  index[dims_ - 1] = std::make_pair(output_strides_[dims_ - 1] * padding_[dims_ - 1].first,
                                    output_strides_[dims_ - 1] * padding_[dims_ - 1].second);
  for (int i = dims_ - 1; i > 0; --i) {
    index[i - 1].first = index[i].first + output_strides_[i - 1] * padding_[i - 1].first;
    index[i - 1].second = index[i].second + output_strides_[i - 1] * padding_[i - 1].second;
  }
  if (dims_ == 1) {
    memcpy_s(output_data_ptr, padding_[0].first * sizeof(T), input_data_ptr + offset_, padding_[0].first * sizeof(T));
    memcpy_s(output_data_ptr + padding_[0].first + input_num_elements, padding_[0].second * sizeof(T),
             input_data_ptr + input_num_elements - padding_[0].second - offset_, padding_[0].second * sizeof(T));
    memcpy_s(output_data_ptr + padding_[0].first, input_num_elements * sizeof(T), input_data_ptr,
             input_num_elements * sizeof(T));
    std::reverse(output_data_ptr, output_data_ptr + padding_[0].first);
    std::reverse(output_data_ptr + padding_[0].first + input_num_elements,
                 output_data_ptr + padding_[0].first + input_num_elements + padding_[0].second);
    return KERNEL_STATUS_OK;
  }
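
  // General N-D path: first copy every innermost input row to its padded
  // position (tracking each row's start in output_pos), then, from the
  // innermost dimension outwards, mirror-fill the pad regions of each block
  // with whole-stride memcpy_s calls.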
|
||||
|
||||
std::vector<int64_t> pos;
|
||||
std::vector<int64_t> output_pos, tmp_pos;
|
||||
pos.resize(dims_ - 1, 0);
|
||||
int64_t output_index = index[0].first;
|
||||
int64_t inx = 0, copy_size = sizeof(T) * input_dim_shape[dims_ - 1];
|
||||
while (inx < input_num_elements) {
|
||||
memcpy_s(output_data_ptr + output_index, copy_size, input_data_ptr + inx, copy_size);
|
||||
output_pos.push_back(output_index);
|
||||
pos[dims_ - kTwo] += 1;
|
||||
int64_t dep = dims_ - 1;
    for (int64_t i = dims_ - 2; i >= 0; --i) {
      if (i > 0 && pos[i] >= input_dim_shape[i]) {
        pos[i] -= input_dim_shape[i];
        pos[i - 1] += 1;
        dep = i;
      } else {
        break;
      }
    }
    output_index += index[dep].first + index[dep].second + input_dim_shape[dims_ - 1];
    inx += input_dim_shape[dims_ - 1];
  }
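  // Second pass: walk from the innermost dimension outward and fill each pad
  // region by mirror-copying whole blocks from the already-written output.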
  for (int64_t i = dims_ - 1; i >= 0; --i) {
    int64_t block_size = output_strides_[i], count = 0;
    copy_size = block_size * sizeof(T);
    for (auto item : output_pos) {
      T *base_output_ptr1 = output_data_ptr + item;
      for (int64_t cnt = 1; cnt <= padding_[i].first; ++cnt) {
        memcpy_s(base_output_ptr1 - cnt * block_size, copy_size, base_output_ptr1 + (cnt - 1 + offset_) * block_size,
                 copy_size);
      }
      T *base_output_ptr2 = output_data_ptr + item + input_dim_shape[i] * block_size;
      for (int64_t cnt = 1; cnt <= padding_[i].second; ++cnt) {
        memcpy_s(base_output_ptr2 + (cnt - 1) * block_size, copy_size, base_output_ptr2 - (cnt + offset_) * block_size,
                 copy_size);
      }
      if (i > 0 && count % input_dim_shape[i - 1] == 0) {
        tmp_pos.push_back(item - padding_[i].first * block_size);
      }
      ++count;
    }
    output_pos.swap(tmp_pos);
    tmp_pos.clear();
  }
  return KERNEL_STATUS_OK;
}

uint32_t MirrorPadCpuKernel::Compute(CpuKernelContext &ctx) {
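  // The paddings tensor (input 1) may arrive as int32 or int64; the data type
  // of x drives the main dispatch below.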
  auto padding_type = ctx.Input(1)->GetDataType();
  if (padding_type == DT_INT32) {
    KERNEL_CHECK_FALSE((CheckAndInitParams<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "CheckAndInitParams failed.");
  } else {
    KERNEL_CHECK_FALSE((CheckAndInitParams<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "CheckAndInitParams failed.");
  }
  switch (data_type_) {
    case DT_FLOAT16:
      return DoCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return DoCompute<float>(ctx);
    case DT_DOUBLE:
      return DoCompute<double>(ctx);
    case DT_BOOL:
      return DoCompute<bool>(ctx);
    case DT_INT8:
      return DoCompute<int8_t>(ctx);
    case DT_INT16:
      return DoCompute<int16_t>(ctx);
    case DT_INT32:
      return DoCompute<int32_t>(ctx);
    case DT_INT64:
      return DoCompute<int64_t>(ctx);
    case DT_UINT8:
      return DoCompute<uint8_t>(ctx);
    case DT_UINT16:
      return DoCompute<uint16_t>(ctx);
    case DT_COMPLEX64:
      return DoCompute<std::complex<float>>(ctx);
    case DT_COMPLEX128:
      return DoCompute<std::complex<double>>(ctx);
    default:
      KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

REGISTER_CPU_KERNEL(kMirrorPad, MirrorPadCpuKernel);
}  // namespace aicpu
@ -0,0 +1,64 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
#define AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_

#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"

namespace aicpu {
class MirrorPadCpuKernel : public CpuKernel {
 public:
  MirrorPadCpuKernel() = default;
  ~MirrorPadCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  /**
   * @brief Init params
   * @param ctx cpu kernel context
   * @return status code
   */
  template <typename T>
  uint32_t CheckAndInitParams(CpuKernelContext &ctx);

  /**
   * @brief Apply mirror padding
   * @param input_data_ptr pointer to the input data
   * @param output_data_ptr pointer to the output data
   * @return status code
   */
  template <typename T>
  uint32_t MirrorPadCompute(T *input_data_ptr, T *output_data_ptr);

  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);

 private:
  DataType data_type_;
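  // The members below are referenced by mirror_pad.cc but absent from this hunk;
  // their types are inferred from that usage and are an assumption, not part of
  // the original diff.
  int64_t dims_;
  int64_t offset_;
  int64_t input_num_elements;
  int64_t output_num_elements;
  std::vector<int64_t> input_dim_shape;
  std::vector<int64_t> output_dim_shape;
  std::vector<int64_t> input_strides_;
  std::vector<int64_t> output_strides_;
  std::vector<std::pair<int64_t, int64_t>> padding_;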
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
@ -0,0 +1,320 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "multi_margin_loss.h"

#include <Eigen/Dense>
#include <algorithm>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const char *kMultiMarginLoss = "MultiMarginLoss";
// When the input data size exceeds kParallelDataNum, use the parallel path.
const int64_t kParallelDataNum = 28 * 1024;
}  // namespace

namespace aicpu {
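// MultiMarginLoss expects x of shape [batch_size, num_classes], an int64 target
// of shape [batch_size], and an optional weight vector of shape [num_classes]
// (validated in MultiMarginLossCheck below).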
uint32_t MultiMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  uint32_t input_num = 3;
  constexpr int SERV_TYPE_SET = 2;
  if (ctx.GetInputsSize() == SERV_TYPE_SET) {
    input_num = SERV_TYPE_SET;
  }
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, kOutputNum), "MultiMarginLoss check input and output number failed.");
  KERNEL_HANDLE_ERROR(MultiMarginLossCheck(ctx), "MultiMarginLoss check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    case DT_FLOAT16:
      return MultiMarginLossComputeFP16<Eigen::half>(ctx);
    case DT_FLOAT:
      return MultiMarginLossCompute<float>(ctx);
    case DT_DOUBLE:
      return MultiMarginLossCompute<double>(ctx);
    default:
      KERNEL_LOG_ERROR("MultiMarginLoss kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MultiMarginLossCpuKernel::MultiMarginLossCheck(CpuKernelContext &ctx) {
  auto input_0 = ctx.Input(0);
  auto input_1 = ctx.Input(1);

  constexpr int SERV_TYPE_SET = 2;
  constexpr int SERV_TYPE_QUERY = 3;

  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input1_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of target [%s] should be int64.", DTypeStr(input1_type).c_str())
  auto target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  int64_t target_num = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  if (ctx.GetInputsSize() == SERV_TYPE_QUERY) {
    auto input_weight = ctx.Input(2);
    DataType input2_type = input_weight->GetDataType();
    KERNEL_CHECK_FALSE((input2_type == input0_type), KERNEL_STATUS_PARAM_INVALID,
                       "weight should have the same dtype as x, but got [%s].", DTypeStr(input2_type).c_str())
  }
  KERNEL_CHECK_FALSE((ctx.Input(0)->GetTensorShape()->GetDims() == SERV_TYPE_SET), KERNEL_STATUS_PARAM_INVALID,
                     "Rank of x should be 2.")
  KERNEL_CHECK_FALSE((ctx.Input(1)->GetTensorShape()->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID,
                     "Rank of target should be 1.")
  KERNEL_CHECK_FALSE((batch_size == ctx.Input(1)->GetTensorShape()->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
                     "[%s]'s x's shape[0] should be the same as target's shape[0].", ctx.GetOpType().c_str())
  for (int64_t i = 0; i < batch_size; i++) {
    KERNEL_CHECK_FALSE(*(target + i) >= 0 && (*(target + i) < target_num), KERNEL_STATUS_PARAM_INVALID,
                       "[%s]'s target out of range", ctx.GetOpType().c_str());
  }
  return KERNEL_STATUS_OK;
}
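
// Per row, this computes the standard multi-class margin loss
// (cf. torch.nn.MultiMarginLoss):
//   loss(x, y) = sum_{d != y} max(0, margin - x[y] + x[d])^p / num_classes,
// with each term additionally scaled by weight[y] when a weight vector is given.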
template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossCompute(CpuKernelContext &ctx) {
  constexpr int SERV_TYPE_BRWD = 1;
  constexpr int SERV_TYPE_SET = 2;
  constexpr int ADULT_AGE = 4;
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  T *input_weight = nullptr;
  bool weight_defined = (ctx.GetInputsSize() == 3);
  if (weight_defined) {
    input_weight = reinterpret_cast<T *>(ctx.Input(2)->GetData());
    int64_t weight_length = ctx.Input(2)->NumElements();
    int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
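    // If fewer weights than classes were supplied, zero-fill the tail. Note
    // that this writes into the weight tensor's own buffer, which assumes it
    // has capacity for num_classes elements.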
    if (weight_length < x_length) {
      for (int64_t i = 0; i < x_length - weight_length; i++) {
        input_weight[i + weight_length] = static_cast<T>(0);
      }
    }
  }
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  AttrValue *Attr_p = ctx.GetAttr("p");
  int p = (Attr_p == nullptr) ? 1 : Attr_p->GetInt();
  if (p != SERV_TYPE_BRWD && p != SERV_TYPE_SET) {
    KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  AttrValue *Attr_margin = ctx.GetAttr("margin");
  T margin = static_cast<T>((Attr_margin == nullptr) ? 1 : Attr_margin->GetFloat());
  AttrValue *Attr_red = ctx.GetAttr("reduction");
  std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  Eigen::Array<T, Eigen::Dynamic, 1> output(batch_size, 1);
  output.setZero();
  auto output_data = output.data();
  int64_t min_core_num = 1;
  int64_t max_core_num = std::max(min_core_num, static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
  auto shard_multi_margin_loss = [&](size_t start, size_t end) {
    int64_t once_compute_thread_size = end - start;
    Eigen::Array<T, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    if (dims == 0) {
      KERNEL_LOG_ERROR("dims could not be 0.");
    }
    for (int64_t m = 0; m < (once_compute_thread_size) / dims; m++) {
      int64_t i = start / dims;
      for (int64_t d = 0; d < dims; d++) {
        if (d == input_target[i]) {
          continue;
        }
        calc_data[d] = margin + input_x[start + d] - input_x[start + input_target[i]];
        if (calc_data[d] > T(0)) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= (input_weight[input_target[i]]);
          }
          output_data[i] += calc_data[d];
        }
      }
      output_data[i] = output_data[i] / static_cast<T>(dims);
      start += dims;
    }
  };
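  // Inputs at or below the 28 KB threshold are reduced serially; larger inputs
  // are sharded row-wise through ParallelFor.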
  if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
    Eigen::Array<T, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    T sum = static_cast<T>(0);
    for (int64_t i = 0; i < batch_size; i++) {
      int64_t target_idx = input_target[i];
      sum = static_cast<T>(0);
      calc.setZero();
      for (int64_t d = 0; d < dims; d++) {
        if (d == target_idx) {
          continue;
        }
        calc_data[d] = margin + input_x[i * dims + d] - input_x[i * dims + target_idx];
        if (calc_data[d] > T(0)) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<T>(input_weight[target_idx]);
          }
          sum += calc_data[d];
        }
      }
      sum = sum / static_cast<T>(dims);
      output_data[i] = sum;
    }
  } else {
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * ADULT_AGE * (batch_size / max_core_num + 1),
                                shard_multi_margin_loss);
  }
  if (reduction == "mean") {
    *output_y = output.mean();
  }
  if (reduction == "sum") {
    *output_y = output.sum();
  }
  if (reduction == "none") {
    for (int64_t t = 0; t < batch_size; t++) {
      *(output_y + t) = output_data[t];
    }
  }
  return KERNEL_STATUS_OK;
}
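
// Half-precision variant: differences are accumulated in float to limit
// rounding error, and cast back to Eigen::half only on output.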
template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossComputeFP16(CpuKernelContext &ctx) {
  constexpr int SERV_TYPE_BRWD = 1;
  constexpr int SERV_TYPE_SET = 2;
  constexpr int ADULT_AGE = 4;
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  T *input_weight = nullptr;
  bool weight_defined = (ctx.GetInputsSize() == 3);
  if (weight_defined) {
    input_weight = reinterpret_cast<T *>(ctx.Input(2)->GetData());
    int64_t weight_length = ctx.Input(2)->NumElements();
    int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
    if (weight_length < x_length) {
      for (int64_t i = 0; i < x_length - weight_length; i++) {
        input_weight[i + weight_length] = static_cast<T>(0);
      }
    }
  }
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  AttrValue *Attr_p = ctx.GetAttr("p");
  int p = (Attr_p == nullptr) ? 1 : Attr_p->GetInt();
  if (p != SERV_TYPE_BRWD && p != SERV_TYPE_SET) {
    KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  AttrValue *Attr_margin = ctx.GetAttr("margin");
  float margin = static_cast<float>((Attr_margin == nullptr) ? 1 : Attr_margin->GetFloat());
  AttrValue *Attr_red = ctx.GetAttr("reduction");
  std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  Eigen::Array<float, Eigen::Dynamic, 1> output(batch_size, 1);
  output.setZero();
  auto output_data = output.data();
  int64_t min_core_num = 1;
  int64_t max_core_num = std::max(min_core_num, static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
  auto shard_multi_margin_loss = [&](size_t start, size_t end) {
    int64_t once_compute_thread_size = end - start;
    Eigen::Array<float, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    if (dims == 0) {
      KERNEL_LOG_ERROR("dims could not be 0.");
    }
    for (int64_t m = 0; m < (once_compute_thread_size) / dims; m++) {
      int64_t i = start / dims;
      for (int64_t d = 0; d < dims; d++) {
        if (d == input_target[i]) {
          continue;
        }
        calc_data[d] =
          margin + static_cast<float>(input_x[start + d]) - static_cast<float>(input_x[start + input_target[i]]);
        if (calc_data[d] > 0) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<float>(input_weight[input_target[i]]);
          }
          output_data[i] += calc_data[d];
        }
      }
      output_data[i] = output_data[i] / static_cast<float>(dims);
      start += dims;
    }
  };
  if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
    Eigen::Array<float, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    float sum = 0;
    for (int64_t i = 0; i < batch_size; i++) {
      int64_t target_idx = input_target[i];
      sum = 0;
      calc.setZero();
      for (int64_t d = 0; d < dims; d++) {
        if (d == target_idx) {
          continue;
        }
        calc_data[d] =
          margin + static_cast<float>(input_x[i * dims + d]) - static_cast<float>(input_x[i * dims + target_idx]);
        if (calc_data[d] > 0) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<float>(input_weight[target_idx]);
          }
          sum += calc_data[d];
        }
      }
      sum = sum / static_cast<float>(dims);
      output_data[i] = sum;
    }
  } else {
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * ADULT_AGE * (batch_size / max_core_num + 1),
                                shard_multi_margin_loss);
  }
  if (reduction == "mean") {
    *output_y = static_cast<T>(output.mean());
  }
  if (reduction == "sum") {
    *output_y = static_cast<T>(output.sum());
  }
  if (reduction == "none") {
    for (int64_t t = 0; t < batch_size; t++) {
      *(output_y + t) = static_cast<T>(output_data[t]);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMultiMarginLoss, MultiMarginLossCpuKernel);
}  // namespace aicpu
@ -0,0 +1,41 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MultiMarginLossCpuKernel : public CpuKernel {
 public:
  MultiMarginLossCpuKernel() = default;
  ~MultiMarginLossCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MultiMarginLossCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MultiMarginLossCompute(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MultiMarginLossComputeFP16(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_