!48020 migrate PadV3 and other aicpu ops

Merge pull request !48020 from 李林杰/0118_block_apicpu_ops_that_might_have_issues
This commit is contained in:
i-robot 2023-01-20 01:17:25 +00:00 committed by Gitee
commit 77442c99a2
11 changed files with 1336 additions and 6 deletions

View File

@@ -539,6 +539,8 @@ constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
constexpr auto kPadDOpName = "PadD";
constexpr auto kPadV3GradOpName = "PadV3Grad";
constexpr auto kPadV3OpName = "PadV3";
constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";

View File

@@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logical_xor.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLogicalXor = "LogicalXor";
// When the input data size exceeds kParallelDataNum, the parallel compute path is used
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
} // namespace
namespace aicpu {
uint32_t LogicalXorCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LogicalXor check input and output number failed.");
KERNEL_HANDLE_ERROR(LogicalXorCheck(ctx), "LogicalXor check params or bcast failed.");
uint32_t result = LogicalXorCompute<bool>(ctx);
if (result != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("LogicalXor kernel compute failed.");
return result;
}
return KERNEL_STATUS_OK;
}
uint32_t LogicalXorCpuKernel::LogicalXorCheck(CpuKernelContext &ctx) {
// non-null checks for input_0, input_1 and output were already done in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type && input0_type == DT_BOOL), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] must be the same as "
"input1 [%s], and both must be bool.",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"LogicalXorCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/**
 * SpecialCompute handles the cases that avoid full broadcasting
 * (for bool operands, XOR is computed as inequality):
 * 1. input1 and input2 have the same shape
 * 2. input1 is a scalar or a 1D tensor with a single element
 * 3. input2 is a scalar or a 1D tensor with a single element
 * Any other shape combination is dispatched to BcastCompute instead.
 */
template <typename T>
void LogicalXorCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, bool *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) != *(input2 + i);
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *input1 != *(input2 + i);
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) != *input2;
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t LogicalXorCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
int64_t input_0_elements_nums = ctx.Input(0)->NumElements();
int64_t input_1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type =
input_0_elements_nums == input_1_elements_nums
? BcastShapeType::SAME_SHAPE
: (input_0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
SpecialCompute<T>(type, start, end, input_0, input_1, out);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
"LogicalXor Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, input_0, input_1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogicalXorCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
"LogicalXor Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogicalXorCpuKernel::LogicalXorCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast_needed = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast_needed) {
return NoBcastCompute<T>(ctx);
}
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
REGISTER_CPU_KERNEL(kLogicalXor, LogicalXorCpuKernel);
} // namespace aicpu
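
For reference, the three fast paths of SpecialCompute reduce to the standalone sketch below (plain C++, no aicpu runtime; the name SpecialXor and the std::vector plumbing are illustrative only). For bool operands, XOR is simply inequality:

#include <cstddef>
#include <iostream>
#include <vector>

enum class BcastShapeType { SAME_SHAPE, X_ONE_ELEMENT, Y_ONE_ELEMENT };

// Mirrors the three SpecialCompute branches for bool inputs.
void SpecialXor(BcastShapeType type, const std::vector<bool> &x, const std::vector<bool> &y, std::vector<bool> *out) {
  for (std::size_t i = 0; i < out->size(); ++i) {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:    (*out)[i] = x[i] != y[i]; break;
      case BcastShapeType::X_ONE_ELEMENT: (*out)[i] = x[0] != y[i]; break;
      case BcastShapeType::Y_ONE_ELEMENT: (*out)[i] = x[i] != y[0]; break;
    }
  }
}

int main() {
  std::vector<bool> x = {true, true, false, false};
  std::vector<bool> y = {true};  // one-element input broadcasts against x
  std::vector<bool> out(x.size());
  SpecialXor(BcastShapeType::Y_ONE_ELEMENT, x, y, &out);
  for (bool v : out) std::cout << v << ' ';  // prints: 0 0 1 1
  std::cout << '\n';
  return 0;
}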

View File

@@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
#define AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogicalXorCpuKernel : public CpuKernel {
public:
LogicalXorCpuKernel() = default;
~LogicalXorCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t LogicalXorCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, bool *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t LogicalXorCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_

View File

@@ -0,0 +1,533 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pad_v3.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kPadV3 = "PadV3";
constexpr int64_t kMinCoreNum = 1;
constexpr int64_t kParallelNum = 1024 * 16;
constexpr int64_t kInput3D = 3;
constexpr int64_t kInput4D = 4;
constexpr int64_t kInput5D = 5;
constexpr int64_t kPadding1D = 2;
constexpr int64_t kPadding2D = 4;
constexpr int64_t kPadding3D = 6;
constexpr int64_t kNum2 = 2;
constexpr int64_t kNum3 = 3;
constexpr int64_t kNum4 = 4;
const std::vector<std::string> mode_list = {"constant", "reflect", "edge"};
using float16 = Eigen::half;
#define PAD_V3_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3 kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PadV3CpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input x failed")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input paddings failed")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output y failed")
KERNEL_HANDLE_ERROR(CheckAndInitParams(ctx), "PadV3 check and init params failed.");
auto paddings_type = ctx.Input(1)->GetDataType();
if (paddings_type == DT_INT32) {
KERNEL_CHECK_FALSE((GetPaddingsAndSetOutputShape<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Get paddings and set output shape failed.");
} else if (paddings_type == DT_INT64) {
KERNEL_CHECK_FALSE((GetPaddingsAndSetOutputShape<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Get paddings and set output shape failed.");
} else {
KERNEL_LOG_ERROR("PadV3 paddings data type [%s] not supported.", DTypeStr(paddings_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto data_type_ = ctx.Input(0)->GetDataType();
switch (data_type_) {
PAD_V3_COMPUTE_CASE(DT_INT8, int8_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT16, int16_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT32, int32_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT64, int64_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
PAD_V3_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
PAD_V3_COMPUTE_CASE(DT_FLOAT, float, ctx)
PAD_V3_COMPUTE_CASE(DT_DOUBLE, double, ctx)
PAD_V3_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
PAD_V3_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("PadV3 kernel data type [%s] not supported.", DTypeStr(data_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
int64_t PadV3CpuKernel::EdgeIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
int64_t i_start) {
int64_t ip;
if (now < pad_value) {
ip = pad_value;
} else if (now >= pad_value && now < input_value + pad_value) {
ip = now;
} else {
ip = input_value + pad_value - 1;
}
ip = ip - o_start + i_start;
return ip;
}
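// Worked example (illustrative): with pad_value = 2 and input_value = 4,
// o_start = 2 and i_start = 0, output positions 0..7 map to input indices
// 0 0 0 1 2 3 3 3, i.e. the border element is repeated into the padded region.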
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute1D(T *input, T *output, int64_t p) {
int64_t nplane = 0;
int64_t input_w = input_shape[kNum2];
int64_t output_w = output_shape.end()[-1];
int64_t pad_l = paddings[0];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t ip_x;
for (int64_t j = 0; j < output_w; ++j) {
ip_x = EdgeIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
T *dest_p = output + p * output_w * (nplane + 1) + j;
T *src_p = input + p * input_w * (nplane + 1) + ip_x;
*dest_p = *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute2D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t nplane = 0;
int64_t input_h = input_shape[kNum2];
int64_t input_w = input_shape[kNum3];
int64_t output_h = input_h + pad_t + paddings[kNum3];
int64_t output_w = input_w + pad_l + paddings[1];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t ip_x, ip_y;
for (int64_t i = 0; i < output_h; ++i) {
for (int64_t j = 0; j < output_w; ++j) {
ip_x = EdgeIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
ip_y = EdgeIndexCalculate(pad_t, i, input_h, o_start_y, i_start_y);
T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute3D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t pad_f = paddings[kNum4];
int64_t nplane = 0;
int64_t input_d = input_shape[kNum2];
int64_t input_h = input_shape[kNum3];
int64_t input_w = input_shape[kNum4];
int64_t output_d = output_shape[kNum2];
int64_t output_h = output_shape[kNum3];
int64_t output_w = output_shape[kNum4];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t i_start_z = std::max(int64_t(0), -pad_f);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t o_start_z = std::max(int64_t(0), pad_f);
int64_t ip_x, ip_y, ip_z;
for (int64_t k = 0; k < output_d; ++k) {
for (int64_t j = 0; j < output_h; ++j) {
for (int64_t i = 0; i < output_w; ++i) {
ip_x = EdgeIndexCalculate(pad_l, i, input_w, o_start_x, i_start_x);
ip_y = EdgeIndexCalculate(pad_t, j, input_h, o_start_y, i_start_y);
ip_z = EdgeIndexCalculate(pad_f, k, input_d, o_start_z, i_start_z);
T *dest_p =
output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
T *src_p =
input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeModeCompute(CpuKernelContext &ctx, int64_t p) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (paddings_num == kPadding1D) {
EdgeCompute1D<T>(input, output, p);
} else if (paddings_num == kPadding2D) {
EdgeCompute2D<T>(input, output, p);
} else if (paddings_num == kPadding3D) {
EdgeCompute3D<T>(input, output, p);
}
return KERNEL_STATUS_OK;
}
int64_t PadV3CpuKernel::ReflectIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
int64_t i_start) {
int64_t ip;
if (now < pad_value) {
ip = pad_value + pad_value - now;
} else if (now >= pad_value && now < input_value + pad_value) {
ip = now;
} else {
ip = (input_value + pad_value - 1) + (input_value + pad_value - 1) - now;
}
ip = ip - o_start + i_start;
return ip;
}
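// Worked example (illustrative): with pad_value = 2 and input_value = 4,
// output positions 0..7 map to input indices 2 1 0 1 2 3 2 1, i.e. indices
// fold back at the borders without repeating the border element itself.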
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute1D(T *input, T *output, int64_t p) {
int64_t nplane = 0;
int64_t input_w = input_shape[kNum2];
int64_t output_w = output_shape.end()[-1];
int64_t pad_l = paddings[0];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t ip_x;
for (int64_t j = 0; j < output_w; ++j) {
ip_x = ReflectIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
T *dest_p = output + p * output_w * (nplane + 1) + j;
T *src_p = input + p * input_w * (nplane + 1) + ip_x;
*dest_p = *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute2D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t nplane = 0;
int64_t input_h = input_shape[kNum2];
int64_t input_w = input_shape[kNum3];
int64_t output_h = input_h + pad_t + paddings[kNum3];
int64_t output_w = input_w + pad_l + paddings[1];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t ip_x, ip_y;
for (int64_t i = 0; i < output_h; ++i) {
for (int64_t j = 0; j < output_w; ++j) {
ip_x = ReflectIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
ip_y = ReflectIndexCalculate(pad_t, i, input_h, o_start_y, i_start_y);
T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute3D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t pad_f = paddings[kNum4];
int64_t nplane = 0;
int64_t input_d = input_shape[kNum2];
int64_t input_h = input_shape[kNum3];
int64_t input_w = input_shape[kNum4];
int64_t output_d = output_shape[kNum2];
int64_t output_h = output_shape[kNum3];
int64_t output_w = output_shape[kNum4];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t i_start_z = std::max(int64_t(0), -pad_f);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t o_start_z = std::max(int64_t(0), pad_f);
int64_t ip_x, ip_y, ip_z;
for (int64_t k = 0; k < output_d; ++k) {
for (int64_t j = 0; j < output_h; ++j) {
for (int64_t i = 0; i < output_w; ++i) {
ip_x = ReflectIndexCalculate(pad_l, i, input_w, o_start_x, i_start_x);
ip_y = ReflectIndexCalculate(pad_t, j, input_h, o_start_y, i_start_y);
ip_z = ReflectIndexCalculate(pad_f, k, input_d, o_start_z, i_start_z);
T *dest_p =
output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
T *src_p =
input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectModeCompute(CpuKernelContext &ctx, int64_t p) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (paddings_num == kPadding1D) {
ReflectCompute1D<T>(input, output, p);
} else if (paddings_num == kPadding2D) {
ReflectCompute2D<T>(input, output, p);
} else if (paddings_num == kPadding3D) {
ReflectCompute3D<T>(input, output, p);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ConstantModeCompute(CpuKernelContext &ctx, T constant_values) {
auto input_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t output_num = ctx.Output(0)->NumElements();
int64_t input_num = 1;
std::vector<int64_t> input_strides(input_dims, 0);
std::vector<int64_t> output_strides(input_dims, 0);
input_strides[input_dims - 1] = 1;
output_strides[input_dims - 1] = 1;
for (int64_t i = input_dims - 1; i >= 1; --i) {
input_strides[i - 1] = input_strides[i] * input_shape[i];
output_strides[i - 1] = output_strides[i] * output_shape[i];
}
std::vector<int64_t> offsets(input_dims, 0);
std::vector<int64_t> extents(input_dims, 0);
for (int64_t i = input_dims - 1; i >= 0; --i) {
extents[i] = input_shape[i];
if (paddings[i * kNum2] < 0) {
extents[i] += paddings[i * kNum2];
offsets[i] = -paddings[i * kNum2];
paddings[i * kNum2] = 0;
}
if (paddings[i * kNum2 + 1] < 0) {
extents[i] += paddings[i * kNum2 + 1];
paddings[i * kNum2 + 1] = 0;
}
input_shape[i] = extents[i];
input_num *= input_shape[i];
}
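// Note: a negative padding crops the input. extents[i] shrinks by the negative
// amount, offsets[i] records where the crop starts, and the padding entry is
// clamped to 0 so the copy logic below only ever widens the tensor.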
std::vector<T> input_values;
for (int64_t i = 0; i < input_num; ++i) {
int64_t k = i;
int64_t p = 0;
for (int64_t j = input_dims - 1; j >= 0; --j) {
p += (offsets[j] + (k % extents[j])) * input_strides[j];
k /= extents[j];
}
input_values.push_back(*(input_ptr + p));
}
for (int64_t i = 0; i < output_num; ++i) {
*(output_ptr + i) = constant_values;
}
if (input_dims == 1) {
for (int64_t i = 0; i < input_num; ++i) {
*(output_ptr + paddings[0] + i) = input_values[i];
}
return KERNEL_STATUS_OK;
}
std::vector<int64_t> i_inx_add(input_dims, 0);
std::vector<int64_t> o_inx_add(input_dims, 0);
i_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1)];
o_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1) + 1];
for (int64_t i = input_dims - 1; i >= 1; --i) {
i_inx_add[i - 1] = i_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1)];
o_inx_add[i - 1] = o_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1) + 1];
}
int64_t i_inx = 0;
int64_t o_inx = i_inx_add[0];
std::vector<int64_t> pos(input_dims - 1, 0);
while (i_inx < input_num) {
for (int64_t i = 0; i < input_shape[input_dims - 1]; ++i) {
*(output_ptr + o_inx + i) = input_values[i_inx + i];
}
pos[input_dims - kNum2] += 1;
int64_t dep = input_dims - 1;
for (int64_t i = input_dims - kNum2; i >= 0; --i) {
if (i > 0 && pos[i] >= input_shape[i]) {
pos[i] -= input_shape[i];
pos[i - 1] += 1;
dep = i;
} else {
break;
}
}
o_inx += i_inx_add[dep] + o_inx_add[dep] + input_shape[input_dims - 1];
i_inx += input_shape[input_dims - 1];
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::DoCompute(CpuKernelContext &ctx) {
if (mode == "constant") {
T constant_values = static_cast<T>(0);
if (ctx.Input(kNum2) != nullptr) {
constant_values = *(reinterpret_cast<T *>(ctx.Input(kNum2)->GetData()));
} else {
KERNEL_LOG_DEBUG("Input [constant_values] is absent, use default value [0]");
}
for (int64_t i = 0; i < input_dims / kNum2; ++i) {
int64_t u = paddings[i * kNum2];
int64_t v = paddings[i * kNum2 + 1];
paddings[i * kNum2] = paddings[kNum2 * (input_dims - i - 1)];
paddings[i * kNum2 + 1] = paddings[kNum2 * (input_dims - i - 1) + 1];
paddings[kNum2 * (input_dims - i - 1)] = u;
paddings[kNum2 * (input_dims - i - 1) + 1] = v;
}
ConstantModeCompute<T>(ctx, constant_values);
} else if (mode == "reflect") {
auto shard_padv3_reflect = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
ReflectModeCompute<T>(ctx, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = data_num > kParallelNum;
if (enable_parallel) {
const int64_t max_core_num =
std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_reflect),
"PadV3 Compute failed.");
} else {
shard_padv3_reflect(0, data_num);
}
} else if (mode == "edge") {
auto shard_padv3_edge = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
EdgeModeCompute<T>(ctx, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = data_num > kParallelNum;
if (enable_parallel) {
const int64_t max_core_num =
std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_edge),
"PadV3 Compute failed.");
} else {
shard_padv3_edge(0, data_num);
}
}
return KERNEL_STATUS_OK;
}
uint32_t PadV3CpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
if (ctx.GetAttr("mode") == nullptr) {
mode = "constant";
KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [constant]");
} else {
mode = ctx.GetAttr("mode")->GetString();
const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
if (!is_mode_available) {
KERNEL_LOG_ERROR(
"Attr [mode] must be included in [constant, reflect, edge], but got "
"[%s]",
mode.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (ctx.GetAttr("paddings_contiguous") != nullptr) {
paddings_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
} else {
paddings_contiguous = true;
KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("Tensor y dtype [%s] must be the same as x dtype [%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
input_dims = ctx.Input(0)->GetTensorShape()->GetDims();
const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
paddings_num = ctx.Input(1)->NumElements();
KERNEL_CHECK_FALSE(paddings_shape.size() == 1 && paddings_num == input_dims * kNum2, KERNEL_STATUS_PARAM_INVALID,
"Paddings shape is not supported");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::GetPaddingsAndSetOutputShape(CpuKernelContext &ctx) {
auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
paddings = std::vector<int64_t>(input_dims * kNum2, 0);
for (int64_t i = 0; i < paddings_num; i += kNum2) {
paddings[i] = static_cast<int64_t>(paddings_ptr[paddings_num - i - kNum2]);
paddings[i + 1] = static_cast<int64_t>(paddings_ptr[paddings_num - i - 1]);
}
if (mode == "edge" || mode == "reflect" || (mode == "constant" && !paddings_contiguous)) {
paddings_num = paddings_num - kNum4;
}
if (!paddings_contiguous) {
std::vector<int64_t> tmp = paddings;
for (int64_t i = 0; i < paddings_num; ++i) {
if (i % kNum2 == 0) {
paddings[i] = tmp[i / kNum2];
} else {
paddings[i] = tmp[(i + paddings_num) / kNum2];
}
}
}
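// Illustrative example: with paddings_contiguous == false and paddings_num == 4,
// the stored order (begin0, begin1, end0, end1) is interleaved back into
// (begin0, end0, begin1, end1) so the loops below can read begin/end pairs.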
input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
parallelSliceNum = 1;
for (int64_t i = 0; i < input_dims - paddings_num / kNum2; ++i) {
parallelSliceNum *= input_shape[i];
}
for (int64_t i = 0; i < paddings_num / kNum2; ++i) {
output_shape.end()[-(i + 1)] += (paddings[i * kNum2] + paddings[i * kNum2 + 1]);
KERNEL_CHECK_FALSE(output_shape.end()[-(i + 1)] > 0, KERNEL_STATUS_PARAM_INVALID,
"Each dimension of output_shape must be greater than 0");
KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] >= std::max(-paddings[i * kNum2], -paddings[i * kNum2 + 1]),
KERNEL_STATUS_PARAM_INVALID,
"The absolute value of a negative padding must not exceed the corresponding input dimension");
if (mode == "reflect") {
KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] > std::max(paddings[i * kNum2], paddings[i * kNum2 + 1]),
KERNEL_STATUS_PARAM_INVALID,
"Padding size should be less than the corresponding input dimension");
}
}
if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
KERNEL_LOG_DEBUG("Output tensor shape set successfully, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
} else {
KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPadV3, PadV3CpuKernel);
} // namespace aicpu
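
As a sanity check on the border index mapping above, the following standalone sketch (plain C++, independent of the aicpu runtime; PadIndex is an illustrative helper that inlines both EdgeIndexCalculate and ReflectIndexCalculate for non-negative pads) pads a four-element row by two on each side:

#include <cstdint>
#include <iostream>
#include <vector>

// Maps an output position to a source index, assuming pad >= 0
// (so i_start == 0 and o_start == pad in the kernel's terms).
int64_t PadIndex(bool reflect, int64_t pad, int64_t now, int64_t input_len) {
  int64_t ip;
  if (now < pad) {
    ip = reflect ? pad + pad - now : pad;
  } else if (now < input_len + pad) {
    ip = now;
  } else {
    ip = reflect ? 2 * (input_len + pad - 1) - now : input_len + pad - 1;
  }
  return ip - pad;
}

int main() {
  const std::vector<int> row = {1, 2, 3, 4};
  const int64_t pad = 2;
  const int64_t len = static_cast<int64_t>(row.size());
  for (bool reflect : {false, true}) {
    for (int64_t j = 0; j < len + 2 * pad; ++j) {
      std::cout << row[PadIndex(reflect, pad, j, len)] << ' ';
    }
    std::cout << '\n';  // edge:    1 1 1 2 3 4 4 4
  }                     // reflect: 3 2 1 2 3 4 3 2
  return 0;
}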

View File

@@ -0,0 +1,89 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_H_
#include <memory>
#include <utility>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/bcast.h"
namespace aicpu {
class PadV3CpuKernel : public CpuKernel {
public:
PadV3CpuKernel() = default;
~PadV3CpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
std::vector<int64_t> paddings;
std::vector<int64_t> input_shape;
std::vector<int64_t> output_shape;
std::string mode;
bool paddings_contiguous{true};
int64_t input_dims{0};
int64_t paddings_num{0};
int64_t parallelSliceNum{1};
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
template <typename T>
uint32_t GetPaddingsAndSetOutputShape(CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t EdgeModeCompute(CpuKernelContext &ctx, int64_t p);
template <typename T>
uint32_t EdgeCompute3D(T *input, T *output, int64_t p);
template <typename T>
uint32_t EdgeCompute2D(T *input, T *output, int64_t p);
template <typename T>
uint32_t EdgeCompute1D(T *input, T *output, int64_t p);
int64_t EdgeIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t ReflectModeCompute(CpuKernelContext &ctx, int64_t p);
template <typename T>
uint32_t ReflectCompute3D(T *input, T *output, int64_t p);
template <typename T>
uint32_t ReflectCompute2D(T *input, T *output, int64_t p);
template <typename T>
uint32_t ReflectCompute1D(T *input, T *output, int64_t p);
int64_t ReflectIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t ConstantModeCompute(CpuKernelContext &ctx, T constant_values);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_PAD_V3_H_

View File

@@ -0,0 +1,367 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pad_v3_grad.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <vector>
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kPadV3Grad = "PadV3Grad";
constexpr uint32_t kInputNum = 2;
constexpr uint32_t kOutputNum = 1;
constexpr int64_t kParallelNum = 1024 * 64;
const int64_t k3DNum = 6;
const int64_t k2DNum = 4;
const int64_t k1DNum = 2;
constexpr int64_t kpad_l = 0;
constexpr int64_t kpad_t = 2;
constexpr int64_t kpad_f = 4;
constexpr int64_t kwidth = 1;
constexpr int64_t kheight = 2;
constexpr int64_t kchannel = 3;
constexpr int64_t kInput1Dim = 3;
constexpr int64_t kInput2Dim = 4;
constexpr int64_t kInput3Dim = 5;
constexpr int64_t k2Num = 2;
constexpr int64_t k3Num = 3;
constexpr int64_t k4Num = 4;
const std::vector<std::string> mode_list = {"reflect", "edge"};
using float16 = Eigen::half;
#define PAD_V3_GRAD_READ_PADDINGS(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result1 = PadV3ReadPaddingsAndSetOutputShape1<TYPE>(CTX); \
uint32_t result2 = PadV3ReadPaddingsAndSetOutputShape2<TYPE>(CTX); \
if (result1 != KERNEL_STATUS_OK || result2 != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3Grad kernel read paddings failed."); \
return (result1 != KERNEL_STATUS_OK) ? result1 : result2; \
} \
break; \
}
#define PAD_V3_GRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = PadV3GradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3Grad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PadV3GradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(PadV3GradCheck(ctx), "PadV3Grad check params failed.");
auto paddings_type = ctx.Input(1)->GetDataType();
switch (paddings_type) {
PAD_V3_GRAD_READ_PADDINGS(DT_INT32, int32_t, ctx)
PAD_V3_GRAD_READ_PADDINGS(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("PadV3Grad paddings data type [%s] not supported.", DTypeStr(paddings_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
PAD_V3_GRAD_COMPUTE_CASE(DT_INT8, int8_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT16, int16_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT64, int64_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("PadV3Grad kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t PadV3GradCpuKernel::PadV3GradCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PadV3Grad check failed.");
if (ctx.GetAttr("paddings_contiguous") == nullptr) {
padding_contiguous = true;
KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
} else {
padding_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
}
if (ctx.GetAttr("mode") == nullptr) {
mode = "reflect";
KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [reflect]");
} else {
mode = ctx.GetAttr("mode")->GetString();
const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
if (!is_mode_available) {
KERNEL_LOG_ERROR("Attr [mode] must be included in [reflect, edge], but got [%s]", mode.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("Tensor y dtype [%s] must be the same as x dtype [%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(
paddings_shape.size() == 1 && (paddings_shape[0] == k3DNum + k4Num || paddings_shape[0] == k2DNum + k4Num ||
paddings_shape[0] == k1DNum + k4Num || paddings_shape[0] == 1),
KERNEL_STATUS_PARAM_INVALID, "Paddings shape is not supported");
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() >= kInput1Dim, KERNEL_STATUS_PARAM_INVALID,
"Dims of tensor x should be greater than or equal to 3");
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() <= kInput3Dim, KERNEL_STATUS_PARAM_INVALID,
"Only 3D, 4D and 5D padding with non-constant mode is "
"supported for now");
const int64_t input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
const int64_t num_elem = ctx.Input(1)->NumElements();
KERNEL_CHECK_FALSE(num_elem % k2Num == 0 || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"Padding length must be divisible by 2");
if (input_dim == kInput1Dim) {
KERNEL_CHECK_FALSE(num_elem == k1DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"3D tensors expect 6 values for padding");
} else if (input_dim == kInput2Dim) {
KERNEL_CHECK_FALSE(num_elem == k2DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"4D tensors expect 8 values for padding");
} else if (input_dim == kInput3Dim) {
KERNEL_CHECK_FALSE(num_elem == k3DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"5D tensors expect 10 values for padding");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx) {
num_elem = ctx.Input(1)->NumElements();
input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
paddings = std::vector<int64_t>(input_dim * k2Num, 0);
for (int64_t i = 0; i < num_elem; i += k2Num) {
paddings[i] = static_cast<int64_t>(paddings_ptr[num_elem - i - k2Num]);
paddings[i + 1] = static_cast<int64_t>(paddings_ptr[num_elem - i - 1]);
}
num_elem = num_elem - k4Num;
if (num_elem == 1) {
num_elem = k2Num * (input_dim - k2Num);
for (int64_t i = 0; i < k2Num * (input_dim - k2Num); ++i) {
paddings[i] = static_cast<int64_t>(paddings_ptr[0]);
}
}
parallelSliceNum = 1;
for (int64_t i = 0; i < input_dim - num_elem / k2Num; i++) {
parallelSliceNum *= input_shape[i];
}
if (!padding_contiguous && num_elem == k3DNum) {
std::vector<int64_t> tmp = paddings;
paddings[1] = tmp[k3Num];
paddings[k2Num] = tmp[1];
paddings[k3Num] = tmp[k4Num];
paddings[k4Num] = tmp[k2Num];
}
if (!padding_contiguous && num_elem == k2DNum) {
std::vector<int64_t> tmp = paddings;
paddings[1] = tmp[k2Num];
paddings[k2Num] = tmp[1];
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx) {
std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output_shape.end()[-kwidth] -= (paddings[kpad_l] + paddings[kpad_l + 1]);
output_shape.end()[-kheight] -= (paddings[kpad_t] + paddings[kpad_t + 1]);
output_shape.end()[-kchannel] -= (paddings[kpad_f] + paddings[kpad_f + 1]);
KERNEL_CHECK_FALSE(
output_shape.end()[-kwidth] > 0 && output_shape.end()[-kheight] > 0 && output_shape.end()[-kchannel] > 0,
KERNEL_STATUS_PARAM_INVALID, "Each dimension of output_shape must be greater than 0");
if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
KERNEL_LOG_DEBUG("Output tensor shape set successfully, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
} else {
KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
}
const std::string padding_contiguous_str = padding_contiguous ? std::string("True") : std::string("False");
KERNEL_LOG_DEBUG(
"PadV3GradCpuKernel[%s], x: size[%llu] dtype[%s], "
"paddings: size[%llu] dtype[%s], y: size[%llu] dtype[%s], mode: [%s], "
"padding_contiguous: [%s].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), DTypeStr(ctx.Input(0)->GetDataType()).c_str(),
ctx.Input(1)->GetDataSize(), DTypeStr(ctx.Input(1)->GetDataType()).c_str(), ctx.Output(0)->GetDataSize(),
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), mode.c_str(), padding_contiguous_str.c_str());
return KERNEL_STATUS_OK;
}
int64_t PadV3GradCpuKernel::IndexCalculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start,
int64_t i_start) {
int64_t ip = 0;
if (now < pad_value) {
if (mode == "reflect") {
ip = pad_value + pad_value - now;
} else if (mode == "edge") {
ip = pad_value;
}
} else if (now >= pad_value && now < output_value + pad_value) {
ip = now;
} else {
if (mode == "reflect") {
ip = (output_value + pad_value - 1) + (output_value + pad_value - 1) - now;
} else if (mode == "edge") {
ip = output_value + pad_value - 1;
}
}
ip = ip - o_start + i_start;
return ip;
}
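// This is the same border mapping as the forward pass; because several padded
// positions can resolve to one unpadded index (e.g. reflect with pad_value = 2
// and output_value = 4 maps positions 0..7 to 2 1 0 1 2 3 2 1), the callers
// below accumulate gradients with += rather than plain assignment.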
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1(T *input, T *output, int64_t p) {
if (num_elem == k1DNum) {
PadV3GradCompute1D<T>(input, output, p);
} else if (num_elem == k2DNum) {
for (int i = 0; i < input_h; i++) {
PadV3GradCompute2D<T>(input, output, p, i);
}
} else if (num_elem == k3DNum) {
for (int z = 0; z < input_c; z++) {
PadV3GradCompute3D<T>(input, output, p, z);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1D(T *input, T *output, int64_t p) {
int64_t ip_x;
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
T *src_p = input + p * input_w + j;
T *dest_p = output + p * output_w + ip_x;
*dest_p += *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i) {
int64_t ip_x, ip_y;
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
ip_y = IndexCalculate(pad_t, i, output_h, o_start_y, i_start_y);
T *src_p = input + p * input_w * input_h + i * input_w + j;
T *dest_p = output + p * output_w * output_h + ip_y * output_w + ip_x;
*dest_p += *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z) {
int64_t ip_x, ip_y, ip_z;
for (int i = 0; i < input_h; i++) {
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
ip_y = IndexCalculate(pad_t, i, output_h, o_start_y, i_start_y);
ip_z = IndexCalculate(pad_f, z, output_c, o_start_z, i_start_z);
T *src_p = input + p * input_w * input_h * input_c + z * input_w * input_h + i * input_w + j;
T *dest_p = output + p * output_w * output_h * output_c + ip_z * output_w * output_h + ip_y * output_w + ip_x;
*dest_p += *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute(CpuKernelContext &ctx) {
const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_shape = ctx.Output(0)->GetTensorShape()->GetDimSizes();
T *input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
output_w = output_shape.end()[-kwidth];
output_h = output_shape.end()[-kheight];
output_c = output_shape.end()[-kchannel];
input_w = input_shape.end()[-kwidth];
input_h = input_shape.end()[-kheight];
input_c = input_shape.end()[-kchannel];
i_start_x = std::max(int64_t(0), -paddings[kpad_l]);
i_start_y = std::max(int64_t(0), -paddings[kpad_t]);
i_start_z = std::max(int64_t(0), -paddings[kpad_f]);
o_start_x = std::max(int64_t(0), paddings[kpad_l]);
o_start_y = std::max(int64_t(0), paddings[kpad_t]);
o_start_z = std::max(int64_t(0), paddings[kpad_f]);
pad_l = paddings[kpad_l];
pad_t = paddings[kpad_t];
pad_f = paddings[kpad_f];
int64_t output_num_ = 1;
for (int64_t i = 0; i < input_dim; i++) {
output_num_ *= output_shape[i];
}
auto ret = memset_s(output, sizeof(T) * output_num_, 0, sizeof(T) * output_num_);
if (ret != EOK) {
KERNEL_LOG_ERROR("memset_s error, ret=%d", ret);
return KERNEL_STATUS_INNER_ERROR;
}
auto shard_padv3_grad = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
PadV3GradCompute1<T>(input, output, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = parallelSliceNum > kParallelNum;
if (enable_parallel) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_padv3_grad),
"PadV3Grad Compute failed.");
} else {
for (int p = 0; p < data_num; p++) {
PadV3GradCompute1<T>(input, output, p);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPadV3Grad, PadV3GradCpuKernel);
} // namespace aicpu
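
The grad kernels scatter with += because several positions of the padded gradient can map to the same unpadded element. A minimal 1D illustration (plain C++; assumes reflect mode, a pad of two per side and an all-ones upstream gradient):

#include <iostream>
#include <vector>

int main() {
  // Forward reflect padding of {a, b, c, d} by 2 per side reads source
  // indices {2, 1, 0, 1, 2, 3, 2, 1}. With an all-ones upstream gradient,
  // the backward pass accumulates one unit per read of each input element.
  const std::vector<int> src_index = {2, 1, 0, 1, 2, 3, 2, 1};
  std::vector<int> grad(4, 0);
  for (int idx : src_index) grad[idx] += 1;  // mirrors *dest_p += *src_p above
  for (int g : grad) std::cout << g << ' ';  // prints: 1 3 3 1
  std::cout << '\n';
  return 0;
}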

View File

@@ -0,0 +1,81 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
#include <vector>
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class PadV3GradCpuKernel : public CpuKernel {
public:
PadV3GradCpuKernel() = default;
~PadV3GradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
bool padding_contiguous = true;
std::string mode = "reflect";
std::vector<int64_t> paddings;
int64_t output_w;
int64_t output_h;
int64_t output_c;
int64_t input_w;
int64_t input_h;
int64_t input_c;
int64_t i_start_x;
int64_t i_start_y;
int64_t i_start_z;
int64_t o_start_x;
int64_t o_start_y;
int64_t o_start_z;
int64_t pad_l;
int64_t pad_t;
int64_t pad_f;
int64_t parallelSliceNum;
int64_t num_elem;
int64_t input_dim;
uint32_t PadV3GradCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3GradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3GradCompute1D(T *input, T *output, int64_t p);
template <typename T>
uint32_t PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i);
template <typename T>
uint32_t PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z);
template <typename T>
uint32_t PadV3GradCompute1(T *input, T *output, int64_t p);
int64_t IndexCalculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_

View File

@@ -75,8 +75,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kACosGradOpName,
mindspore::kAcoshGradOpName,
@@ -244,7 +242,11 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kLuUnpackOpName,
mindspore::kLuUnpackGradOpName,
mindspore::kMatMulOpName,
mindspore::kMatrixExpOpName};
mindspore::kMatrixExpOpName,
mindspore::kPadV3GradOpName,
mindspore::kPadV3OpName,
mindspore::kLogicalXorOpName,
mindspore::kLogNormalReverseOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";

View File

@@ -340,3 +340,5 @@ from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu
from .pad_v3_grad import _pad_v3_grad_aicpu
from .pad_v3 import _pad_v3_aicpu

View File

@@ -765,8 +765,12 @@ def resize_nearest_neighbor(input_x, size, align_corners=False):
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
>>> output = ops.ResizeNearestNeighbor(input_tensor, (2, 2))
>>> size = (2, 2)
>>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
>>> print(output)
[[[[-0.1 0.3]
[ 0.4 0.5]]]]

View File

@@ -3744,9 +3744,12 @@ class ResizeNearestNeighbor(Primitive):
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
>>> resize = ops.ResizeNearestNeighbor((2, 2))
>>> output = resize(input_tensor)
>>> size = (2, 2)
>>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
>>> print(output)
[[[[-0.1 0.3]
[ 0.4 0.5]]]]