!48016 second half of 0103 aicpu migration without IsInf

Merge pull request !48016 from 李林杰/0118_second_half_0103_aicpu_migration_fix_test_conj
This commit is contained in:
i-robot 2023-01-18 08:23:36 +00:00 committed by Gitee
commit 08aa1515d3
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
36 changed files with 4436 additions and 4 deletions

View File

@@ -0,0 +1,228 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "hypot.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHypot = "Hypot";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define HYPOT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = HypotCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Hypot kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
template <typename T>
T hypot(T a, T b) {
return std::hypot(a, b);
}
uint32_t HypotCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Hypot check input and output number failed.");
KERNEL_HANDLE_ERROR(HypotParamCheck(ctx), "Hypot check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
HYPOT_COMPUTE_CASE(DT_FLOAT, float_t, ctx)
HYPOT_COMPUTE_CASE(DT_DOUBLE, double_t, ctx)
default:
KERNEL_LOG_ERROR("Hypot kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t HypotCpuKernel::HypotParamCheck(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"HypotCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type;
if (in0_elements_nums == in1_elements_nums) {
type = BcastShapeType::SAME_SHAPE;
} else {
type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
}
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_hypot = [&](int64_t start, int64_t end) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
"Hypot Compute failed.");
} else {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_hypot = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
"Hypot Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::HypotCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kHypot, HypotCpuKernel);
} // namespace aicpu
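
For reference, the elementwise behaviour HypotCpuKernel implements reduces to std::hypot with optional one-element broadcasting; a framework-free sketch under that reading (illustrative names, not part of the change):

#include <cmath>
#include <cstdio>
#include <vector>

// Same-shape inputs are combined index by index; a one-element input is
// broadcast against the other, mirroring the SAME_SHAPE / X_ONE_ELEMENT /
// Y_ONE_ELEMENT branches above.
std::vector<double> HypotReference(const std::vector<double> &x, const std::vector<double> &y) {
  const bool x_scalar = (x.size() == 1);
  const std::size_t n = x_scalar ? y.size() : x.size();
  std::vector<double> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::hypot(x_scalar ? x[0] : x[i], y.size() == 1 ? y[0] : y[i]);
  }
  return out;
}

int main() {
  auto same = HypotReference({3.0, 5.0}, {4.0, 12.0});  // {5, 13}
  auto bcast = HypotReference({3.0}, {4.0, 0.0});       // {5, 3}
  std::printf("%.1f %.1f %.1f %.1f\n", same[0], same[1], bcast[0], bcast[1]);
  return 0;
}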

View File

@@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_HYPOT_H_
#define AICPU_KERNELS_NORMALIZED_HYPOT_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HypotCpuKernel : public CpuKernel {
public:
HypotCpuKernel() = default;
~HypotCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t HypotParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t HypotCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,81 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "identityn.h"
#include <algorithm>
#include <vector>
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const char *kIdentityN = "IdentityN";
} // namespace
namespace aicpu {
uint32_t IdentityNCpuKernel::IdentityNParamCheck(CpuKernelContext &ctx) {
// input size and output size check
uint32_t input_size = ctx.GetInputsSize();
uint32_t output_size = ctx.GetOutputsSize();
KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
"Input size should equal to Output size.");
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_size, output_size), "[%s] check params failed.", kIdentityN);
for (uint32_t idx = 0; idx < input_size; ++idx) {
Tensor *in_tensor = ctx.Input(idx);
Tensor *out_tensor = ctx.Output(idx);
// TensorShape check
auto in_shape = in_tensor->GetTensorShape();
auto out_shape = out_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((in_shape->GetDimSizes() == out_shape->GetDimSizes()), KERNEL_STATUS_PARAM_INVALID,
"In tensor shape should equal to out tensor shape.");
// DataType Check
DataType in_type = in_tensor->GetDataType();
DataType out_type = out_tensor->GetDataType();
KERNEL_CHECK_FALSE((in_type == out_type), KERNEL_STATUS_PARAM_INVALID,
"In tensor data type should equal to out tensor data type.");
bool type_support =
std::find(support_data_type.begin(), support_data_type.end(), in_type) != support_data_type.end();
KERNEL_CHECK_FALSE(type_support, KERNEL_STATUS_PARAM_INVALID, "IdentityN kernel data type [%s] not support.",
DTypeStr(in_type).c_str());
}
return KERNEL_STATUS_OK;
}
uint32_t IdentityNCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(IdentityNParamCheck(ctx), "IdentityNCpuKernel check params failed");
uint32_t input_size = ctx.GetInputsSize();
for (uint32_t idx = 0; idx < input_size; ++idx) {
Tensor *in_tensor = ctx.Input(idx);
Tensor *out_tensor = ctx.Output(idx);
auto in_data = in_tensor->GetData();
auto out_data = out_tensor->GetData();
uint64_t in_size = in_tensor->GetDataSize();
uint64_t out_size = out_tensor->GetDataSize();
// memory copy
if (out_data != in_data) {
int cpret = memcpy_s(out_data, out_size, in_data, in_size);
KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR,
"[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kIdentityN, out_size, in_size);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIdentityN, IdentityNCpuKernel);
} // namespace aicpu
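
IdentityN forwards each input tensor unchanged to the matching output; a minimal framework-free sketch of that copy semantics (illustrative names, not part of the change):

#include <cstring>
#include <vector>

// Each output is a byte-for-byte copy of the matching input; empty or aliased
// buffers are left untouched, mirroring the out_data != in_data guard above.
void IdentityNReference(const std::vector<std::vector<float>> &inputs,
                        std::vector<std::vector<float>> &outputs) {
  outputs.resize(inputs.size());
  for (std::size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].empty() || outputs[i].data() == inputs[i].data()) {
      continue;  // nothing to copy, or input and output already alias
    }
    outputs[i].resize(inputs[i].size());
    std::memcpy(outputs[i].data(), inputs[i].data(), inputs[i].size() * sizeof(float));
  }
}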

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#define AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class IdentityNCpuKernel : public CpuKernel {
public:
IdentityNCpuKernel() = default;
~IdentityNCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t IdentityNParamCheck(CpuKernelContext &ctx);
const std::vector<DataType> support_data_type = {DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE};
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,230 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "index_fill.h"
#include <securec.h>
#include <map>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kNumInput = 4;
const uint32_t kNumOutput = 1;
const char *kIndexFill = "IndexFill";
// when input data size is more than kParallelDataNum, use Parallel func
const uint32_t kParallelDataNum = 16 * 1024;
const uint32_t kParallelDataNumMid = 128 * 1024;
#define INDEXFILL_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("IndexFill kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t IndexFillCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "IndexFill check input and output number failed.");
// get input Tensors
for (uint32_t i = 0; i < kNumInput; ++i) {
Tensor *tensor = ctx.Input(i);
inputs_.push_back(tensor);
}
// get output Tensors
Tensor *tensor = ctx.Output(0);
outputs_.push_back(tensor);
int32_t value_dim = inputs_[3]->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((value_dim == 0), KERNEL_STATUS_INNER_ERROR,
"IndexFill only supports a 0-dimensional value tensor, "
"but got tensor with [%d] dimension(s).",
value_dim)
DataType dim_type = inputs_[1]->GetDataType();
DataType index_type = inputs_[2]->GetDataType();
if (dim_type != DT_INT32) {
KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for dim.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (index_type != DT_INT32) {
KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for index.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
void IndexFillCpuKernel::SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim,
std::map<int32_t, bool> &index_dict) {
auto *input_x = reinterpret_cast<T *>(inputs_[0]->GetData());
auto *input_value = reinterpret_cast<T *>(inputs_[3]->GetData());
auto *output_y = reinterpret_cast<T *>(outputs_[0]->GetData());
int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();
int32_t dim_flag;
if (x_dim_nums != 0) {
dim_flag = *input_dim % x_dim_nums + 1;
} else {
dim_flag = 0;
}
int32_t remain_dims = 1;
if (dim_flag == x_dim_nums) {
if (dim_flag != 0) {
remain_dims = x_dims[*input_dim];
}
for (int64_t i = start; i < end; i++) {
int32_t index_flag = i % remain_dims;
std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
if (f != index_dict.end()) {
output_y[i] = *input_value;
} else {
output_y[i] = input_x[i];
}
}
} else {
for (int32_t i = *input_dim + 1; i < x_dim_nums; i++) {
remain_dims *= x_dims[i];
}
for (int64_t i = start; i < end; i++) {
int32_t index_flag = (i / remain_dims) % x_dims[*input_dim];
std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
if (f != index_dict.end()) {
output_y[i] = *input_value;
} else {
output_y[i] = input_x[i];
}
}
}
}
template <typename T>
uint32_t IndexFillCpuKernel::DoCompute(CpuKernelContext &ctx) {
int32_t *input_1 = reinterpret_cast<int32_t *>(inputs_[1]->GetData());
int32_t *input_2 = reinterpret_cast<int32_t *>(inputs_[2]->GetData());
int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
int32_t dim_nums = inputs_[1]->GetTensorShape()->GetDims();
int32_t index_dim_nums = inputs_[2]->GetTensorShape()->GetDims();
auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();
uint32_t data_num = outputs_[0]->NumElements();
int64_t index_num = inputs_[2]->GetTensorShape()->NumElements();
KERNEL_CHECK_FALSE(dim_nums == 0, KERNEL_STATUS_PARAM_INVALID, "Dim has to be a scalar.")
KERNEL_CHECK_FALSE(index_dim_nums <= 1, KERNEL_STATUS_PARAM_INVALID, "Index has to be a vector/scalar.")
int32_t cur_dim = *input_1;
if (*input_1 < 0) {
*input_1 = *input_1 + x_dim_nums;
}
std::map<int32_t, bool> index_dict;
if (x_dim_nums == 0) {
for (int32_t i = 0; i < index_num; i++) {
if (input_2[i] < -1 || input_2[i] > 0) {
KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
index_dict.insert(std::pair<int32_t, bool>(0, true));
}
}
} else if (cur_dim < -x_dim_nums || cur_dim >= x_dim_nums) {
KERNEL_LOG_ERROR(
"Dimension out of range (expected to be in range of "
"[%d, %d], but got %d).",
0 - x_dim_nums, x_dim_nums - 1, cur_dim);
return KERNEL_STATUS_PARAM_INVALID;
} else {
for (int32_t i = 0; i < index_num; i++) {
if (input_2[i] < -x_dims[*input_1] || input_2[i] >= x_dims[*input_1]) {
KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
input_2[i] = (input_2[i] < 0) ? (input_2[i] + x_dims[*input_1]) : input_2[i];
index_dict.insert(std::pair<int32_t, bool>(input_2[i], true));
}
}
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("The number of available CPU cores must be greater than 0!");
return KERNEL_STATUS_INNER_ERROR;
}
auto sharder_index_fill = [&](int64_t start, int64_t end) { SpecialCompute<T>(start, end, input_1, index_dict); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_index_fill),
"IndexFill Compute failed.");
} else {
SpecialCompute<T>(0, data_num, input_1, index_dict);
}
return KERNEL_STATUS_OK;
}
uint32_t IndexFillCpuKernel::Compute(CpuKernelContext &ctx) {
uint32_t res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
INDEXFILL_COMPUTE_CASE(DT_INT8, int8_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT16, int16_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT32, int32_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT64, int64_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
INDEXFILL_COMPUTE_CASE(DT_FLOAT, float, ctx)
INDEXFILL_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIndexFill, IndexFillCpuKernel);
} // namespace aicpu
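
The coordinate recovery in SpecialCompute, (i / remain_dims) % dims[dim] with remain_dims the product of the trailing dimensions, assumes a row-major flat layout; a framework-free reference of the same fill rule (illustrative names, not part of the change):

#include <cstdint>
#include <set>
#include <vector>

// Positions whose coordinate along `dim` (assumed already normalized to
// [0, dims.size())) is listed in `indices` are overwritten with `value`;
// everything else is passed through unchanged.
std::vector<float> IndexFillReference(std::vector<float> x, const std::vector<int64_t> &dims,
                                      int32_t dim, const std::set<int64_t> &indices, float value) {
  int64_t remain_dims = 1;
  for (std::size_t d = dim + 1; d < dims.size(); ++d) {
    remain_dims *= dims[d];
  }
  for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
    int64_t coord = (i / remain_dims) % dims[dim];
    if (indices.count(coord) != 0) {
      x[i] = value;
    }
  }
  return x;
}

// Example: a 2x3 tensor filled along dim = 1 at indices {0, 2}:
// IndexFillReference({1, 2, 3, 4, 5, 6}, {2, 3}, 1, {0, 2}, -1.0f)
//   -> {-1, 2, -1, -1, 5, -1}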

View File

@@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#define AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#include <map>
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class IndexFillCpuKernel : public CpuKernel {
public:
~IndexFillCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, std::map<int32_t, bool> &index_dict);
std::vector<Tensor *> inputs_;
std::vector<Tensor *> outputs_;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,185 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kldiv.h"
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kKLDivInputNum{2};
const std::uint32_t kKLDivOutputNum{1};
const std::int64_t ParallelNum{4096};
const char *kKLDiv{"KLDiv"};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeKLDivKernel(const CpuKernelContext &ctx) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto target = static_cast<T *>(ctx.Input(1)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
std::int64_t total = ctx.Input(0)->NumElements();
std::size_t data_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
std::string reduction = ctx.GetAttr("reduction")->GetString();
if (reduction != "sum" && reduction != "batchmean" && reduction != "none" && reduction != "mean") {
KERNEL_LOG_ERROR("%s is not a valid value for reduction", reduction.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
bool parallel_flag = false;
if (data_size > ParallelNum * sizeof(T)) {
parallel_flag = true;
}
if (cores == 0) {
KERNEL_LOG_ERROR("KLDiv: the number of available CPU cores must be greater than 0.");
return KERNEL_STATUS_INNER_ERROR;
}
T *tmp_array = nullptr;
if (reduction == "none") {
tmp_array = output;
} else {
tmp_array = new T[total];
}
if (parallel_flag) {
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::int64_t length = end - begin;
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array + begin, length, 1);
T constant_zero{0};
array_reduce = array_target * (Eigen::log(array_target) - array_input);
for (std::int64_t idx = 0; idx < length; ++idx) {
if (!(target[begin + idx] > constant_zero)) {
array_reduce(idx) = constant_zero;
}
}
});
} else {
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array, total, 1);
array_reduce = array_target * (Eigen::log(array_target) - array_input);
T constant_zero{0};
for (std::int64_t idx = 0; idx < total; ++idx) {
if (!(target[idx] > constant_zero)) {
array_reduce(idx) = constant_zero;
}
}
}
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > reduce(tmp_array, total, 1);
if (reduction == "sum") {
output[0] = reduce.sum();
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output[0] = reduce.sum() / T(input_dims[0]);
} else if (reduction == "mean") {
output[0] = reduce.mean();
}
if (reduction != "none") {
delete[] tmp_array;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeKLDiv(const CpuKernelContext &ctx) {
uint32_t result = ComputeKLDivKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("KLDiv compute failed.");
}
return result;
}
inline std::uint32_t KLDivExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input x data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(1)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input target data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output y data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Input(1)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the target "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> target_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != target_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim size of the input x [%llu] need be the same as the "
"target "
"[%llu].",
input_dims.size(), target_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != target_dims[index]) {
KERNEL_LOG_ERROR("The data dim of the input x need be the same as the target.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t KLDivCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
return NormalCheck(ctx, kKLDivInputNum, kKLDivOutputNum, {"reduction"}) ? KERNEL_STATUS_PARAM_INVALID
: KLDivExtraCheck(ctx);
}
// DT_FLOAT16, DT_FLOAT, DT_DOUBLE
std::uint32_t KLDivCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeKLDiv<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeKLDiv<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeKLDiv<std::double_t>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t KLDivCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::KLDivCheck(ctx, kKLDivInputNum, kKLDivOutputNum) ? KERNEL_STATUS_PARAM_INVALID
: detail::KLDivCompute(ctx);
}
REGISTER_CPU_KERNEL(kKLDiv, KLDivCpuKernel);
} // namespace aicpu
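
The kernel evaluates the pointwise term target * (log(target) - input), zeroing it wherever target <= 0, and then applies the requested reduction; a framework-free reference of that math (illustrative names, not part of the change):

#include <cmath>
#include <cstdint>
#include <string>
#include <vector>

// Reduces the pointwise KLDiv terms the way the kernel above does:
// "sum" adds them up, "mean" divides by the element count, and "batchmean"
// divides by the leading (batch) dimension; "none" would keep the raw terms.
double KLDivReference(const std::vector<double> &input, const std::vector<double> &target,
                      const std::string &reduction, int64_t batch) {
  double sum = 0.0;
  for (std::size_t i = 0; i < input.size(); ++i) {
    sum += target[i] > 0.0 ? target[i] * (std::log(target[i]) - input[i]) : 0.0;
  }
  if (reduction == "mean") return sum / static_cast<double>(input.size());
  if (reduction == "batchmean") return sum / static_cast<double>(batch);
  return sum;  // "sum"
}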

View File

@@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_KLDIV_H_
#define AICPU_KERNELS_NORMALIZED_KLDIV_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class KLDivCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,226 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kldivlossgrad.h"
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kKlDivLossGrad = "KlDivLossGrad";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const uint32_t kGradIndex = 0;
const uint32_t kInputIndex = 1;
const uint32_t kTargetIndex = 2;
const std::string AttrReduction = "reduction";
const std::string AttrLog = "log_target";
const int64_t DataDefaultParallelNum = 16384;
} // namespace
namespace aicpu {
template <typename T>
void KlDivLossGradOp(Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &target,
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &grad,
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &output, std::int64_t &len, bool &log_target,
std::string &reduction) {
T constant_zero{0};
if (log_target) {
output = -Eigen::exp(target) * grad;
return;
}
if (reduction == "none") {
for (std::int64_t idx = 0; idx < len; ++idx) {
if (target(idx) > constant_zero) {
output(idx) = -target(idx) * grad(idx);
}
}
} else {
for (std::int64_t idx = 0; idx < len; ++idx) {
if (target(idx) > constant_zero) {
output(idx) = -target(idx) * grad(0);
}
}
}
return;
}
std::uint32_t KlDivLossGradExtraCheck(CpuKernelContext &ctx) {
Tensor *grad = ctx.Input(0);
Tensor *input = ctx.Input(1);
Tensor *target = ctx.Input(2);
Tensor *output = ctx.Output(0);
if (grad->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] grad is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (target->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] target is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (output->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] output is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input->GetDataType() != grad->GetDataType()) || (target->GetDataType() != grad->GetDataType()) ||
(output->GetDataType() != grad->GetDataType())) {
KERNEL_LOG_ERROR(
"The data type of the grad [%s], input [%s], target [%s], output y "
"[%s] must be the same type.",
DTypeStr(grad->GetDataType()).c_str(), DTypeStr(input->GetDataType()).c_str(),
DTypeStr(target->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> grad_dims = ctx.Input(kGradIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> input_dims = ctx.Input(kInputIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> target_dims = ctx.Input(kTargetIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
std::string reduction = ctx.GetAttr(AttrReduction)->GetString();
if (output_dims != input_dims) {
KERNEL_LOG_ERROR(
"The data shape of the output need be the same as the input. output "
"shape [%s], input shape [%s]",
VectorToString(output_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (target_dims != input_dims) {
KERNEL_LOG_ERROR(
"The data shape of the target need be the same as the input. target "
"shape [%s], input shape [%s]",
VectorToString(target_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (reduction == "mean" || reduction == "sum" || reduction == "batchmean") {
if (ctx.Input(0)->NumElements() != 1) {
KERNEL_LOG_ERROR("The data num of the grad [%llu] must be 1", ctx.Input(0)->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
} else if (reduction == "none") {
if (input_dims != grad_dims) {
KERNEL_LOG_ERROR(
"The data shape of the grad need be the same as the input. grad "
"shape "
"[%s], input shape [%s]",
VectorToString(grad_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
uint32_t KlDivLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (KlDivLossGradExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
// choose compute function depend on dataType
auto data_type = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
switch (data_type) {
case DT_FLOAT16:
return KlDivLossGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return KlDivLossGradCompute<float>(ctx);
case DT_DOUBLE:
return KlDivLossGradCompute<double>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename T>
uint32_t KlDivLossGradCpuKernel::KlDivLossGradCompute(CpuKernelContext &ctx) {
int64_t grad_total = ctx.Input(0)->NumElements();
int64_t input_total = ctx.Input(1)->NumElements();
int64_t target_total = ctx.Input(2)->NumElements();
int64_t output_y_total = ctx.Output(0)->NumElements();
int64_t total = input_total;
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
T *grad = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *input = reinterpret_cast<T *>(ctx.Input(1)->GetData());
T *target = reinterpret_cast<T *>(ctx.Input(2)->GetData());
T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
bool parallel_flag = false;
uint64_t data_size = ctx.Input(1)->GetDataSize();
// Determine whether to enable multi-core parallel computing
if (data_size > DataDefaultParallelNum * sizeof(T)) {
parallel_flag = true;
}
// Eigen::Array
bool log_target{false};
if (ctx.GetAttr(AttrLog) != nullptr) {
log_target = ctx.GetAttr(AttrLog)->GetBool();
}
std::string reduction{"mean"};
if (ctx.GetAttr(AttrReduction) != nullptr) {
reduction = ctx.GetAttr(AttrReduction)->GetString();
}
if (cores == 0) {
KERNEL_LOG_ERROR("KlDivLossGrad compute failed.");
return KERNEL_STATUS_INNER_ERROR;
}
if (parallel_flag) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
auto shard_kldivlossgrad = [&](std::int64_t begin, std::int64_t end) {
std::int64_t length = end - begin;
std::int64_t grad_begin{0}, grad_length{grad_total};
if (reduction == "none") {
grad_begin = begin;
grad_length = length;
}
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad + grad_begin, grad_length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output + begin, length, 1);
T constant_zero{0};
array_output = constant_zero;
KlDivLossGradOp<T>(array_target, array_grad, array_output, length, log_target, reduction);
if (reduction == "mean") {
array_output = array_output / T(output_y_total);
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
array_output = array_output / T(input_dims[0]);
}
};
KERNEL_HANDLE_ERROR(ParallelFor(ctx, total, per_unit_size, shard_kldivlossgrad), "KlDivLossGrad Compute failed.");
} else {
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad, grad_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, input_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, target_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output, output_y_total, 1);
T constant_zero{0};
array_output = constant_zero;
KlDivLossGradOp<T>(array_target, array_grad, array_output, output_y_total, log_target, reduction);
if (reduction == "mean") {
array_output = array_output / T(output_y_total);
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
array_output = array_output / T(input_dims[0]);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kKlDivLossGrad, KlDivLossGradCpuKernel);
} // namespace aicpu
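
The backward rule follows from the forward term: d/d(input) of target * (log(target) - input) is -target, so the input gradient is -target * grad, or -exp(target) * grad when the target is stored in log space, scaled afterwards by the reduction; a framework-free sketch (illustrative names, not part of the change):

#include <cmath>
#include <vector>

// grad holds a single scalar for reduced losses ("mean", "sum", "batchmean")
// and one value per element for "none"; mean_scale is 1/N for "mean",
// 1/batch for "batchmean" and 1 otherwise, matching the divisions above.
std::vector<double> KlDivLossGradReference(const std::vector<double> &grad,
                                           const std::vector<double> &target,
                                           bool log_target, bool reduced, double mean_scale) {
  std::vector<double> grad_input(target.size(), 0.0);
  for (std::size_t i = 0; i < target.size(); ++i) {
    const double g = reduced ? grad[0] : grad[i];
    const double factor = log_target ? std::exp(target[i]) : (target[i] > 0.0 ? target[i] : 0.0);
    grad_input[i] = -factor * g * mean_scale;
  }
  return grad_input;
}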

View File

@@ -0,0 +1,42 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class KlDivLossGradCpuKernel : public CpuKernel {
public:
KlDivLossGradCpuKernel() = default;
~KlDivLossGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t KlDivLossGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H

View File

@@ -0,0 +1,173 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lcm.h"
#include <cmath>
#include <set>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kLcmOutputNum = 1;
const uint32_t kLcmInputNum = 2;
const char *kLcm = "Lcm";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int32_t kInput_32_32 = 3;
const int32_t kInput_32_64 = 2;
const int32_t kInput_64_32 = 1;
const int32_t kInput_64_64 = 0;
} // namespace
namespace aicpu {
// Simple recursive gcd.
template <class T>
T elewise_gcd(T a, T b) {
if (b == 0) {
return a;
}
return elewise_gcd(b, a % b);
}
// Simple lcm.
template <typename T>
T elewise_lcm(T a, T b) {
T gcd_tmp = elewise_gcd<T>(a, b);
if (gcd_tmp == 0) {
return static_cast<T>(0);
}
return std::abs(a / gcd_tmp * b);
}
uint32_t LcmIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
auto x1_type = x1->GetDataType();
auto x2_type = x2->GetDataType();
auto y_type = y->GetDataType();
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
int32_t _dual_types = x1_is_i32 | x2_is_i32;
switch (_dual_types) {
case kInput_64_64:
case kInput_64_32:
case kInput_32_64:
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
case kInput_32_32:
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
default:
KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t LcmElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
auto lcm_shard = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
y_ptr[i] = elewise_lcm(x1_ele_abs, x2_ele_abs);
}
};
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("[Lcm] max_core_num is 0, please check the cpu num.");
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, lcm_shard);
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("[Lcm] Lcm Compute failed.");
return ret;
}
} else {
lcm_shard(0, data_num);
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t LcmCompute(CpuKernelContext &ctx) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
Bcast bcast(x1_shape, x2_shape);
if (bcast.IsValid()) {
return LcmElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
} else {
KERNEL_LOG_ERROR("[Lcm] broadcast failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t LcmCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLcmInputNum, kLcmOutputNum), "[Lcm] check input and output number failed.");
int32_t dual_types = static_cast<int32_t>(-1);
KERNEL_HANDLE_ERROR(LcmIOTypeCheck(ctx, dual_types), "[Lcm] check data type failed.");
switch (dual_types) {
case kInput_64_64:
return LcmCompute<int64_t, int64_t, int64_t>(ctx);
break;
case kInput_64_32:
return LcmCompute<int64_t, int32_t, int64_t>(ctx);
break;
case kInput_32_64:
return LcmCompute<int32_t, int64_t, int64_t>(ctx);
break;
case kInput_32_32:
return LcmCompute<int32_t, int32_t, int32_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLcm, LcmCpuKernel);
} // namespace aicpu
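
The elementwise result is |a / gcd(a, b) * b|, with the result defined as 0 whenever gcd(a, b) is 0; a standalone check of that identity against std::gcd from <numeric> (C++17; illustrative names, not part of the change):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <numeric>

// Same identity as elewise_lcm above: divide before multiplying to reduce the
// chance of intermediate overflow, and take the absolute value at the end.
int64_t LcmReference(int64_t a, int64_t b) {
  const int64_t g = std::gcd(a, b);
  return g == 0 ? 0 : std::abs(a / g * b);
}

int main() {
  std::printf("%lld %lld %lld\n",
              static_cast<long long>(LcmReference(4, 6)),    // 12
              static_cast<long long>(LcmReference(-4, 6)),   // 12
              static_cast<long long>(LcmReference(0, 7)));   // 0
  return 0;
}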

View File

@@ -0,0 +1,32 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LCM_H_
#define AICPU_KERNELS_NORMALIZED_LCM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LcmCpuKernel : public CpuKernel {
public:
~LcmCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,126 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logit.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cmath"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogit = "Logit";
#define LOGIT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogitCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Logit kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogitCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogit);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOGIT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
LOGIT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOGIT_COMPUTE_CASE(DT_FLOAT, float, ctx)
default:
KERNEL_LOG_ERROR("Logit kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogitCpuKernel::LogitCompute(CpuKernelContext &ctx) {
auto input_tensor = ctx.Input(0);
auto output_tensor = ctx.Output(0);
auto input = reinterpret_cast<T *>(input_tensor->GetData());
auto output = reinterpret_cast<T *>(output_tensor->GetData());
AttrValue *attr = ctx.GetAttr("eps");
float eps = -1.0;
if (attr != nullptr) {
eps = attr->GetFloat();
}
auto input_shape = input_tensor->GetTensorShape();
int64_t data_num = input_shape->NumElements();
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_less = [&](size_t start, size_t end) {
T one = T(1);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (size_t i = start; i < end; i++) {
T x = input[i];
output[i] = log(x / (one - x));
}
} else {
for (size_t i = start; i < end; i++) {
T z;
T x = input[i];
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
output[i] = log(z / (one - z));
}
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max core num is 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
"Logit Compute failed.");
} else {
T one = T(1);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (int64_t i = 0; i < data_num; i++) {
T x = input[i];
output[i] = log(x / (one - x));
}
} else {
for (int64_t i = 0; i < data_num; i++) {
T z;
T x = input[i];
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
output[i] = log(z / (one - z));
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogit, LogitCpuKernel);
} // namespace aicpu
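
Logit computes log(x / (1 - x)); when a non-negative eps attribute is supplied, x is first clamped to [eps, 1 - eps] so that boundary inputs stay finite. A framework-free sketch of that formula (illustrative names, not part of the change):

#include <algorithm>
#include <cmath>

// Mirrors the eps < 0 (no clamp) and eps >= 0 (clamp to [eps, 1 - eps])
// branches of LogitCompute above.
double LogitReference(double x, double eps = -1.0) {
  if (eps >= 0.0) {
    x = std::min(std::max(x, eps), 1.0 - eps);
  }
  return std::log(x / (1.0 - x));
}

// LogitReference(0.5)        -> 0.0
// LogitReference(1.0, 1e-6)  -> log((1 - 1e-6) / 1e-6), finite instead of +inf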

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogitCpuKernel : public CpuKernel {
public:
LogitCpuKernel() = default;
~LogitCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t LogitCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,133 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logit_grad.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cmath"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogitGrad = "LogitGrad";
#define LOGITGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogitGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LogitGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogitGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogitGrad);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOGITGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOGITGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
LOGITGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("LogitGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogitGradCpuKernel::LogitGradCompute(CpuKernelContext &ctx) {
auto input_y_grad_tensor = ctx.Input(0);
auto input_x_tensor = ctx.Input(1);
auto output_x_grad_tensor = ctx.Output(0);
auto input_y_grad = reinterpret_cast<T *>(input_y_grad_tensor->GetData());
auto input_x = reinterpret_cast<T *>(input_x_tensor->GetData());
auto output_x_grad = reinterpret_cast<T *>(output_x_grad_tensor->GetData());
auto input_shape = input_x_tensor->GetTensorShape();
int64_t data_num = input_shape->NumElements();
float eps = -1.0;
AttrValue *attr = ctx.GetAttr("eps");
if (attr != nullptr) {
eps = attr->GetFloat();
}
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_less = [&](size_t start, size_t end) {
T one = T(1);
T zero = T(0);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (size_t i = start; i < end; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
}
} else {
for (size_t i = start; i < end; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] =
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
? zero
: (y_grad / (x * (one - x)));
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
"LogitGrad Compute failed.");
} else {
T one = T(1);
T zero = T(0);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (int64_t i = 0; i < data_num; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
}
} else {
for (int64_t i = 0; i < data_num; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] =
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
? zero
: (y_grad / (x * (one - x)));
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogitGrad, LogitGradCpuKernel);
} // namespace aicpu
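
Since logit(x) = log(x / (1 - x)), its derivative is 1 / (x * (1 - x)), so the kernel propagates grad_y / (x * (1 - x)); out-of-range x yields NaN when no eps attribute is given and zero when eps >= 0, mirroring the forward clamp. A framework-free sketch (illustrative names, not part of the change):

#include <limits>

// Mirrors the two branches of LogitGradCompute above.
double LogitGradReference(double grad_y, double x, double eps = -1.0) {
  if (eps < 0.0) {
    if (x < 0.0 || x > 1.0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
  } else if (x < eps || x > 1.0 - eps) {
    return 0.0;
  }
  return grad_y / (x * (1.0 - x));
}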

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogitGradCpuKernel : public CpuKernel {
public:
LogitGradCpuKernel() = default;
~LogitGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t LogitGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,153 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lower_bound.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kLowerBound = "LowerBound";
#define LOWERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = LowerBoundCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LowerBound kernel compute failed."); \
return result; \
} \
break; \
}
#define LOWERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t LowerBoundCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LowerBound check input and output number failed.");
Tensor *sorted_x_data = ctx.Input(0);
Tensor *values_data = ctx.Input(1);
Tensor *output_data = ctx.Output(0);
auto output_type = output_data->GetDataType();
auto sorted_x_type = sorted_x_data->GetDataType();
auto values_type = values_data->GetDataType();
if (sorted_x_type != values_type) {
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
DTypeStr(values_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (output_type) {
case DT_INT32:
switch (sorted_x_type) {
LOWERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (sorted_x_type) {
LOWERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t LowerBoundCpuKernel::LowerBoundCompute(CpuKernelContext &ctx) {
Tensor *sorted_x_data = ctx.Input(0);
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
Tensor *values_data = ctx.Input(1);
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
auto values_data_shape = values_data->GetTensorShape();
int64_t values_data_num = values_data_shape->NumElements();
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
int64_t values_data_column = values_data_shape_dims[1];
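// For every element of values, binary-search its row of sorted_x for the first position not less than the value;
// small inputs are handled serially, larger ones through ParallelFor.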
if (values_data_num < 1024) {
for (int64_t i = 0; i < values_data_num; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > values_data_num) {
sum_core_num = values_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
"LowerBound Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLowerBound, LowerBoundCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
#define AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LowerBoundCpuKernel : public CpuKernel {
public:
LowerBoundCpuKernel() = default;
~LowerBoundCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t LowerBoundCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,115 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lstsq.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include <Eigen/Dense>
#include <Eigen/Cholesky>
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLstsq = "Lstsq";
} // namespace
namespace aicpu {
uint32_t LstsqCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lstsq check input and output number failed.");
Tensor *input_x0 = ctx.Input(0);
Tensor *input_x1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
auto dims_0 = input_x0->GetTensorShape()->GetDims();
auto dims_1 = input_x1->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((dims_0 == 2), KERNEL_STATUS_PARAM_INVALID, "Dimension of input[0] must be 2, but got [%d].",
dims_0);
KERNEL_CHECK_FALSE(((dims_1 == 2) || (dims_1 == 1)), KERNEL_STATUS_PARAM_INVALID,
"Dimension of input[1] must be 2 or 1, but got [%d].", dims_1);
auto shape_0 = input_x0->GetTensorShape();
auto shape_1 = input_x1->GetTensorShape();
KERNEL_CHECK_FALSE((shape_0->GetDimSize(0) == shape_1->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"Lstsq shape_0[0] [%lld] and shape_1[0] [%lld] are not equal.", shape_0->GetDimSize(0), shape_1->GetDimSize(0));
AttrValue *l2_regularizer = ctx.GetAttr("l2_regularizer");
AttrValue *fast = ctx.GetAttr("fast");
KERNEL_CHECK_NULLPTR(l2_regularizer, KERNEL_STATUS_PARAM_INVALID, "Get l2_regularizer failed.");
KERNEL_CHECK_NULLPTR(fast, KERNEL_STATUS_PARAM_INVALID, "Get fast failed.");
KERNEL_LOG_DEBUG(
"LstsqCpuKernel[%s], inputx0: size[%llu];"
"inputx1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
DataType data_type1 = ctx.Input(0)->GetDataType();
DataType data_type2 = ctx.Input(1)->GetDataType();
KERNEL_CHECK_FALSE((data_type1 == data_type2), KERNEL_STATUS_PARAM_INVALID,
"Lstsq input_0 data type [%s] must be equal to input_1 data type [%s].", DTypeStr(data_type1).c_str(),
DTypeStr(data_type2).c_str());
switch (data_type1) {
case DT_FLOAT16:
return LstsqCompute<float, Eigen::half>(ctx);
case DT_FLOAT:
return LstsqCompute<float, float>(ctx);
case DT_DOUBLE:
return LstsqCompute<double, double>(ctx);
default:
KERNEL_LOG_ERROR("Lstsq kernel data type [%u] not support.", DTypeStr(data_type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t LstsqCpuKernel::LstsqCompute(CpuKernelContext &ctx) {
Eigen::Index m = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
Eigen::Index n = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
Eigen::Index k = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() == 2) {
k = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
}
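// m >= n: solve the least-squares problem with column-pivoted Householder QR;
// m < n: form the minimum-norm solution x = A^T (A A^T)^(-1) B.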
typedef Eigen::Matrix<T1, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartixXd;
MartixXd A(m, n);
MartixXd B(m, k);
auto aptr = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
for (int i = 0; i < m * n; i++) {
*(A.data() + i) = static_cast<T1>(*(aptr + i));
}
for (int i = 0; i < m * k; i++) {
*(B.data() + i) = static_cast<T1>(*(bptr + i));
}
MartixXd result(n, k);
if (m >= n) {
result = A.colPivHouseholderQr().solve(B);
} else {
MartixXd A_Transpose = A.transpose();
MartixXd temp = A * A_Transpose;
MartixXd tempI = temp.inverse();
MartixXd x = A_Transpose * tempI;
MartixXd output = x * B;
result = output;
}
auto output_addr = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
for (int i = 0; i < n * k; i++) {
*(output_addr + i) = static_cast<T2>(*(result.data() + i));
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLstsq, LstsqCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LSTSQ_H_
#define AICPU_KERNELS_NORMALIZED_LSTSQ_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LstsqCpuKernel : public CpuKernel {
public:
LstsqCpuKernel() = default;
~LstsqCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t LstsqCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,185 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_solve.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include <Eigen/Dense>
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const int64_t kParallelBatchNum1 = 50;
const int64_t kParallelBatchNum4 = 200;
const int64_t kParallelBatchNum8 = 500;
const int64_t kParallelBatchNumx = 1000;
const char *kLuSolve = "LuSolve";
} // namespace
namespace aicpu {
uint32_t LuSolveCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check LuSolve params failed.");
Tensor *input_0 = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input0 data failed.");
Tensor *input_1 = ctx.Input(1);
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input1 data failed.");
Tensor *input_2 = ctx.Input(2);
KERNEL_CHECK_NULLPTR(input_2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input2 data failed.");
Tensor *output = ctx.Output(0);
auto input_0_Shape = input_0->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_0_Shape failed.")
auto input_1_Shape = input_1->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_1_Shape failed.")
auto input_2_Shape = input_2->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_2_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_2_Shape failed.")
int32_t b_dims = input_0_Shape->GetDims();
int32_t lu_dims = input_1_Shape->GetDims();
int32_t pivots_dims = input_2_Shape->GetDims();
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
if (b_dims == lu_dims) {
for (int32_t i = 0; i <= b_dims - 2; i++) {
if (b_dims_vector[i] != lu_dims_vector[i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else if (lu_dims > b_dims) {
for (int32_t i = 0; i < b_dims - 2; i++) {
if (b_dims_vector[i] != lu_dims_vector[lu_dims - b_dims + i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
for (int32_t i = 0; i < lu_dims - 2; i++) {
if (lu_dims_vector[i] != b_dims_vector[b_dims - lu_dims + i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
for (int32_t i = 0; i < pivots_dims; i++) {
if (lu_dims_vector[i] != pivots_dims_vector[i]) {
KERNEL_LOG_ERROR("batch dimension of LU_pivots doesn't match batch dimension of LU_data!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
auto data_type = ctx.Input(0)->GetDataType();
KERNEL_LOG_DEBUG(
"LuSolveCpuKernel[%s], input_0: size[%llu], input_1: size[%llu], input_2: size[%llu]"
"output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
output->GetDataSize());
switch (data_type) {
case DT_FLOAT:
return LuSolveCompute<float, float>(ctx);
case DT_FLOAT16:
return LuSolveCompute<float, Eigen::half>(ctx);
default:
KERNEL_LOG_ERROR("LuSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr,
int32_t *pivots_working_ptr, int64_t b_stride, int64_t a) {
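// Solve one batch element: apply the LU pivots as row swaps to b, then solve (L * U) * x = P * b.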
auto output_y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
int32_t lu_dims = input_1_Shape->GetDims();
int64_t lu_matrix_sizes = input_1_Shape->GetDimSize(lu_dims - 2);
int32_t b_dim = input_0_Shape->GetDims();
int64_t b_m = input_0_Shape->GetDimSize(b_dim - 1);
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd matrix_b = Eigen::Map<MatrixXd>(b_working_ptr, lu_matrix_sizes, b_m);
MatrixXd matrix_A = Eigen::Map<MatrixXd>(lu_working_ptr, lu_matrix_sizes, lu_matrix_sizes);
for (int64_t i = 0; i < input_0_Shape->GetDimSize(b_dim - 2); i++) {
matrix_b.row(i).swap(matrix_b.row(*(pivots_working_ptr + i) - 1));
}
MatrixXd L = matrix_A.template triangularView<Eigen::UnitLower>();
MatrixXd U = matrix_A.template triangularView<Eigen::Upper>();
MatrixXd result = (L * U).lu().solve(matrix_b);
for (int64_t m = 0; m < b_stride; m++) {
*(output_y + a * b_stride + m) = (T2) * (result.data() + m);
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolveCompute(CpuKernelContext &ctx) {
auto input_x0 = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
auto input_x1 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto input_x2 = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
auto input_2_Shape = ctx.Input(2)->GetTensorShape();
T *input_0 = new T[input_0_Shape->NumElements()];
T *input_1 = new T[input_1_Shape->NumElements()];
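// Promote the raw inputs (possibly Eigen::half) to the computation type T before solving.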
for (int64_t i = 0; i < input_0_Shape->NumElements(); i++) {
*(input_0 + i) = (T) * (input_x0 + i);
}
for (int64_t i = 0; i < input_1_Shape->NumElements(); i++) {
*(input_1 + i) = (T) * (input_x1 + i);
}
int32_t b_dims = input_0_Shape->GetDims();
int32_t lu_dims = input_1_Shape->GetDims();
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
int64_t b_stride = input_0_Shape->GetDimSize(b_dims - 1) * input_0_Shape->GetDimSize(b_dims - 2);
int64_t lu_stride = input_1_Shape->GetDimSize(lu_dims - 1) * input_1_Shape->GetDimSize(lu_dims - 2);
int64_t pivots_stride = input_1_Shape->GetDimSize(lu_dims - 1);
std::vector<int64_t> b_shape = b_dims_vector;
std::vector<int64_t> lu_shape = lu_dims_vector;
for (size_t i = 0; i < 2; i++) {
b_shape.pop_back();
lu_shape.pop_back();
}
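// Broadcast the batch dimensions of b and LU_data (last two dims stripped above) to pair each b with its factors.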
Bcast bcast(b_shape, lu_shape);
int64_t batch_num = ctx.Output(0)->NumElements() / b_stride;
if (batch_num < kParallelBatchNum1) {
for (int64_t i = 0; i < batch_num; i++) {
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (batch_num < kParallelBatchNumx) max_core_num = 8U;
if (batch_num < kParallelBatchNum8) max_core_num = 4U;
if (batch_num < kParallelBatchNum4) max_core_num = 2U;
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
"LuSolve Compute failed.");
}
delete[] input_0;
delete[] input_1;
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuSolve, LuSolveCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,22 @@
#ifndef AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LuSolveCpuKernel : public CpuKernel {
public:
LuSolveCpuKernel() = default;
~LuSolveCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename T2>
static uint32_t LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, int32_t *pivots_working_ptr,
int64_t b_stride, int64_t i);
template <typename T, typename T2>
static uint32_t LuSolveCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,321 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_unpack.h"
#include <string.h>
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include "cpu_context.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_tensor.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 2;
const uint32_t kFirstInputIndex = 0;
const uint32_t kSecondInputIndex = 1;
const uint32_t kFirstOutputIndex = 0;
const uint32_t kSecondOutputIndex = 1;
const uint32_t kThirdOutputIndex = 2;
const int32_t kLuDataMinRank = 2;
const int32_t kLuPivotsMinRank = 2;
const int64_t kParallelBatchNum = 70;
const char *kLuUnpack = "LuUnpack";
} // namespace
namespace aicpu {
template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index,
T_data *P_eye) {
int32_t Lu_data_dims = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDims();
int64_t Lu_data_dim1 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDims();
int64_t Lu_pivots_dim = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDimSize(Lu_pivots_dims - 1);
int64_t matrix_width = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 2];
int64_t matrix_height = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 1];
int64_t pivots_stride = Lu_data_dim1 * Lu_data_dim1;
int64_t L_stride = 0;
int64_t U_stride = 0;
if (Lu_data_dim1 > Lu_data_dim2) {
L_stride = Lu_data_dim1 * Lu_data_dim2;
U_stride = Lu_data_dim2 * Lu_data_dim2;
} else {
L_stride = Lu_data_dim1 * Lu_data_dim1;
U_stride = Lu_data_dim1 * Lu_data_dim2;
}
int64_t matrix_size = matrix_width * matrix_height;
using MatrixMap = Eigen::Map<Eigen::Matrix<T_data, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
MatrixMap input(reinterpret_cast<T_data *>(ctx.Input(kFirstInputIndex)->GetData()) + matrix_index * matrix_size,
matrix_width, matrix_height);
// Triu
if (matrix_width > matrix_height) {
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
matrix_height, matrix_height);
T_data *MiddlePtr = new T_data[matrix_size];
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
MiddleData = input.template triangularView<Eigen::Upper>();
output2 = MiddleData.block(0, 0, matrix_height, matrix_height);
delete[] MiddlePtr;
} else {
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
matrix_width, matrix_height);
output2 = input.template triangularView<Eigen::Upper>();
}
// Tril
if (matrix_height > matrix_width) {
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
matrix_width, matrix_width);
T_data *MiddlePtr = new T_data[matrix_size];
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
MiddleData = input.template triangularView<Eigen::UnitLower>();
output1 = MiddleData.block(0, 0, matrix_width, matrix_width);
delete[] MiddlePtr;
} else {
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
matrix_width, matrix_height);
output1 = input.template triangularView<Eigen::UnitLower>();
}
// Swap
std::vector<T_pivots> final_order;
final_order.resize(Lu_data_dim1);
for (int i = 0; i < Lu_data_dim1; i++) {
final_order[i] = T_pivots(i);
}
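// Apply the 1-based pivot swaps in order to accumulate the final row permutation of the identity matrix.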
for (T_pivots id = 0; id < Lu_pivots_dim; id++) {
int64_t perm_id = 0;
int64_t perm_pivots_id = 0;
for (int64_t i = 0; i < Lu_data_dim1; i++) {
if (id == final_order[i]) {
perm_id = i;
}
if (!((*(Lu_pivots_working_ptr + id) <= Lu_data_dim1) && (*(Lu_pivots_working_ptr + id) >= 1))) {
return KERNEL_STATUS_PARAM_INVALID;
}
if ((*(Lu_pivots_working_ptr + id) - 1) == final_order[i]) {
perm_pivots_id = i;
}
}
std::swap(final_order[perm_id], final_order[perm_pivots_id]);
}
// Index_select
auto output_y0 = reinterpret_cast<T_data *>(ctx.Output(kFirstOutputIndex)->GetData());
int64_t indices_num = final_order.size();
int64_t inner_size = Lu_data_dim1;
int64_t slice_size = inner_size * sizeof(T_data);
for (int64_t j = 0; j < indices_num; ++j) {
auto params_idx = final_order[j] * inner_size;
auto out_idx = j * inner_size;
memcpy(output_y0 + matrix_index * pivots_stride + out_idx, P_eye + params_idx, slice_size);
}
return KERNEL_STATUS_OK;
}
template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpackCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(kFirstInputIndex);
Tensor *input1_tensor = ctx.Input(kSecondInputIndex);
auto input_0_Shape = input0_tensor->GetTensorShape();
auto input_1_Shape = input1_tensor->GetTensorShape();
int32_t Lu_data_dims = input_0_Shape->GetDims();
int64_t Lu_data_dim1 = input_0_Shape->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = input_0_Shape->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = input_1_Shape->GetDims();
int64_t Lu_pivots_dim = input_1_Shape->GetDimSize(Lu_pivots_dims - 1);
auto input_dim_size = input_0_Shape->GetDimSizes();
auto input_x1 = reinterpret_cast<T_pivots *>(input1_tensor->GetData());
int32_t block_size = Lu_data_dim1 * Lu_data_dim1;
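// P_eye is a Lu_data_dim1 x Lu_data_dim1 identity matrix; its rows are gathered per the pivots to build P.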
T_data *P_eye = new T_data[block_size]{};
T_data num = static_cast<T_data>(1);
for (int32_t i = 0; i < Lu_data_dim1; i++) {
*(P_eye + (Lu_data_dim1 + 1) * i) = num;
}
uint32_t check_status = 0;
int64_t Lu_data_stride = Lu_data_dim1 * Lu_data_dim2;
int64_t Lu_pivots_stride = Lu_pivots_dim;
int64_t batch_num = ctx.Input(0)->NumElements() / Lu_data_stride;
if (batch_num < kParallelBatchNum || Lu_data_dims == kLuDataMinRank) {
for (int64_t matrix_index = 0; matrix_index < batch_num; matrix_index++) {
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
check_status = LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye);
if (check_status == KERNEL_STATUS_PARAM_INVALID) {
delete[] P_eye;
return check_status;
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > batch_num) {
max_core_num = batch_num;
}
uint32_t parallel_status = 0;
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t matrix_index = start; matrix_index < end; matrix_index++) {
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
if (LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye) == KERNEL_STATUS_OK) {
parallel_status = KERNEL_STATUS_OK;
} else {
parallel_status = KERNEL_STATUS_PARAM_INVALID;
break;
}
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
"LuUnpack Compute failed.");
if (parallel_status != KERNEL_STATUS_OK) {
delete[] P_eye;
return KERNEL_STATUS_PARAM_INVALID;
}
}
delete[] P_eye;
return KERNEL_STATUS_OK;
}
void LuUnpackCpuKernel::SetMap() {
calls_[DT_INT8][DT_INT8] = LuUnpackCompute<int8_t, int8_t>;
calls_[DT_INT8][DT_UINT8] = LuUnpackCompute<int8_t, uint8_t>;
calls_[DT_INT8][DT_INT16] = LuUnpackCompute<int8_t, int16_t>;
calls_[DT_INT8][DT_INT32] = LuUnpackCompute<int8_t, int32_t>;
calls_[DT_INT8][DT_INT64] = LuUnpackCompute<int8_t, int64_t>;
calls_[DT_INT16][DT_INT8] = LuUnpackCompute<int16_t, int8_t>;
calls_[DT_INT16][DT_INT16] = LuUnpackCompute<int16_t, int16_t>;
calls_[DT_INT16][DT_INT32] = LuUnpackCompute<int16_t, int32_t>;
calls_[DT_INT16][DT_INT64] = LuUnpackCompute<int16_t, int64_t>;
calls_[DT_INT16][DT_UINT8] = LuUnpackCompute<int16_t, uint8_t>;
calls_[DT_INT32][DT_INT8] = LuUnpackCompute<int32_t, int8_t>;
calls_[DT_INT32][DT_INT16] = LuUnpackCompute<int32_t, int16_t>;
calls_[DT_INT32][DT_INT32] = LuUnpackCompute<int32_t, int32_t>;
calls_[DT_INT32][DT_INT64] = LuUnpackCompute<int32_t, int64_t>;
calls_[DT_INT32][DT_UINT8] = LuUnpackCompute<int32_t, uint8_t>;
calls_[DT_INT64][DT_INT8] = LuUnpackCompute<int64_t, int8_t>;
calls_[DT_INT64][DT_INT16] = LuUnpackCompute<int64_t, int16_t>;
calls_[DT_INT64][DT_INT32] = LuUnpackCompute<int64_t, int32_t>;
calls_[DT_INT64][DT_INT64] = LuUnpackCompute<int64_t, int64_t>;
calls_[DT_INT64][DT_UINT8] = LuUnpackCompute<int64_t, uint8_t>;
calls_[DT_FLOAT16][DT_INT8] = LuUnpackCompute<Eigen::half, int8_t>;
calls_[DT_FLOAT16][DT_INT16] = LuUnpackCompute<Eigen::half, int16_t>;
calls_[DT_FLOAT16][DT_INT32] = LuUnpackCompute<Eigen::half, int32_t>;
calls_[DT_FLOAT16][DT_INT64] = LuUnpackCompute<Eigen::half, int64_t>;
calls_[DT_FLOAT16][DT_UINT8] = LuUnpackCompute<Eigen::half, uint8_t>;
calls_[DT_FLOAT][DT_INT8] = LuUnpackCompute<float, int8_t>;
calls_[DT_FLOAT][DT_INT16] = LuUnpackCompute<float, int16_t>;
calls_[DT_FLOAT][DT_INT32] = LuUnpackCompute<float, int32_t>;
calls_[DT_FLOAT][DT_INT64] = LuUnpackCompute<float, int64_t>;
calls_[DT_FLOAT][DT_UINT8] = LuUnpackCompute<float, uint8_t>;
calls_[DT_DOUBLE][DT_INT8] = LuUnpackCompute<double, int8_t>;
calls_[DT_DOUBLE][DT_INT16] = LuUnpackCompute<double, int16_t>;
calls_[DT_DOUBLE][DT_INT32] = LuUnpackCompute<double, int32_t>;
calls_[DT_DOUBLE][DT_INT64] = LuUnpackCompute<double, int64_t>;
calls_[DT_DOUBLE][DT_UINT8] = LuUnpackCompute<double, uint8_t>;
calls_[DT_UINT8][DT_INT8] = LuUnpackCompute<uint8_t, int8_t>;
calls_[DT_UINT8][DT_INT16] = LuUnpackCompute<uint8_t, int16_t>;
calls_[DT_UINT8][DT_INT32] = LuUnpackCompute<uint8_t, int32_t>;
calls_[DT_UINT8][DT_INT64] = LuUnpackCompute<uint8_t, int64_t>;
calls_[DT_UINT8][DT_UINT8] = LuUnpackCompute<uint8_t, uint8_t>;
}
uint32_t LuUnpackCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpack check input and output number failed.");
Tensor *LU_data_ = ctx.Input(0);
Tensor *LU_pivots_ = ctx.Input(1);
std::shared_ptr<TensorShape> LU_data_shape = LU_data_->GetTensorShape();
std::shared_ptr<TensorShape> LU_pivots_shape = LU_pivots_->GetTensorShape();
int32_t LU_data_rank = LU_data_shape->GetDims();
if (LU_data_rank < kLuDataMinRank) {
KERNEL_LOG_ERROR(
"The input dim size of LU_data must be at least 2-D, "
"while %d",
LU_data_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
int32_t Lu_data_dims = LU_data_shape->GetDims();
int64_t Lu_data_dim1 = LU_data_shape->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = LU_data_shape->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = LU_pivots_shape->GetDims();
int64_t Lu_pivots_dim = LU_pivots_shape->GetDimSize(Lu_pivots_dims - 1);
if (Lu_pivots_dim != std::min(Lu_data_dim1, Lu_data_dim2)) {
KERNEL_LOG_ERROR(
"The last dimension of LU_pivots must be the same as the minimum value "
"of the last two dimensions of LU_data, "
"but got The last dimension of LU_pivots [%d], the minimum value of "
"the last two dimensions of LU_data: [%d]",
Lu_pivots_dim, std::min(Lu_data_dim1, Lu_data_dim2));
return KERNEL_STATUS_PARAM_INVALID;
}
for (int32_t i = 0; i < Lu_pivots_dims - 1; i++) {
if (LU_data_shape->GetDimSize(i) != LU_pivots_shape->GetDimSize(i)) {
KERNEL_LOG_ERROR(
" LU_data's batch dimensions does not match LU_pivots's batch "
"dimensions.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
DataType LU_data_dtype = static_cast<DataType>(LU_data_->GetDataType());
bool LU_data_dtype_flag = LU_data_dtype != DT_FLOAT16 && LU_data_dtype != DT_FLOAT && LU_data_dtype != DT_DOUBLE &&
LU_data_dtype != DT_INT8 && LU_data_dtype != DT_UINT8 && LU_data_dtype != DT_INT16 &&
LU_data_dtype != DT_INT32 && LU_data_dtype != DT_INT64;
if (LU_data_dtype_flag) {
KERNEL_LOG_ERROR(
"Op LuUnpack first input LU_data_type's data type should be of the "
"follows: "
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, "
"DT_FLOAT, DT_DOUBLE, "
"but this type is [%s].",
DTypeStr(LU_data_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
DataType LU_pivots_dtype = static_cast<DataType>(LU_pivots_->GetDataType());
bool LU_pivots_dtype_flag = LU_pivots_dtype != DT_INT8 && LU_pivots_dtype != DT_UINT8 &&
LU_pivots_dtype != DT_INT16 && LU_pivots_dtype != DT_INT32 && LU_pivots_dtype != DT_INT64;
if (LU_pivots_dtype_flag) {
KERNEL_LOG_ERROR(
"Op LuUnpack second input LU_pivots_type's data type should be of the "
"follows: "
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, "
"but this type is [%s].",
DTypeStr(LU_pivots_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
SetMap();
std::vector<DataType> LU_data_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE};
std::vector<DataType> LU_pivots_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64};
for (uint64_t i = 0; i < LU_data_type_vec.size(); i++) {
for (uint64_t j = 0; j < LU_pivots_type_vec.size(); j++) {
if (LU_data_dtype == LU_data_type_vec[i] && LU_pivots_dtype == LU_pivots_type_vec[j]) {
KERNEL_HANDLE_ERROR(calls_[LU_data_type_vec[i]][LU_pivots_type_vec[j]](ctx),
"The elements of LU_pivots must be greater than 1 "
"and be less than the size of LU_pivots's last dimension.");
}
}
}
calls_.clear();
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpack, LuUnpackCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
#define AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LuUnpackCpuKernel : public CpuKernel {
public:
LuUnpackCpuKernel() = default;
~LuUnpackCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T_data, typename T_pivots>
static uint32_t LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, T_data *P_eye);
template <typename T_data, typename T_pivots>
static uint32_t LuUnpackCompute(CpuKernelContext &ctx);
template <typename T_pivots>
static uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
std::map<int, std::map<int, std::function<uint32_t(CpuKernelContext &)>>> calls_;
void SetMap();
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,183 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_unpack_grad.h"
#include <iostream>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"
namespace {
const char *kLuUnpackGrad = "LuUnpackGrad";
const int64_t kParallelBatchNum = 30;
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 2;
const uint32_t kInputFirst = 0;
const uint32_t kInputSecond = 1;
const uint32_t kInputThird = 2;
} // namespace
namespace aicpu {
uint32_t LuUnpackGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpackGrad check input and output number failed.");
// choose compute function depend on dataType
auto input_type = static_cast<DataType>(ctx.Input(kInputThird)->GetDataType());
switch (input_type) {
case DT_FLOAT16:
return LuUnpackGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return LuUnpackGradCompute<float>(ctx);
case DT_DOUBLE:
return LuUnpackGradCompute<double>(ctx);
case DT_INT8:
return LuUnpackGradCompute<int8_t>(ctx);
case DT_INT16:
return LuUnpackGradCompute<int16_t>(ctx);
case DT_INT32:
return LuUnpackGradCompute<int32_t>(ctx);
case DT_INT64:
return LuUnpackGradCompute<int64_t>(ctx);
case DT_UINT8:
return LuUnpackGradCompute<uint8_t>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LuUnpackGradCpuKernel::TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a) {
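// Copy the strictly lower triangle of L_grad and the upper triangle of U_grad for batch element a into the
// full-sized output gradients.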
Tensor *L_grad = NULL;
Tensor *U_grad = NULL;
Tensor *LU_data = NULL;
L_grad = ctx.Input(kInputFirst);
U_grad = ctx.Input(kInputSecond);
LU_data = ctx.Input(kInputThird);
auto LU_data_shape = LU_data->GetTensorShape();
int32_t LU_data_dims = LU_data_shape->GetDims();
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
auto LU_dim_min = std::min(LU_data_height, LU_data_width);
auto input_U_shape = U_grad->GetTensorShape();
auto input_U_dim_size = input_U_shape->GetDimSizes();
auto input_U_dims = input_U_shape->GetDims();
int64_t matrix_U_width = input_U_dim_size[input_U_dims - 2];
int64_t matrix_U_height = input_U_dim_size[input_U_dims - 1];
int64_t matrix_U_size = matrix_U_width * matrix_U_height;
auto input_L_shape = L_grad->GetTensorShape();
auto input_L_dim_size = input_L_shape->GetDimSizes();
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
auto input_L_dims = input_L_shape->GetDims();
int64_t matrix_L_width = input_L_dim_size[input_L_dims - 2];
int64_t matrix_L_height = input_L_dim_size[input_L_dims - 1];
int64_t matrix_L_size = matrix_L_width * matrix_L_height;
int64_t output_stride = LU_data_height * LU_data_width;
MatrixMap input_L(reinterpret_cast<T *>(L_grad->GetData()) + a * matrix_L_size, matrix_L_width, matrix_L_height);
MatrixMap input_U(reinterpret_cast<T *>(U_grad->GetData()) + a * matrix_U_size, matrix_U_width, matrix_U_height);
if (LU_data_width > LU_data_height) {
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
T *MiddlePtr = new T[matrix_L_size];
MatrixMap MiddleData(MiddlePtr, matrix_L_width, matrix_L_height);
MiddleData = input_L.template triangularView<Eigen::StrictlyLower>();
for (auto i = 0; i < LU_data_height; i++) {
for (auto j = 0; j < LU_dim_min; j++) {
output_L(i, j) = MiddleData(i, j);
}
}
delete[] MiddlePtr;
} else {
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
output_L = input_L.template triangularView<Eigen::StrictlyLower>();
}
if (LU_data_height > LU_data_width) {
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
T *MiddlePtr = new T[matrix_U_size];
MatrixMap MiddleData(MiddlePtr, matrix_U_width, matrix_U_height);
MiddleData = input_U.template triangularView<Eigen::Upper>();
for (auto i = 0; i < LU_dim_min; i++) {
for (auto j = i; j < LU_data_width; j++) {
output_U(i, j) = MiddleData(i, j);
}
}
delete[] MiddlePtr;
} else {
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
output_U = input_U.template triangularView<Eigen::Upper>();
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LuUnpackGradCpuKernel::LuUnpackGradCompute(CpuKernelContext &ctx) {
Tensor *LU_data = NULL;
Tensor *L_grad_output = NULL;
Tensor *U_grad_output = NULL;
LU_data = ctx.Input(kInputThird);
L_grad_output = ctx.Output(0);
U_grad_output = ctx.Output(1);
auto LU_data_shape = LU_data->GetTensorShape();
int32_t LU_data_dims = LU_data_shape->GetDims();
int64_t LU_data_elem_num = LU_data->NumElements();
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
int64_t LU_data_stride = LU_data_height * LU_data_width;
int64_t matrix_num = LU_data_elem_num / LU_data_stride;
auto L_grad_output_data = reinterpret_cast<T *>(L_grad_output->GetData());
auto U_grad_output_data = reinterpret_cast<T *>(U_grad_output->GetData());
for (int64_t i = 0; i < LU_data_elem_num; i++) {
*(L_grad_output_data + i) = static_cast<T>(0);
*(U_grad_output_data + i) = static_cast<T>(0);
}
if (matrix_num < kParallelBatchNum) {
for (int64_t i = 0; i < matrix_num; i++) {
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder),
"LuUnpackGrad Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpackGrad, LuUnpackGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_LU_UNPACK_GRAD_H_
#define AICPU_KERNELS_LU_UNPACK_GRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LuUnpackGradCpuKernel : public CpuKernel {
public:
~LuUnpackGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t LuUnpackGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,179 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matmul.h"
#include <complex>
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
using namespace std;
namespace {
const char *kMatmul = "MatMul";
} // namespace
namespace aicpu {
template <typename T>
uint32_t MatMulCpuKernel::AddCompute(CpuKernelContext &ctx, Bcast &bcast) {
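// Broadcast the 1-D bias (input x3) over the matmul result already stored in the output and add it elementwise.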
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; i++) {
auto input1 = in2 + bcast.GetBroadcastXIndex(i);  // broadcast element of the bias (input x3)
auto input2 = out + bcast.GetBroadcastYIndex(i);  // current element of the matmul result
*(out + i) = (*input1) + (*input2);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MatMulCpuKernel::BiasCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input2_tensor = ctx.Input(2);
auto input2_shape = input2_tensor->GetTensorShape()->GetDimSizes();
auto output_tensor = ctx.Output(kFirstOutputIndex);
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(input2_tensor->GetTensorShape()->GetDims() == 1, KERNEL_STATUS_PARAM_INVALID,
"Input[x3] must be a 1D tensor")
DataType input0_data_type = input0_tensor->GetDataType();
DataType input2_data_type = input2_tensor->GetDataType();
KERNEL_CHECK_FALSE((input0_data_type == input2_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] data type[%s] and input[x3] data type[%s] must be same",
DTypeStr(input0_data_type).c_str(), DTypeStr(input2_data_type).c_str())
Bcast bcast(input2_shape, output_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return AddCompute<T>(ctx, bcast);
}
template <typename T>
uint32_t MatMulCpuKernel::MatMulCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(input0_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] must be a matrix")
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(input1_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input[x2] must be a matrix")
auto transpose_x1 = ctx.GetAttr("transpose_x1")->GetBool();
auto transpose_x2 = ctx.GetAttr("transpose_x2")->GetBool();
KERNEL_LOG_DEBUG(
"%s Attr[transpose_x1] value[%d], "
"Attr[transpose_x2] value[%d].",
kMatmul, transpose_x1, transpose_x2);
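// Inner dimensions that must match: dim 1 of x1 (dim 0 if transposed) and dim 0 of x2 (dim 1 if transposed).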
int32_t x1_dim = transpose_x1 ? 0 : 1;
int32_t x2_dim = transpose_x2 ? 1 : 0;
KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)),
KERNEL_STATUS_PARAM_INVALID,
"Matrix size incompatible, input[x1] dim[%d] value[%lld], "
"input[x2] dim[%d] value[%lld]",
x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim))
auto input0_shape = input0_tensor_shape->GetDimSizes();
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
MatrixMap input0(reinterpret_cast<T *>(input0_tensor->GetData()), input0_shape[0], input0_shape[1]);
auto input1_shape = input1_tensor_shape->GetDimSizes();
MatrixMap input1(reinterpret_cast<T *>(input1_tensor->GetData()), input1_shape[0], input1_shape[1]);
auto output_tensor = ctx.Output(kFirstOutputIndex);
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
MatrixMap output(reinterpret_cast<T *>(output_tensor->GetData()), output_shape[0], output_shape[1]);
if (transpose_x1) {
if (transpose_x2) {
output = input0.transpose() * input1.transpose();
} else {
output = input0.transpose() * input1;
}
} else {
if (transpose_x2) {
output = input0 * input1.transpose();
} else {
output = input0 * input1;
}
}
if (ctx.GetInputsSize() == 3) {
return BiasCompute<T>(ctx);
}
return KERNEL_STATUS_OK;
}
uint32_t MatMulCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t input_num = ctx.GetInputsSize();
uint32_t output_num = ctx.GetOutputsSize();
if ((input_num != 2 && input_num != 3) || output_num != 1) {
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto input0_tensor = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input0_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x1] data failed",
ctx.GetOpType().c_str())
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
KERNEL_CHECK_NULLPTR(input1_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x2] data failed",
ctx.GetOpType().c_str())
DataType input0_data_type = input0_tensor->GetDataType();
DataType input1_data_type = input1_tensor->GetDataType();
KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] data type[%s] and input[x2] data type[%s] must be same",
DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str())
KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kMatmul, DTypeStr(input0_data_type).c_str());
uint32_t ret = KERNEL_STATUS_OK;
switch (input0_data_type) {
case DT_FLOAT:
ret = MatMulCompute<float>(ctx);
break;
case DT_DOUBLE:
ret = MatMulCompute<double>(ctx);
break;
case DT_FLOAT16:
ret = MatMulCompute<Eigen::half>(ctx);
break;
case DT_INT32:
ret = MatMulCompute<int32_t>(ctx);
break;
case DT_COMPLEX64:
ret = MatMulCompute<std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
ret = MatMulCompute<std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input0_data_type).c_str());
ret = KERNEL_STATUS_PARAM_INVALID;
}
return ret;
}
REGISTER_CPU_KERNEL(kMatmul, MatMulCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_HOST_MATMUL_H_
#define AICPU_KERNELS_HOST_MATMUL_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MatMulCpuKernel : public CpuKernel {
public:
MatMulCpuKernel() = default;
~MatMulCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t AddCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t BiasCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t MatMulCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,320 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_exp.h"
#include <array>
#include <complex>
#include <cmath>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMatrixExpInputNum = 1;
constexpr uint32_t kMatrixExpOutputNum = 1;
constexpr uint32_t kIndexTwo = 2;
const int64_t paralled_data_size = 8 * 1024;
const char *kMatrixExp = "MatrixExp";
constexpr int total_n_degs = 6;
// Coefficients for computing taylor approximant of order 8.
constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.;
constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / 352.);
constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3);
constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3);
constexpr double y2 = (857. - 58. * sqrt_177) / 630.;
template <typename T, int ROW, int COL>
using array2d = std::array<std::array<T, COL>, ROW>;
// Coefficients for computing taylor approximant of order 12.
constexpr int num_prods_12 = 4;
array2d<double, num_prods_12, num_prods_12> b12 = {
{{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740},
{5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989},
{0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230},
{-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}};
// Coefficients for computing taylor approximant of order 18.
constexpr int num_prods_18 = 5;
array2d<double, num_prods_18, num_prods_18> b18 = {
{{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
{0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01,
-6.37898194594723280150e-04},
{-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
3.34975017086070470649e-05},
{-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
-1.39180257516060693404e-05},
{0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}};
// Threshold for different order of taylor approximant.
constexpr std::array<float, total_n_degs> thetas_float = {1.192092800768788e-07, 5.978858893805233e-04,
5.116619363445086e-02, 5.800524627688768e-01,
1.461661507209034e+00, 3.010066362817634e+00};
// Threshold for different order of taylor approximant.
constexpr std::array<double, total_n_degs> thetas_double = {2.220446049250313e-16, 2.580956802971767e-08,
3.397168839976962e-04, 4.991228871115323e-02,
2.996158913811580e-01, 1.090863719290036e+00};
#define MATRIX_EXP_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixExpCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
return result; \
} \
break; \
}
#define MATRIX_EXP_COMPUTE_DIFF_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixExpDiffTypeCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MatrixExpCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMatrixExpInputNum, kMatrixExpOutputNum),
"[%s] check input and output number failed.", kMatrixExp);
KERNEL_HANDLE_ERROR(MatrixExpCheck(ctx), "[%s] check params failed.", kMatrixExp);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MATRIX_EXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
MATRIX_EXP_COMPUTE_DIFF_CASE(DT_FLOAT16, Eigen::half, ctx)
default:
KERNEL_LOG_ERROR("MatrixExp kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MatrixExpCpuKernel::MatrixExpCheck(CpuKernelContext &ctx) {
auto input_0 = ctx.Input(0);
std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
size_t shape_size_x = shape_x.size();
KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].",
shape_size_x)
KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Input x's last dimension must be at least 1.")
KERNEL_CHECK_FALSE((shape_x[shape_size_x - kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID,
"Input x's last two dimensions must be equal, but are [%lld] and [%lld].",
shape_x[shape_size_x - kIndexTwo], shape_x[shape_size_x - 1])
return KERNEL_STATUS_OK;
}
template <typename Derived1, typename Derived2, typename Derived3>
void MatrixExpCpuKernel::MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
int order, Eigen::MatrixBase<Derived3> &E) {
constexpr int expansion_order_1 = 1;
constexpr int expansion_order_2 = 2;
constexpr int expansion_order_4 = 4;
constexpr int expansion_order_8 = 8;
constexpr int expansion_order_12 = 12;
auto A2 = A * A;
auto A3 = A * A2;
if (order == expansion_order_1) {
E = I + A;
} else if (order == expansion_order_2) {
constexpr int A2_divisor = 2;
E = I + A + A2 / A2_divisor;
} else if (order == expansion_order_4) {
constexpr int I_divisor = 2;
constexpr int A_divisor = 6;
constexpr int A2_divisor = 24;
E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
} else if (order == expansion_order_8) {
auto A4 = A2 * (x1 * A + x2 * A2);
auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4);
E = I + A + y2 * A2 + A8;
} else if (order == expansion_order_12) {
auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3;
auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3;
auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3;
auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3;
auto q61 = q33 + q34 * q34;
E = q31 + (q32 + q61) * q61;
} else {
auto A6 = A3 * A3;
auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
auto q91 = q31 * q64 + q63;
E = q61 + (q62 + q91) * q91;
}
}
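// Scaling-and-squaring driver. The 1-norm of A (maximum column sum) selects the cheapest Taylor order
// whose threshold it satisfies; for larger norms the order-18 approximant is used, with A pre-scaled by
// 2^-s and the result squared s times once the norm exceeds the largest threshold.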
template <typename Derived1, typename Derived2>
void MatrixExpCpuKernel::MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx) {
const auto norm = A.cwiseAbs().colwise().sum().maxCoeff();
constexpr std::array<int, total_n_degs> m_vals = {1, 2, 4, 8, 12, 18};
constexpr int cut_deg = 2;
int64_t s = -1;
auto data_type = ctx.Input(0)->GetDataType();
if (data_type == DT_FLOAT16 || data_type == DT_FLOAT || data_type == DT_COMPLEX64) {
for (int i = 0; i < total_n_degs - 1; i++) {
if (norm <= thetas_float[i]) {
MTaylorApproximant(A, I, m_vals[i], mexp);
break;
}
}
if (norm >= thetas_float[total_n_degs - cut_deg]) {
s = ceil(log2(norm / thetas_float[total_n_degs - 1]));
if (s <= 0) {
s = 0;
}
}
} else {
for (int i = 0; i < total_n_degs - 1; i++) {
if (norm <= thetas_double[i]) {
MTaylorApproximant(A, I, m_vals[i], mexp);
break;
}
}
if (norm >= thetas_double[total_n_degs - cut_deg]) {
s = ceil(log2(norm / thetas_double[total_n_degs - 1]));
if (s <= 0) {
s = 0;
}
}
}
if (s >= 0) {
const auto pow2s = pow(2, s);
const auto A_scaled = A / pow2s;
MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp);
for (int k = 0; k < s; k++) {
mexp = mexp * mexp;
}
}
}
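// Treats the input as a batch of [m, m] matrices and exponentiates each one independently. Small inputs
// are processed serially; larger ones are sharded over matrices via ParallelFor.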
template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpCompute(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
int64_t m = shape_x[shape_size - 1];
int64_t size_mm = m * m;
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd I(m, m);
I.setIdentity();
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
if (data_size <= paralled_data_size) {
for (int64_t i = 0; i < matrix_num; i++) {
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_work = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
"MatrixExp Compute failed.");
}
return KERNEL_STATUS_OK;
}
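// float16 helper: widens the i-th half-precision matrix to float, runs MexpImpl, and casts the result
// back to Eigen::half.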
void MatrixExpCpuKernel::TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y,
CpuKernelContext &ctx) {
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd I(m, m);
(void)I.setIdentity();
MatrixXd matrix_x(m, m);
MatrixXd matrix_y(m, m);
int64_t size_mm = m * m;
for (int p = 0; p < m; p++) {
for (int q = 0; q < m; q++) {
matrix_x(p, q) = static_cast<float>(input_x[i * size_mm + p * m + q]);
}
}
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
for (int p = 0; p < m; p++) {
for (int q = 0; q < m; q++) {
output_y[i * size_mm + p * m + q] = static_cast<Eigen::half>(matrix_y(p, q));
}
}
}
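// float16 entry point: same serial/parallel batching as MatrixExpCompute, with every matrix routed
// through the conversion helper above.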
template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpDiffTypeCompute(CpuKernelContext &ctx) {
T *input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
int64_t m = shape_x[shape_size - 1];
int64_t size_mm = m * m;
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
if (data_size <= paralled_data_size) {
for (int64_t i = 0; i < matrix_num; i++) {
TypeChangeForFp16(i, m, input_x, output_y, ctx);
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_work = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
TypeChangeForFp16(i, m, input_x, output_y, ctx);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
"MatrixExp Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixExp, MatrixExpCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,50 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
#include "cpu_ops_kernel.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
class MatrixExpCpuKernel : public CpuKernel {
public:
MatrixExpCpuKernel() = default;
~MatrixExpCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MatrixExpCheck(CpuKernelContext &ctx);
template <typename Derived1, typename Derived2, typename Derived3>
void MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I, int order,
Eigen::MatrixBase<Derived3> &E);
template <typename Derived1, typename Derived2>
void MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx);
template <typename T>
uint32_t MatrixExpCompute(CpuKernelContext &ctx);
void TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, CpuKernelContext &ctx);
template <typename T>
uint32_t MatrixExpDiffTypeCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,460 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "maximum.h"
#include "Eigen/Dense"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMaximum = "Maximum";
// When the input data size exceeds kParallelDataNum, use the parallel compute path.
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define MAXIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MaximumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Maximum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MaximumCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Maximum check input and output number failed.");
KERNEL_HANDLE_ERROR(MaximumParamCheck(ctx), "Maximum check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MAXIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MAXIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
MAXIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
MAXIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
MAXIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Maximum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaximumCpuKernel::MaximumParamCheck(CpuKernelContext &ctx) {
// The non-null checks for input_0, input_1 and output have already been done in NormalCheck.
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"MaximumCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
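// Element-wise maximum for inputs with the same number of elements. With ignore_nan == false a NaN in
// either operand is propagated to the output; with ignore_nan == true the non-NaN operand is returned.
// is_float16 selects Eigen::numext::isnan for Eigen::half data.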
template <typename T>
void MaximumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
}
}
template <typename T>
void MaximumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1))) {
*(output + i) = *(input1);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1))) {
*(output + i) = *(input1);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
}
}
template <typename T>
void MaximumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2);
} else if (Eigen::numext::isnan(*(input2))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2);
} else if (isnan(*(input2))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2))) {
*(output + i) = *(input2);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2))) {
*(output + i) = *(input2);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
}
}
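// Classifies T once (only Eigen::half needs the half-precision NaN check) and dispatches to the loop
// matching the detected broadcast pattern.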
template <typename T>
void MaximumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
switch (type) {
case BcastShapeType::SAME_SHAPE:
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::X_ONE_ELEMENT:
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::Y_ONE_ELEMENT:
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
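// No-broadcast path: same shape, or one operand is a single element. Outputs larger than
// kParallelDataNumSameShape elements are sharded with ParallelFor (capped at 4 cores for mid-sized data).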
template <typename T>
uint32_t MaximumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_fmax = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
"Maximum Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
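// Broadcast path for the parallel case: GetBroadcastXIndex / GetBroadcastYIndex map every output index
// back to the matching input elements before the NaN-aware comparison is applied.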
template <typename T>
void MaximumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
void MaximumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
uint32_t MaximumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
int64_t data_num = ctx.Output(0)->NumElements();
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_fmax = [&](int64_t start, int64_t end) {
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
"Maximum Compute failed.");
} else {
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
}
return KERNEL_STATUS_OK;
}
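// Per-dtype entry point: takes the fast path when the shapes already match or one operand is a scalar,
// otherwise builds the broadcast mapping and runs BcastCompute.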
template <typename T>
uint32_t MaximumCpuKernel::MaximumCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaximum, MaximumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,63 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MaximumCpuKernel : public CpuKernel {
public:
MaximumCpuKernel() = default;
~MaximumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MaximumParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
uint32_t MaximumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,456 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minimum.h"
#include "Eigen/Dense"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMinimum = "Minimum";
// When the input data size exceeds kParallelDataNum, use the parallel compute path.
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MinimumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Minimum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MinimumCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Minimum check input and output number failed.");
KERNEL_HANDLE_ERROR(MinimumParamCheck(ctx), "Minimum check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Minimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MinimumCpuKernel::MinimumParamCheck(CpuKernelContext &ctx) {
// The non-null checks for input_0, input_1 and output have already been done in NormalCheck.
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"MinimumCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
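// The Minimum special-compute helpers mirror the Maximum kernel: identical ignore_nan handling, with the
// comparison reversed so the smaller operand is kept.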
template <typename T>
void MinimumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*input1)) {
*(output + i) = *input1;
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*input1)) {
*(output + i) = *input1;
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*input1)) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *input1;
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*input1)) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *input1;
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*input2)) {
*(output + i) = *input2;
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*input2)) {
*(output + i) = *input2;
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *input2;
} else if (Eigen::numext::isnan(*input2)) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *input2;
} else if (isnan(*input2)) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
switch (type) {
case BcastShapeType::SAME_SHAPE:
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::X_ONE_ELEMENT:
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::Y_ONE_ELEMENT:
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t MinimumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_minimum = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
"Minimum Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
void MinimumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
void MinimumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
uint32_t MinimumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
int64_t data_num = ctx.Output(0)->NumElements();
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_minimum = [&](int64_t start, int64_t end) {
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
"Minimum Compute failed.");
} else {
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MinimumCpuKernel::MinimumCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMinimum, MinimumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,63 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MINIMUM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MinimumCpuKernel : public CpuKernel {
public:
MinimumCpuKernel() = default;
~MinimumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MinimumParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
uint32_t MinimumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -1,5 +1,5 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View File

@ -171,11 +171,8 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kKLDivOpName,
mindspore::kKlDivLossGradOpName,
mindspore::kLcmOpName,
mindspore::kLessEqualOpName,
mindspore::kLogicalXorOpName,
mindspore::kLogitOpName,
mindspore::kLogitGradOpName,
mindspore::kLogNormalReverseOpName,
mindspore::kLowerBoundOpName,
mindspore::kLstsqOpName,
mindspore::kLuUnpackOpName,