!48016 second half of 0103 aicpu migration without IsInf
Merge pull request !48016 from 李林杰/0118_second_half_0103_aicpu_migration_fix_test_conj
commit 08aa1515d3
@@ -0,0 +1,228 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "hypot.h"

#include <algorithm>
#include <cmath>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHypot = "Hypot";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define HYPOT_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                       \
    uint32_t result = HypotCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                   \
      KERNEL_LOG_ERROR("Hypot kernel compute failed."); \
      return result;                                    \
    }                                                   \
    break;                                              \
  }
}  // namespace

namespace aicpu {
template <typename T>
T hypot(T a, T b) {
  return std::hypot(a, b);
}

uint32_t HypotCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Hypot check input and output number failed.");
  KERNEL_HANDLE_ERROR(HypotParamCheck(ctx), "Hypot check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    HYPOT_COMPUTE_CASE(DT_FLOAT, float, ctx)
    HYPOT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Hypot kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t HypotCpuKernel::HypotParamCheck(CpuKernelContext &ctx) {
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] must be the same as input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "HypotCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();
  BcastShapeType type;

  if (in0_elements_nums == in1_elements_nums) {
    type = BcastShapeType::SAME_SHAPE;
  } else {
    type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  }

  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_hypot = [&](int64_t start, int64_t end) {
      switch (type) {
        case BcastShapeType::SAME_SHAPE:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*(in0 + i), *(in1 + i));
          }
          break;
        case BcastShapeType::X_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*in0, *(in1 + i));
          }
          break;
        case BcastShapeType::Y_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*(in0 + i), *in1);
          }
          break;
        default:
          KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
          break;
      }
    };
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
                        "Hypot Compute failed.");
  } else {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:
        for (int64_t i = 0; i < data_num; ++i) {
          *(out + i) = hypot(*(in0 + i), *(in1 + i));
        }
        break;
      case BcastShapeType::X_ONE_ELEMENT:
        for (int64_t i = 0; i < data_num; ++i) {
          *(out + i) = hypot(*in0, *(in1 + i));
        }
        break;
      case BcastShapeType::Y_ONE_ELEMENT:
        for (int64_t i = 0; i < data_num; ++i) {
          *(out + i) = hypot(*(in0 + i), *in1);
        }
        break;
      default:
        KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
        break;
    }
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_hypot = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        *(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
                        "Hypot Compute failed.");
  } else {
    for (int64_t i = 0; i < data_num; ++i) {
      *(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::HypotCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  // Broadcasting can be skipped when the shapes match or either input is a single element.
  bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (no_bcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }
}

REGISTER_CPU_KERNEL(kHypot, HypotCpuKernel);
}  // namespace aicpu
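One detail worth noting in the kernel above: delegating to `std::hypot` rather than writing `std::sqrt(a * a + b * b)` avoids overflow and underflow in the intermediate squares. A standalone sketch (illustrative values, not part of the commit):

// Standalone illustration: std::hypot avoids intermediate overflow.
#include <cmath>
#include <cstdio>

int main() {
  double a = 3e200;
  double b = 4e200;
  // a * a overflows double, so the naive form prints inf.
  std::printf("naive sqrt(a*a + b*b): %g\n", std::sqrt(a * a + b * b));
  // std::hypot rescales internally and prints 5e+200.
  std::printf("std::hypot(a, b):      %g\n", std::hypot(a, b));
  return 0;
}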
@@ -0,0 +1,43 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HYPOT_H_
#define AICPU_KERNELS_NORMALIZED_HYPOT_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class HypotCpuKernel : public CpuKernel {
 public:
  HypotCpuKernel() = default;
  ~HypotCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t HypotParamCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  uint32_t HypotCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_HYPOT_H_
@@ -0,0 +1,81 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "identityn.h"

#include <algorithm>
#include <vector>

#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const char *kIdentityN = "IdentityN";
}  // namespace

namespace aicpu {
uint32_t IdentityNCpuKernel::IdentityNParamCheck(CpuKernelContext &ctx) {
  // input size and output size check
  uint32_t input_size = ctx.GetInputsSize();
  uint32_t output_size = ctx.GetOutputsSize();
  KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
                     "Input size should be equal to output size.");
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_size, output_size), "[%s] check params failed.", kIdentityN);
  for (uint32_t idx = 0; idx < input_size; ++idx) {
    Tensor *in_tensor = ctx.Input(idx);
    Tensor *out_tensor = ctx.Output(idx);
    // TensorShape check
    auto in_shape = in_tensor->GetTensorShape();
    auto out_shape = out_tensor->GetTensorShape();
    KERNEL_CHECK_FALSE((in_shape->GetDimSizes() == out_shape->GetDimSizes()), KERNEL_STATUS_PARAM_INVALID,
                       "Input tensor shape should be equal to output tensor shape.");
    // DataType check
    DataType in_type = in_tensor->GetDataType();
    DataType out_type = out_tensor->GetDataType();
    KERNEL_CHECK_FALSE((in_type == out_type), KERNEL_STATUS_PARAM_INVALID,
                       "Input tensor data type should be equal to output tensor data type.");
    bool type_support =
      std::find(support_data_type.begin(), support_data_type.end(), in_type) != support_data_type.end();
    KERNEL_CHECK_FALSE(type_support, KERNEL_STATUS_PARAM_INVALID, "IdentityN kernel data type [%s] not supported.",
                       DTypeStr(in_type).c_str());
  }
  return KERNEL_STATUS_OK;
}

uint32_t IdentityNCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(IdentityNParamCheck(ctx), "IdentityNCpuKernel check params failed.");
  uint32_t input_size = ctx.GetInputsSize();
  for (uint32_t idx = 0; idx < input_size; ++idx) {
    Tensor *in_tensor = ctx.Input(idx);
    Tensor *out_tensor = ctx.Output(idx);
    auto in_data = in_tensor->GetData();
    auto out_data = out_tensor->GetData();
    uint64_t in_size = in_tensor->GetDataSize();
    uint64_t out_size = out_tensor->GetDataSize();

    // memory copy (skip when input and output share the same buffer)
    if (out_data != in_data) {
      int cpret = memcpy_s(out_data, out_size, in_data, in_size);
      KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR,
                         "[%s] memcpy_s to output failed, destMax [%llu], count [%llu].", kIdentityN, out_size,
                         in_size);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kIdentityN, IdentityNCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#define AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_

#include <vector>

#include "cpu_ops_kernel.h"

namespace aicpu {
class IdentityNCpuKernel : public CpuKernel {
 public:
  IdentityNCpuKernel() = default;
  ~IdentityNCpuKernel() override = default;

  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t IdentityNParamCheck(CpuKernelContext &ctx);
  const std::vector<DataType> support_data_type = {DT_FLOAT, DT_FLOAT16, DT_INT8,   DT_INT16,  DT_UINT16, DT_UINT8,
                                                   DT_INT32, DT_INT64,   DT_UINT32, DT_UINT64, DT_BOOL,   DT_DOUBLE};
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
@@ -0,0 +1,230 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "index_fill.h"

#include <securec.h>

#include <algorithm>
#include <map>

#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kNumInput = 4;
const uint32_t kNumOutput = 1;
const char *kIndexFill = "IndexFill";

// when input data size is more than kParallelDataNum, use Parallel func
const uint32_t kParallelDataNum = 16 * 1024;
const uint32_t kParallelDataNumMid = 128 * 1024;

#define INDEXFILL_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                           \
    uint32_t result = DoCompute<TYPE>(CTX);                 \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("IndexFill kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
}  // namespace

namespace aicpu {
uint32_t IndexFillCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "IndexFill check input and output number failed.");
  // get input Tensors
  for (uint32_t i = 0; i < kNumInput; ++i) {
    Tensor *tensor = ctx.Input(i);
    inputs_.push_back(tensor);
  }
  // get output Tensors
  Tensor *tensor = ctx.Output(0);
  outputs_.push_back(tensor);

  int32_t value_dim = inputs_[3]->GetTensorShape()->GetDims();

  KERNEL_CHECK_FALSE((value_dim == 0), KERNEL_STATUS_INNER_ERROR,
                     "IndexFill only supports a 0-dimensional value tensor, "
                     "but got a tensor with [%d] dimension(s).",
                     value_dim)

  DataType dim_type = inputs_[1]->GetDataType();
  DataType index_type = inputs_[2]->GetDataType();

  if (dim_type != DT_INT32) {
    KERNEL_LOG_ERROR("IndexFill: expected dtype int32 for dim.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (index_type != DT_INT32) {
    KERNEL_LOG_ERROR("IndexFill: expected dtype int32 for index.");
    return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
void IndexFillCpuKernel::SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim,
                                        std::map<int32_t, bool> &index_dict) {
  auto *input_x = reinterpret_cast<T *>(inputs_[0]->GetData());
  auto *input_value = reinterpret_cast<T *>(inputs_[3]->GetData());
  auto *output_y = reinterpret_cast<T *>(outputs_[0]->GetData());
  int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
  auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();

  int32_t dim_flag;
  if (x_dim_nums != 0) {
    dim_flag = *input_dim % x_dim_nums + 1;
  } else {
    dim_flag = 0;
  }

  int32_t remain_dims = 1;
  if (dim_flag == x_dim_nums) {
    // fast path for the last dimension: the coordinate is simply i % dim size
    if (dim_flag != 0) {
      remain_dims = x_dims[*input_dim];
    }
    for (int64_t i = start; i < end; i++) {
      int32_t index_flag = i % remain_dims;
      std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
      if (f != index_dict.end()) {
        output_y[i] = *input_value;
      } else {
        output_y[i] = input_x[i];
      }
    }
  } else {
    // general case: the coordinate along dim is (i / product of trailing dims) % dim size
    for (int32_t i = *input_dim + 1; i < x_dim_nums; i++) {
      remain_dims *= x_dims[i];
    }
    for (int64_t i = start; i < end; i++) {
      int32_t index_flag = (i / remain_dims) % x_dims[*input_dim];
      std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
      if (f != index_dict.end()) {
        output_y[i] = *input_value;
      } else {
        output_y[i] = input_x[i];
      }
    }
  }
}

template <typename T>
uint32_t IndexFillCpuKernel::DoCompute(CpuKernelContext &ctx) {
  int32_t *input_1 = reinterpret_cast<int32_t *>(inputs_[1]->GetData());
  int32_t *input_2 = reinterpret_cast<int32_t *>(inputs_[2]->GetData());

  int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
  int32_t dim_nums = inputs_[1]->GetTensorShape()->GetDims();
  int32_t index_dim_nums = inputs_[2]->GetTensorShape()->GetDims();
  auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();

  int64_t data_num = outputs_[0]->NumElements();
  int64_t index_num = inputs_[2]->GetTensorShape()->NumElements();

  KERNEL_CHECK_FALSE(dim_nums == 0, KERNEL_STATUS_PARAM_INVALID, "Dim has to be a scalar.")
  KERNEL_CHECK_FALSE(index_dim_nums <= 1, KERNEL_STATUS_PARAM_INVALID, "Index has to be a vector/scalar.")

  int32_t cur_dim = *input_1;
  if (*input_1 < 0) {
    *input_1 = *input_1 + x_dim_nums;
  }

  std::map<int32_t, bool> index_dict;
  if (x_dim_nums == 0) {
    for (int64_t i = 0; i < index_num; i++) {
      if (input_2[i] < -1 || input_2[i] > 0) {
        KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
        return KERNEL_STATUS_PARAM_INVALID;
      } else {
        index_dict.insert(std::pair<int32_t, bool>(0, true));
      }
    }
  } else if (cur_dim < -x_dim_nums || cur_dim >= x_dim_nums) {
    KERNEL_LOG_ERROR(
      "Dimension out of range (expected to be in range of "
      "[%d, %d], but got %d).",
      0 - x_dim_nums, x_dim_nums - 1, cur_dim);
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    for (int64_t i = 0; i < index_num; i++) {
      if (input_2[i] < -x_dims[*input_1] || input_2[i] >= x_dims[*input_1]) {
        KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
        return KERNEL_STATUS_PARAM_INVALID;
      } else {
        // normalize negative indices, then record them for the fill pass
        input_2[i] = (input_2[i] < 0) ? (input_2[i] + x_dims[*input_1]) : input_2[i];
        index_dict.insert(std::pair<int32_t, bool>(input_2[i], true));
      }
    }
  }

  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("The number of available CPU cores must be greater than 0!");
      return KERNEL_STATUS_PARAM_INVALID;  // avoid dividing by zero below
    }

    auto sharder_index_fill = [&](int64_t start, int64_t end) { SpecialCompute<T>(start, end, input_1, index_dict); };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_index_fill),
                        "IndexFill Compute failed.");
  } else {
    SpecialCompute<T>(0, data_num, input_1, index_dict);
  }
  return KERNEL_STATUS_OK;
}

uint32_t IndexFillCpuKernel::Compute(CpuKernelContext &ctx) {
  uint32_t res = GetInputAndCheck(ctx);
  if (res != KERNEL_STATUS_OK) {
    return res;
  }

  DataType input_type{ctx.Input(0)->GetDataType()};
  switch (input_type) {
    INDEXFILL_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    INDEXFILL_COMPUTE_CASE(DT_FLOAT, float, ctx)
    INDEXFILL_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIndexFill, IndexFillCpuKernel);
}  // namespace aicpu
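`SpecialCompute` above relies on row-major flat-index arithmetic: for flat index i, the coordinate along `dim` is `(i / remain_dims) % x_dims[dim]`, with `remain_dims` the product of the dimensions after `dim` (or the size of `dim` itself in the last-dimension fast path). A standalone sketch with a hypothetical [2, 3, 4] shape, not part of the commit:

// Standalone illustration of the flat-index arithmetic in SpecialCompute:
// in a row-major tensor, the coordinate of flat index i along `dim` is
// (i / remain) % dims[dim], where remain multiplies the trailing dims.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> dims = {2, 3, 4};  // hypothetical shape [2, 3, 4]
  const int dim = 1;
  int64_t remain = 1;
  for (std::size_t k = dim + 1; k < dims.size(); ++k) {
    remain *= dims[k];
  }
  for (int64_t i = 0; i < 2 * 3 * 4; i += 4) {
    int64_t coord = (i / remain) % dims[dim];
    std::printf("flat index %2lld -> coordinate %lld along dim 1\n",
                static_cast<long long>(i), static_cast<long long>(coord));
  }
  return 0;
}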
@@ -0,0 +1,40 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#define AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_

#include <map>
#include <vector>

#include "cpu_ops_kernel.h"

namespace aicpu {
class IndexFillCpuKernel : public CpuKernel {
 public:
  ~IndexFillCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);
  uint32_t GetInputAndCheck(CpuKernelContext &ctx);
  template <typename T>
  void SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, std::map<int32_t, bool> &index_dict);

  std::vector<Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
@@ -0,0 +1,185 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kldiv.h"

#include <algorithm>
#include <string>
#include <vector>

#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const std::uint32_t kKLDivInputNum{2};
const std::uint32_t kKLDivOutputNum{1};
const std::int64_t ParallelNum{4096};
const char *kKLDiv{"KLDiv"};
}  // namespace

namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeKLDivKernel(const CpuKernelContext &ctx) {
  const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
  auto input = static_cast<T *>(ctx.Input(0)->GetData());
  auto target = static_cast<T *>(ctx.Input(1)->GetData());
  auto output = static_cast<T *>(ctx.Output(0)->GetData());
  std::int64_t total = ctx.Input(0)->NumElements();
  std::size_t data_size = ctx.Input(0)->GetDataSize();
  uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
  std::string reduction = ctx.GetAttr("reduction")->GetString();
  if (reduction != "sum" && reduction != "batchmean" && reduction != "none" && reduction != "mean") {
    KERNEL_LOG_ERROR("[%s] is not a valid value for reduction.", reduction.c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  bool parallel_flag = false;
  if (data_size > ParallelNum * sizeof(T)) {
    parallel_flag = true;
  }
  if (cores == 0) {
    return KERNEL_STATUS_INNER_ERROR;
  }
  // For "none" the elementwise result is written straight to the output;
  // otherwise it goes to a scratch buffer that is reduced afterwards.
  T *tmp_array = nullptr;
  if (reduction == "none") {
    tmp_array = output;
  } else {
    tmp_array = new T[total];
  }
  if (parallel_flag) {
    std::int64_t per_unit_size{total / std::min(std::max(std::int64_t{1}, static_cast<std::int64_t>(cores) - 2), total)};
    ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
      std::int64_t length = end - begin;
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array + begin, length, 1);
      T constant_zero{0};
      array_reduce = array_target * (Eigen::log(array_target) - array_input);
      // by convention, entries with a non-positive target contribute zero
      for (std::int64_t idx = 0; idx < length; ++idx) {
        if (!(target[begin + idx] > constant_zero)) {
          array_reduce(idx) = constant_zero;
        }
      }
    });
  } else {
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array, total, 1);
    array_reduce = array_target * (Eigen::log(array_target) - array_input);
    T constant_zero{0};
    for (std::int64_t idx = 0; idx < total; ++idx) {
      if (!(target[idx] > constant_zero)) {
        array_reduce(idx) = constant_zero;
      }
    }
  }
  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > reduce(tmp_array, total, 1);
  if (reduction == "sum") {
    output[0] = reduce.sum();
  } else if (reduction == "batchmean") {
    std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
    output[0] = reduce.sum() / T(input_dims[0]);
  } else if (reduction == "mean") {
    output[0] = reduce.mean();
  }
  if (reduction != "none") {
    delete[] tmp_array;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
inline std::uint32_t ComputeKLDiv(const CpuKernelContext &ctx) {
  uint32_t result = ComputeKLDivKernel<T>(ctx);
  if (result != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("KLDiv compute failed.");
  }
  return result;
}

inline std::uint32_t KLDivExtraCheck(const CpuKernelContext &ctx) {
  if (ctx.Input(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get input x data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(1)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get input target data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Output(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get output y data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
    KERNEL_LOG_ERROR("The data type of the input [%s] must be the same as the output [%s].",
                     DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataSize() != ctx.Input(1)->GetDataSize()) {
    KERNEL_LOG_ERROR(
      "The data size of the input [%llu] must be the same as the target "
      "[%llu].",
      ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> target_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  if (input_dims.size() != target_dims.size()) {
    KERNEL_LOG_ERROR(
      "The dim size of the input x [%zu] must be the same as the target "
      "[%zu].",
      input_dims.size(), target_dims.size());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  for (size_t index = 0; index < input_dims.size(); index++) {
    if (input_dims[index] != target_dims[index]) {
      KERNEL_LOG_ERROR("The dims of the input x must be the same as the target.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  return KERNEL_STATUS_OK;
}

std::uint32_t KLDivCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num, {"reduction"}) ? KERNEL_STATUS_PARAM_INVALID : KLDivExtraCheck(ctx);
}

// DT_FLOAT16, DT_FLOAT, DT_DOUBLE
std::uint32_t KLDivCompute(const CpuKernelContext &ctx) {
  DataType input_type{ctx.Input(0)->GetDataType()};
  switch (input_type) {
    case DT_FLOAT16:
      return ComputeKLDiv<Eigen::half>(ctx);
    case DT_FLOAT:
      return ComputeKLDiv<float>(ctx);
    case DT_DOUBLE:
      return ComputeKLDiv<double>(ctx);
    default:
      KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}
}  // namespace detail

std::uint32_t KLDivCpuKernel::Compute(CpuKernelContext &ctx) {
  return detail::KLDivCheck(ctx, kKLDivInputNum, kKLDivOutputNum) ? KERNEL_STATUS_PARAM_INVALID
                                                                  : detail::KLDivCompute(ctx);
}

REGISTER_CPU_KERNEL(kKLDiv, KLDivCpuKernel);
}  // namespace aicpu
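Read as math, the kernel above evaluates the standard KL-divergence loss under the convention that `input` already holds log-probabilities, zeroing entries whose target is not strictly positive. A hedged summary inferred from the code, with x the input, t the target, N the total element count, and d_0 the first input dimension:

\ell_i =
  \begin{cases}
    t_i \,(\log t_i - x_i), & t_i > 0 \\
    0, & \text{otherwise}
  \end{cases}
\qquad
\text{sum: } \sum_i \ell_i, \quad
\text{mean: } \tfrac{1}{N} \sum_i \ell_i, \quad
\text{batchmean: } \tfrac{1}{d_0} \sum_i \ell_i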
@@ -0,0 +1,27 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_KLDIV_H_
#define AICPU_KERNELS_NORMALIZED_KLDIV_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class KLDivCpuKernel final : public CpuKernel {
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_KLDIV_H_
@@ -0,0 +1,226 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kldivlossgrad.h"

#include <algorithm>
#include <string>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kKlDivLossGrad = "KlDivLossGrad";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const uint32_t kGradIndex = 0;
const uint32_t kInputIndex = 1;
const uint32_t kTargetIndex = 2;
const std::string AttrReduction = "reduction";
const std::string AttrLog = "log_target";
const int64_t DataDefaultParallelNum = 16384;
}  // namespace

namespace aicpu {
template <typename T>
void KlDivLossGradOp(Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &target,
                     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &grad,
                     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &output, std::int64_t &len, bool &log_target,
                     std::string &reduction) {
  T constant_zero{0};
  if (log_target) {
    if (reduction == "none") {
      output = -Eigen::exp(target) * grad;
    } else {
      // For reduced modes grad holds a single value, so scale by grad(0)
      // instead of relying on elementwise array sizes matching.
      output = -Eigen::exp(target) * grad(0);
    }
    return;
  }
  if (reduction == "none") {
    for (int64_t idx = 0; idx < len; ++idx) {
      if (target(idx) > constant_zero) {
        output(idx) = -target(idx) * grad(idx);
      }
    }
  } else {
    for (int64_t idx = 0; idx < len; ++idx) {
      if (target(idx) > constant_zero) {
        output(idx) = -target(idx) * grad(0);
      }
    }
  }
}

std::uint32_t KlDivLossGradExtraCheck(CpuKernelContext &ctx) {
  Tensor *grad = ctx.Input(0);
  Tensor *input = ctx.Input(1);
  Tensor *target = ctx.Input(2);
  Tensor *output = ctx.Output(0);
  if (grad->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] grad is an empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (input->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] input is an empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (target->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] target is an empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (output->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] output is an empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if ((input->GetDataType() != grad->GetDataType()) || (target->GetDataType() != grad->GetDataType()) ||
      (output->GetDataType() != grad->GetDataType())) {
    KERNEL_LOG_ERROR(
      "The data type of the grad [%s], input [%s], target [%s], and output y "
      "[%s] must be the same type.",
      DTypeStr(grad->GetDataType()).c_str(), DTypeStr(input->GetDataType()).c_str(),
      DTypeStr(target->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  std::vector<int64_t> grad_dims = ctx.Input(kGradIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> input_dims = ctx.Input(kInputIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> target_dims = ctx.Input(kTargetIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
  std::string reduction = ctx.GetAttr(AttrReduction)->GetString();
  if (output_dims != input_dims) {
    KERNEL_LOG_ERROR(
      "The data shape of the output must be the same as the input. Output "
      "shape [%s], input shape [%s].",
      VectorToString(output_dims).c_str(), VectorToString(input_dims).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (target_dims != input_dims) {
    KERNEL_LOG_ERROR(
      "The data shape of the target must be the same as the input. Target "
      "shape [%s], input shape [%s].",
      VectorToString(target_dims).c_str(), VectorToString(input_dims).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (reduction == "mean" || reduction == "sum" || reduction == "batchmean") {
    if (ctx.Input(0)->NumElements() != 1) {
      KERNEL_LOG_ERROR("The data num of the grad [%lld] must be 1.", ctx.Input(0)->NumElements());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else if (reduction == "none") {
    if (input_dims != grad_dims) {
      KERNEL_LOG_ERROR(
        "The data shape of the grad must be the same as the input. Grad "
        "shape [%s], input shape [%s].",
        VectorToString(grad_dims).c_str(), VectorToString(input_dims).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  return KERNEL_STATUS_OK;
}

uint32_t KlDivLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
  if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (KlDivLossGradExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  // choose compute function depending on dataType
  auto data_type = ctx.Input(kFirstInputIndex)->GetDataType();
  switch (data_type) {
    case DT_FLOAT16:
      return KlDivLossGradCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return KlDivLossGradCompute<float>(ctx);
    case DT_DOUBLE:
      return KlDivLossGradCompute<double>(ctx);
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

template <typename T>
uint32_t KlDivLossGradCpuKernel::KlDivLossGradCompute(CpuKernelContext &ctx) {
  int64_t grad_total = ctx.Input(0)->NumElements();
  int64_t input_total = ctx.Input(1)->NumElements();
  int64_t target_total = ctx.Input(2)->NumElements();
  int64_t output_y_total = ctx.Output(0)->NumElements();
  int64_t total = input_total;
  uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
  T *grad = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *input = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  T *target = reinterpret_cast<T *>(ctx.Input(2)->GetData());
  T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  bool parallel_flag = false;
  uint64_t data_size = ctx.Input(1)->GetDataSize();
  // Determine whether to enable multi-core parallel computing
  if (data_size > DataDefaultParallelNum * sizeof(T)) {
    parallel_flag = true;
  }
  // attributes: log_target defaults to false, reduction defaults to "mean"
  bool log_target{false};
  if (ctx.GetAttr(AttrLog) != nullptr) {
    log_target = ctx.GetAttr(AttrLog)->GetBool();
  }
  std::string reduction{"mean"};
  if (ctx.GetAttr(AttrReduction) != nullptr) {
    reduction = ctx.GetAttr(AttrReduction)->GetString();
  }
  if (cores == 0) {
    KERNEL_LOG_ERROR("KlDivLossGrad compute failed: no CPU core is available.");
    return KERNEL_STATUS_INNER_ERROR;
  }
  if (parallel_flag) {
    const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
    std::int64_t per_unit_size{total / std::min(std::max(std::int64_t{1}, static_cast<std::int64_t>(cores) - 2), total)};
    auto shard_kldivlossgrad = [&](std::int64_t begin, std::int64_t end) {
      std::int64_t length = end - begin;
      std::int64_t grad_begin{0}, grad_length{grad_total};
      if (reduction == "none") {
        grad_begin = begin;
        grad_length = length;
      }
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad + grad_begin, grad_length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output + begin, length, 1);
      T constant_zero{0};
      array_output = constant_zero;
      KlDivLossGradOp<T>(array_target, array_grad, array_output, length, log_target, reduction);
      if (reduction == "mean") {
        array_output = array_output / T(output_y_total);
      } else if (reduction == "batchmean") {
        std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
        array_output = array_output / T(input_dims[0]);
      }
    };
    KERNEL_HANDLE_ERROR(ParallelFor(ctx, total, per_unit_size, shard_kldivlossgrad), "KlDivLossGrad Compute failed.");
  } else {
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad, grad_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, input_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, target_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output, output_y_total, 1);
    T constant_zero{0};
    array_output = constant_zero;
    KlDivLossGradOp<T>(array_target, array_grad, array_output, output_y_total, log_target, reduction);
    if (reduction == "mean") {
      array_output = array_output / T(output_y_total);
    } else if (reduction == "batchmean") {
      std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
      array_output = array_output / T(input_dims[0]);
    }
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kKlDivLossGrad, KlDivLossGradCpuKernel);
}  // namespace aicpu
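For reference, the backward rule implemented above follows from differentiating the forward term t_i (log t_i - x_i) with respect to the input x_i; a hedged summary inferred from the code:

\frac{\partial \ell_i}{\partial x_i} =
  \begin{cases}
    -t_i, & t_i > 0 \\
    0, & \text{otherwise}
  \end{cases}
\qquad
\text{(or } -e^{t_i} \text{ for every } i \text{ when log\_target is set)}

The incoming grad multiplies this elementwise (grad is a single scalar for the reduced modes), and mean/batchmean then divide the result by the element count N or the batch size d_0, respectively.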
@@ -0,0 +1,42 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"

namespace aicpu {
class KlDivLossGradCpuKernel : public CpuKernel {
 public:
  KlDivLossGradCpuKernel() = default;
  ~KlDivLossGradCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  /**
   * @brief compute for all types
   * @param ctx cpu kernel context
   * @return status code, KERNEL_STATUS_OK on success
   */
  template <typename T>
  uint32_t KlDivLossGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
@ -0,0 +1,173 @@
|
||||||
|
/**
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "lcm.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
#include "cpu_kernel_utils.h"
|
||||||
|
#include "utils/eigen_tensor.h"
|
||||||
|
#include "utils/kernel_util.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const uint32_t kLcmOutputNum = 1;
|
||||||
|
const uint32_t kLcmInputNum = 2;
|
||||||
|
const char *kLcm = "Lcm";
|
||||||
|
// when input data size is more than kParallelDataNum, use Parallel func
|
||||||
|
const int64_t kParallelDataNum = 2 * 1024;
|
||||||
|
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||||
|
const int32_t kInput_32_32 = 3;
|
||||||
|
const int32_t kInput_32_64 = 2;
|
||||||
|
const int32_t kInput_64_32 = 1;
|
||||||
|
const int32_t kInput_64_64 = 0;
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace aicpu {
|
||||||
|
// Simple recursive gcd.
|
||||||
|
template <class T>
|
||||||
|
T elewise_gcd(T a, T b) {
|
||||||
|
if (b == 0) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
return elewise_gcd(b, a % b);
|
||||||
|
}
|
||||||
|
// Simple lcm.
|
||||||
|
template <typename T>
|
||||||
|
T elewise_lcm(T a, T b) {
|
||||||
|
T gcd_tmp = elewise_gcd<T>(a, b);
|
||||||
|
if (gcd_tmp == 0) {
|
||||||
|
return static_cast<T>(0);
|
||||||
|
}
|
||||||
|
return std::abs(a / gcd_tmp * b);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t LcmIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
|
||||||
|
Tensor *x1 = ctx.Input(kFirstInputIndex);
|
||||||
|
Tensor *x2 = ctx.Input(kSecondInputIndex);
|
||||||
|
Tensor *y = ctx.Output(kFirstOutputIndex);
|
||||||
|
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
|
||||||
|
auto x1_type = x1->GetDataType();
|
||||||
|
auto x2_type = x2->GetDataType();
|
||||||
|
auto y_type = y->GetDataType();
|
||||||
|
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||||
|
"[Lcm] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
|
||||||
|
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||||
|
"[Lcm] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
|
||||||
|
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
|
||||||
|
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
|
||||||
|
int32_t _dual_types = x1_is_i32 | x2_is_i32;
|
||||||
|
switch (_dual_types) {
|
||||||
|
case kInput_64_64:
|
||||||
|
case kInput_64_32:
|
||||||
|
case kInput_32_64:
|
||||||
|
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
|
||||||
|
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||||
|
dual_types = _dual_types;
|
||||||
|
break;
|
||||||
|
case kInput_32_32:
|
||||||
|
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
|
||||||
|
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||||
|
dual_types = _dual_types;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
|
||||||
|
return KERNEL_STATUS_PARAM_INVALID;
|
||||||
|
}
|
||||||
|
return KERNEL_STATUS_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T1, class T2, class T3>
|
||||||
|
uint32_t LcmElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
|
||||||
|
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
|
||||||
|
auto lcm_shard = [&](int64_t start, int64_t end) {
|
||||||
|
for (int64_t i = start; i < end; ++i) {
|
||||||
|
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
|
||||||
|
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
|
||||||
|
y_ptr[i] = elewise_lcm(x1_ele_abs, x2_ele_abs);
|
||||||
|
}
|
||||||
|
};
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("[Lcm] max_core_num is 0, please check the cpu num.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, lcm_shard);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("[Lcm] Lcm Compute failed.");
      return ret;
    }
  } else {
    lcm_shard(0, data_num);
  }

  return KERNEL_STATUS_OK;
}

template <class T1, class T2, class T3>
uint32_t LcmCompute(CpuKernelContext &ctx) {
  Tensor *x1 = ctx.Input(kFirstInputIndex);
  Tensor *x2 = ctx.Input(kSecondInputIndex);
  Tensor *y = ctx.Output(kFirstOutputIndex);
  const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
  const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
  T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
  auto x1_shape = x1->GetTensorShape()->GetDimSizes();
  auto x2_shape = x2->GetTensorShape()->GetDimSizes();
  Bcast bcast(x1_shape, x2_shape);
  if (bcast.IsValid()) {
    return LcmElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
  } else {
    KERNEL_LOG_ERROR("[Lcm] broadcast failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t LcmCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLcmInputNum, kLcmOutputNum), "[Lcm] check input and output number failed.");
  int32_t dual_types = static_cast<int32_t>(-1);
  KERNEL_HANDLE_ERROR(LcmIOTypeCheck(ctx, dual_types), "[Lcm] check data type failed.");
  switch (dual_types) {
    case kInput_64_64:
      return LcmCompute<int64_t, int64_t, int64_t>(ctx);
    case kInput_64_32:
      return LcmCompute<int64_t, int32_t, int64_t>(ctx);
    case kInput_32_64:
      return LcmCompute<int32_t, int64_t, int64_t>(ctx);
    case kInput_32_32:
      return LcmCompute<int32_t, int32_t, int32_t>(ctx);
    default:
      KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLcm, LcmCpuKernel);
} // namespace aicpu
@ -0,0 +1,32 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LCM_H_
#define AICPU_KERNELS_NORMALIZED_LCM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class LcmCpuKernel : public CpuKernel {
 public:
  ~LcmCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif
@ -0,0 +1,126 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "logit.h"

#include <cmath>

#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogit = "Logit";

#define LOGIT_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                       \
    uint32_t result = LogitCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                   \
      KERNEL_LOG_ERROR("Logit kernel compute failed."); \
      return result;                                    \
    }                                                   \
    break;                                              \
  }
} // namespace

namespace aicpu {
uint32_t LogitCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogit);
  DataType data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LOGIT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    LOGIT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    LOGIT_COMPUTE_CASE(DT_FLOAT, float, ctx)
    default:
      KERNEL_LOG_ERROR("Logit kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogitCpuKernel::LogitCompute(CpuKernelContext &ctx) {
  auto input_tensor = ctx.Input(0);
  auto output_tensor = ctx.Output(0);
  auto input = reinterpret_cast<T *>(input_tensor->GetData());
  auto output = reinterpret_cast<T *>(output_tensor->GetData());
  AttrValue *attr = ctx.GetAttr("eps");
  float eps = -1.0;
  if (attr != nullptr) {
    eps = attr->GetFloat();
  }
  auto input_shape = input_tensor->GetTensorShape();
  int64_t data_num = input_shape->NumElements();
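  // logit(x) = ln(x / (1 - x)). With a non-negative eps attribute, x is first clamped to
  // [eps, 1 - eps]; with the default eps = -1, out-of-domain inputs propagate nan/inf.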
  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shared_less = [&](size_t start, size_t end) {
      T one = T(1);
      T up_bound = static_cast<T>(1) - static_cast<T>(eps);
      if (eps < 0) {
        for (size_t i = start; i < end; i++) {
          T x = input[i];
          output[i] = log(x / (one - x));
        }
      } else {
        for (size_t i = start; i < end; i++) {
          T z;
          T x = input[i];
          z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
          output[i] = log(z / (one - z));
        }
      }
    };
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max core num is 0");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
                        "Logit Compute failed.");
  } else {
    T one = T(1);
    T up_bound = static_cast<T>(1) - static_cast<T>(eps);
    if (eps < 0) {
      for (int64_t i = 0; i < data_num; i++) {
        T x = input[i];
        output[i] = log(x / (one - x));
      }
    } else {
      for (int64_t i = 0; i < data_num; i++) {
        T z;
        T x = input[i];
        z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
        output[i] = log(z / (one - z));
      }
    }
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogit, LogitCpuKernel);
} // namespace aicpu
@ -0,0 +1,36 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_H

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class LogitCpuKernel : public CpuKernel {
 public:
  LogitCpuKernel() = default;
  ~LogitCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t LogitCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -0,0 +1,133 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "logit_grad.h"

#include <cmath>
#include <limits>

#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogitGrad = "LogitGrad";

#define LOGITGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                           \
    uint32_t result = LogitGradCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("LogitGrad kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
uint32_t LogitGradCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogitGrad);
  DataType data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LOGITGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    LOGITGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
    LOGITGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("LogitGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogitGradCpuKernel::LogitGradCompute(CpuKernelContext &ctx) {
  auto input_y_grad_tensor = ctx.Input(0);
  auto input_x_tensor = ctx.Input(1);
  auto output_x_grad_tensor = ctx.Output(0);
  auto input_y_grad = reinterpret_cast<T *>(input_y_grad_tensor->GetData());
  auto input_x = reinterpret_cast<T *>(input_x_tensor->GetData());
  auto output_x_grad = reinterpret_cast<T *>(output_x_grad_tensor->GetData());
  auto input_shape = input_x_tensor->GetTensorShape();
  int64_t data_num = input_shape->NumElements();
  float eps = -1.0;
  AttrValue *attr = ctx.GetAttr("eps");
  if (attr != nullptr) {
    eps = attr->GetFloat();
  }
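  // d(logit(x))/dx = 1 / (x * (1 - x)), so dx = dy / (x * (1 - x)). Inside the clamped band
  // the forward pass is constant, hence the gradient is 0; with eps < 0, x outside [0, 1]
  // yields NaN to flag the invalid input.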
  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shared_less = [&](size_t start, size_t end) {
      T one = T(1);
      T zero = T(0);
      T up_bound = static_cast<T>(1) - static_cast<T>(eps);
      if (eps < 0) {
        for (size_t i = start; i < end; i++) {
          T y_grad = input_y_grad[i];
          T x = input_x[i];
          output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
        }
      } else {
        for (size_t i = start; i < end; i++) {
          T y_grad = input_y_grad[i];
          T x = input_x[i];
          output_x_grad[i] =
            static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
              ? zero
              : (y_grad / (x * (one - x)));
        }
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
                        "LogitGrad Compute failed.");
  } else {
    T one = T(1);
    T zero = T(0);
    T up_bound = static_cast<T>(1) - static_cast<T>(eps);
    if (eps < 0) {
      for (int64_t i = 0; i < data_num; i++) {
        T y_grad = input_y_grad[i];
        T x = input_x[i];
        output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
      }
    } else {
      for (int64_t i = 0; i < data_num; i++) {
        T y_grad = input_y_grad[i];
        T x = input_x[i];
        output_x_grad[i] =
          static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
            ? zero
            : (y_grad / (x * (one - x)));
      }
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLogitGrad, LogitGradCpuKernel);
} // namespace aicpu
@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class LogitGradCpuKernel : public CpuKernel {
 public:
  LogitGradCpuKernel() = default;
  ~LogitGradCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t LogitGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -0,0 +1,153 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lower_bound.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kLowerBound = "LowerBound";

#define LOWERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX)    \
  case (DTYPE): {                                            \
    uint32_t result = LowerBoundCompute<TYPE1, TYPE2>(CTX);  \
    if (result != KERNEL_STATUS_OK) {                        \
      KERNEL_LOG_ERROR("LowerBound kernel compute failed."); \
      return result;                                         \
    }                                                        \
    break;                                                   \
  }

#define LOWERBOUND_COMPUTE_CASE_ALL(TYPE, CTX)              \
  LOWERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX)       \
  LOWERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX)     \
  LOWERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX)     \
  LOWERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX)     \
  LOWERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX)     \
  LOWERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX)   \
  LOWERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
  LOWERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX)       \
  LOWERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace

namespace aicpu {
uint32_t LowerBoundCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LowerBound check input and output number failed.");
  Tensor *sorted_x_data = ctx.Input(0);
  Tensor *values_data = ctx.Input(1);
  Tensor *output_data = ctx.Output(0);
  auto output_type = output_data->GetDataType();
  auto sorted_x_type = sorted_x_data->GetDataType();
  auto values_type = values_data->GetDataType();
  if (sorted_x_type != values_type) {
    KERNEL_LOG_ERROR("Input[0] data type[%s] must be the same as Input[1] data type[%s]",
                     DTypeStr(sorted_x_type).c_str(), DTypeStr(values_type).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  switch (output_type) {
    case DT_INT32:
      switch (sorted_x_type) {
        LOWERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
        default:
          KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
      break;
    case DT_INT64:
      switch (sorted_x_type) {
        LOWERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
        default:
          KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
      break;
    default:
      KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T1, typename T2>
uint32_t LowerBoundCpuKernel::LowerBoundCompute(CpuKernelContext &ctx) {
  Tensor *sorted_x_data = ctx.Input(0);
  auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
  auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
  std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
  Tensor *values_data = ctx.Input(1);
  auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
  auto values_data_shape = values_data->GetTensorShape();
  int64_t values_data_num = values_data_shape->NumElements();
  std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
  Tensor *output_data = ctx.Output(0);
  auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
  if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
    KERNEL_LOG_ERROR("The number of rows of Input[0]:([%lld]) should be consistent with that of Input[1]:([%lld]).",
                     sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
    return KERNEL_STATUS_PARAM_INVALID;
  }
  int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
  int64_t values_data_column = values_data_shape_dims[1];
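  // For each query value, binary-search its row of sorted_x for the first position whose
  // element is >= the query (the classic lower_bound) and emit the offset within that row.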
  if (values_data_num < 1024) {
    for (int64_t i = 0; i < values_data_num; i++) {
      int64_t seq_row = i / values_data_column;
      int64_t low = seq_row * sorted_x_data_column;
      int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
      int64_t mid;
      while (low <= up) {
        mid = (low + up) / 2;
        if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
          up = mid - 1;
        } else {
          low = mid + 1;
        }
      }
      output_data_addr[i] = low - seq_row * sorted_x_data_column;
    }
  } else {
    uint32_t min_core_num = 1;
    int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
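    // GetCPUNum(ctx) - 2 leaves two cores free for other work; the shard count is then
    // clamped to the number of queries below.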
    if (sum_core_num > values_data_num) {
      sum_core_num = values_data_num;
    }
    auto shard_compute = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        int64_t seq_row = i / values_data_column;
        int64_t low = seq_row * sorted_x_data_column;
        int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
        int64_t mid;
        while (low <= up) {
          mid = (low + up) / 2;
          if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
            up = mid - 1;
          } else {
            low = mid + 1;
          }
        }
        output_data_addr[i] = low - seq_row * sorted_x_data_column;
      }
    };
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
      "LowerBound Compute failed.");
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLowerBound, LowerBoundCpuKernel);
} // namespace aicpu
@ -0,0 +1,35 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
#define AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LowerBoundCpuKernel : public CpuKernel {
 public:
  LowerBoundCpuKernel() = default;
  ~LowerBoundCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T1, typename T2>
  static uint32_t LowerBoundCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -0,0 +1,115 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lstsq.h"

#include <Eigen/Cholesky>
#include <Eigen/Dense>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLstsq = "Lstsq";
} // namespace

namespace aicpu {
uint32_t LstsqCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lstsq check input and output number failed.");
  Tensor *input_x0 = ctx.Input(0);
  Tensor *input_x1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  auto dims_0 = input_x0->GetTensorShape()->GetDims();
  auto dims_1 = input_x1->GetTensorShape()->GetDims();
  KERNEL_CHECK_FALSE((dims_0 == 2), KERNEL_STATUS_PARAM_INVALID, "Dimension of input[0] must be 2, but got[%d].",
                     dims_0);
  KERNEL_CHECK_FALSE(((dims_1 == 2) || (dims_1 == 1)), KERNEL_STATUS_PARAM_INVALID,
                     "Dimension of input[1] must be 2 or 1, but got[%d].", dims_1);
  auto shape_0 = input_x0->GetTensorShape();
  auto shape_1 = input_x1->GetTensorShape();
  KERNEL_CHECK_FALSE((shape_0->GetDimSize(0) == shape_1->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
                     "Lstsq shape_0[0]([%lld]) and shape_1[0]([%lld]) must be equal.", shape_0->GetDimSize(0),
                     shape_1->GetDimSize(0));
  AttrValue *l2_regularizer = ctx.GetAttr("l2_regularizer");
  AttrValue *fast = ctx.GetAttr("fast");
  KERNEL_CHECK_NULLPTR(l2_regularizer, KERNEL_STATUS_PARAM_INVALID, "Get l2_regularizer failed.");
  KERNEL_CHECK_NULLPTR(fast, KERNEL_STATUS_PARAM_INVALID, "Get fast failed.");
  KERNEL_LOG_DEBUG(
    "LstsqCpuKernel[%s], inputx0: size[%llu];"
    "inputx1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
  DataType data_type1 = ctx.Input(0)->GetDataType();
  DataType data_type2 = ctx.Input(1)->GetDataType();
  KERNEL_CHECK_FALSE((data_type1 == data_type2), KERNEL_STATUS_PARAM_INVALID,
                     "Lstsq data type of input_0 [%s] must be equal to that of input_1 [%s].",
                     DTypeStr(data_type1).c_str(), DTypeStr(data_type2).c_str());
  switch (data_type1) {
    case DT_FLOAT16:
      return LstsqCompute<float, Eigen::half>(ctx);
    case DT_FLOAT:
      return LstsqCompute<float, float>(ctx);
    case DT_DOUBLE:
      return LstsqCompute<double, double>(ctx);
    default:
      KERNEL_LOG_ERROR("Lstsq kernel data type [%s] not support.", DTypeStr(data_type1).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T1, typename T2>
uint32_t LstsqCpuKernel::LstsqCompute(CpuKernelContext &ctx) {
  Eigen::Index m = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  Eigen::Index n = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  Eigen::Index k = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() == 2) {
    k = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
  }

  typedef Eigen::Matrix<T1, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  MatrixXd A(m, n);
  MatrixXd B(m, k);

  auto aptr = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());

  for (int i = 0; i < m * n; i++) {
    *(A.data() + i) = static_cast<T1>(*(aptr + i));
  }
  for (int i = 0; i < m * k; i++) {
    *(B.data() + i) = static_cast<T1>(*(bptr + i));
  }
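  // Overdetermined system (m >= n): least-squares solution via column-pivoted Householder QR.
  // Underdetermined system (m < n): minimum-norm solution x = A^T (A A^T)^{-1} B.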
  MatrixXd result(n, k);
  if (m >= n) {
    result = A.colPivHouseholderQr().solve(B);
  } else {
    MatrixXd A_Transpose = A.transpose();
    MatrixXd temp = A * A_Transpose;
    MatrixXd tempI = temp.inverse();
    MatrixXd x = A_Transpose * tempI;
    MatrixXd output = x * B;
    result = output;
  }
  auto output_addr = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
  for (int i = 0; i < n * k; i++) {
    *(output_addr + i) = static_cast<T2>(*(result.data() + i));
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLstsq, LstsqCpuKernel);
} // namespace aicpu
@ -0,0 +1,37 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LSTSQ_H_
#define AICPU_KERNELS_NORMALIZED_LSTSQ_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LstsqCpuKernel : public CpuKernel {
 public:
  LstsqCpuKernel() = default;
  ~LstsqCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T1, typename T2>
  static uint32_t LstsqCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -0,0 +1,185 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lu_solve.h"

#include <Eigen/Dense>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const int64_t kParallelBatchNum1 = 50;
const int64_t kParallelBatchNum4 = 200;
const int64_t kParallelBatchNum8 = 500;
const int64_t kParallelBatchNumx = 1000;
const char *kLuSolve = "LuSolve";
} // namespace

namespace aicpu {
uint32_t LuSolveCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check LuSolve params failed.");
  Tensor *input_0 = ctx.Input(0);
  KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input0 data failed.");
  Tensor *input_1 = ctx.Input(1);
  KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input1 data failed.");
  Tensor *input_2 = ctx.Input(2);
  KERNEL_CHECK_NULLPTR(input_2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input2 data failed.");
  Tensor *output = ctx.Output(0);
  auto input_0_Shape = input_0->GetTensorShape();
  KERNEL_CHECK_NULLPTR(input_0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_0_Shape failed.")
  auto input_1_Shape = input_1->GetTensorShape();
  KERNEL_CHECK_NULLPTR(input_1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_1_Shape failed.")
  auto input_2_Shape = input_2->GetTensorShape();
  KERNEL_CHECK_NULLPTR(input_2_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_2_Shape failed.")
  int32_t b_dims = input_0_Shape->GetDims();
  int32_t lu_dims = input_1_Shape->GetDims();
  int32_t pivots_dims = input_2_Shape->GetDims();
  std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
  std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
  std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
  if (b_dims == lu_dims) {
    for (int32_t i = 0; i <= b_dims - 2; i++) {
      if (b_dims_vector[i] != lu_dims_vector[i]) {
        KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
        return KERNEL_STATUS_PARAM_INVALID;
      }
    }
  } else if (lu_dims > b_dims) {
    for (int32_t i = 0; i < b_dims - 2; i++) {
      if (b_dims_vector[i] != lu_dims_vector[lu_dims - b_dims + i]) {
        KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
        return KERNEL_STATUS_PARAM_INVALID;
      }
    }
  } else {
    for (int32_t i = 0; i < lu_dims - 2; i++) {
      if (lu_dims_vector[i] != b_dims_vector[b_dims - lu_dims + i]) {
        KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
        return KERNEL_STATUS_PARAM_INVALID;
      }
    }
  }
  for (int32_t i = 0; i < pivots_dims; i++) {
    if (lu_dims_vector[i] != pivots_dims_vector[i]) {
      KERNEL_LOG_ERROR("batch dimension of LU_pivots doesn't match batch dimension of LU_data!");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  auto data_type = ctx.Input(0)->GetDataType();
  KERNEL_LOG_DEBUG(
    "LuSolveCpuKernel[%s], input_0: size[%llu], input_1: size[%llu], input_2: size[%llu]"
    "output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
    output->GetDataSize());
  switch (data_type) {
    case DT_FLOAT:
      return LuSolveCompute<float, float>(ctx);
    case DT_FLOAT16:
      return LuSolveCompute<float, Eigen::half>(ctx);
    default:
      KERNEL_LOG_ERROR("LuSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr,
                                   int32_t *pivots_working_ptr, int64_t b_stride, int64_t a) {
  auto output_y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
  auto input_0_Shape = ctx.Input(0)->GetTensorShape();
  auto input_1_Shape = ctx.Input(1)->GetTensorShape();
  int32_t lu_dims = input_1_Shape->GetDims();
  int64_t lu_matrix_sizes = input_1_Shape->GetDimSize(lu_dims - 2);
  int32_t b_dim = input_0_Shape->GetDims();
  int64_t b_m = input_0_Shape->GetDimSize(b_dim - 1);
  typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  MatrixXd matrix_b = Eigen::Map<MatrixXd>(b_working_ptr, lu_matrix_sizes, b_m);
  MatrixXd matrix_A = Eigen::Map<MatrixXd>(lu_working_ptr, lu_matrix_sizes, lu_matrix_sizes);
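  // LU_pivots records 1-based row interchanges: apply them to b first, then solve
  // (L * U) x = P * b, with L unit-lower-triangular and U upper-triangular.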
  for (int64_t i = 0; i < input_0_Shape->GetDimSize(b_dim - 2); i++) {
    matrix_b.row(i).swap(matrix_b.row(*(pivots_working_ptr + i) - 1));
  }
  MatrixXd L = matrix_A.template triangularView<Eigen::UnitLower>();
  MatrixXd U = matrix_A.template triangularView<Eigen::Upper>();
  MatrixXd result = (L * U).lu().solve(matrix_b);
  for (int64_t m = 0; m < b_stride; m++) {
    *(output_y + a * b_stride + m) = static_cast<T2>(*(result.data() + m));
  }
  return KERNEL_STATUS_OK;
}

template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolveCompute(CpuKernelContext &ctx) {
  auto input_x0 = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
  auto input_x1 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
  auto input_x2 = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
  auto input_0_Shape = ctx.Input(0)->GetTensorShape();
  auto input_1_Shape = ctx.Input(1)->GetTensorShape();
  auto input_2_Shape = ctx.Input(2)->GetTensorShape();
  T *input_0 = new T[input_0_Shape->NumElements()];
  T *input_1 = new T[input_1_Shape->NumElements()];
  for (int64_t i = 0; i < input_0_Shape->NumElements(); i++) {
    *(input_0 + i) = static_cast<T>(*(input_x0 + i));
  }
  for (int64_t i = 0; i < input_1_Shape->NumElements(); i++) {
    *(input_1 + i) = static_cast<T>(*(input_x1 + i));
  }
  int32_t b_dims = input_0_Shape->GetDims();
  int32_t lu_dims = input_1_Shape->GetDims();
  std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
  std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
  std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
  int64_t b_stride = input_0_Shape->GetDimSize(b_dims - 1) * input_0_Shape->GetDimSize(b_dims - 2);
  int64_t lu_stride = input_1_Shape->GetDimSize(lu_dims - 1) * input_1_Shape->GetDimSize(lu_dims - 2);
  int64_t pivots_stride = input_1_Shape->GetDimSize(lu_dims - 1);
  std::vector<int64_t> b_shape = b_dims_vector;
  std::vector<int64_t> lu_shape = lu_dims_vector;
  for (size_t i = 0; i < 2; i++) {
    b_shape.pop_back();
    lu_shape.pop_back();
  }
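  // Broadcast only the batch dimensions (everything except the trailing two matrix dims)
  // of b against those of LU_data.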
  Bcast bcast(b_shape, lu_shape);
  int64_t batch_num = ctx.Output(0)->NumElements() / b_stride;
  if (batch_num < kParallelBatchNum1) {
    for (int64_t i = 0; i < batch_num; i++) {
      T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
      T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
      int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
      LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
    }
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (batch_num < kParallelBatchNumx) max_core_num = 8U;
    if (batch_num < kParallelBatchNum8) max_core_num = 4U;
    if (batch_num < kParallelBatchNum4) max_core_num = 2U;
    auto sharder = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
        T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
        int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
        LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
                        "LuSolve Compute failed.");
  }
  delete[] input_0;
  delete[] input_1;
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuSolve, LuSolveCpuKernel);
} // namespace aicpu
@ -0,0 +1,22 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_LUSOLVE_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class LuSolveCpuKernel : public CpuKernel {
 public:
  LuSolveCpuKernel() = default;
  ~LuSolveCpuKernel() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T, typename T2>
  static uint32_t LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, int32_t *pivots_working_ptr,
                          int64_t b_stride, int64_t i);
  template <typename T, typename T2>
  static uint32_t LuSolveCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -0,0 +1,321 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lu_unpack.h"

#include <cstring>

#include <Eigen/Dense>
#include <algorithm>
#include <iostream>

#include "cpu_context.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_tensor.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 2;
const uint32_t kFirstInputIndex = 0;
const uint32_t kSecondInputIndex = 1;
const uint32_t kFirstOutputIndex = 0;
const uint32_t kSecondOutputIndex = 1;
const uint32_t kThirdOutputIndex = 2;
const int32_t kLuDataMinRank = 2;
const int32_t kLuPivotsMinRank = 2;
const int64_t kParallelBatchNum = 70;
const char *kLuUnpack = "LuUnpack";
} // namespace
namespace aicpu {
template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index,
                                     T_data *P_eye) {
  int32_t Lu_data_dims = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDims();
  int64_t Lu_data_dim1 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 2);
  int64_t Lu_data_dim2 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 1);
  int32_t Lu_pivots_dims = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDims();
  int64_t Lu_pivots_dim = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDimSize(Lu_pivots_dims - 1);
  int64_t matrix_width = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 2];
  int64_t matrix_height = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 1];
  int64_t pivots_stride = Lu_data_dim1 * Lu_data_dim1;
  int64_t L_stride = 0;
  int64_t U_stride = 0;
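  // For an m x n LU_data, L is m x min(m, n) and U is min(m, n) x n, so the per-matrix
  // output strides of L and U depend on which of the two trailing dimensions is larger.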
  if (Lu_data_dim1 > Lu_data_dim2) {
    L_stride = Lu_data_dim1 * Lu_data_dim2;
    U_stride = Lu_data_dim2 * Lu_data_dim2;
  } else {
    L_stride = Lu_data_dim1 * Lu_data_dim1;
    U_stride = Lu_data_dim1 * Lu_data_dim2;
  }
  int64_t matrix_size = matrix_width * matrix_height;
  using MatrixMap = Eigen::Map<Eigen::Matrix<T_data, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
  MatrixMap input(reinterpret_cast<T_data *>(ctx.Input(kFirstInputIndex)->GetData()) + matrix_index * matrix_size,
                  matrix_width, matrix_height);
  // Triu
  if (matrix_width > matrix_height) {
    MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
                      matrix_height, matrix_height);
    T_data *MiddlePtr = new T_data[matrix_size];
    MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
    MiddleData = input.template triangularView<Eigen::Upper>();
    output2 = MiddleData.block(0, 0, matrix_height, matrix_height);
    delete[] MiddlePtr;
  } else {
    MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
                      matrix_width, matrix_height);
    output2 = input.template triangularView<Eigen::Upper>();
  }
  // Tril
  if (matrix_height > matrix_width) {
    MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
                      matrix_width, matrix_width);
    T_data *MiddlePtr = new T_data[matrix_size];
    MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
    MiddleData = input.template triangularView<Eigen::UnitLower>();
    output1 = MiddleData.block(0, 0, matrix_width, matrix_width);
    delete[] MiddlePtr;
  } else {
    MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
                      matrix_width, matrix_height);
    output1 = input.template triangularView<Eigen::UnitLower>();
  }
  // Swap
  std::vector<T_pivots> final_order;
  final_order.resize(Lu_data_dim1);
  for (int i = 0; i < Lu_data_dim1; i++) {
    final_order[i] = T_pivots(i);
  }
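  // Replay the recorded row interchanges: pivot values are 1-based row indices, and applying
  // the swaps in order recovers the permutation encoded by LU_pivots.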
  for (T_pivots id = 0; id < Lu_pivots_dim; id++) {
    int64_t perm_id = 0;
    int64_t perm_pivots_id = 0;
    for (int64_t i = 0; i < Lu_data_dim1; i++) {
      if (id == final_order[i]) {
        perm_id = i;
      }
      if (!((*(Lu_pivots_working_ptr + id) <= Lu_data_dim1) && (*(Lu_pivots_working_ptr + id) >= 1))) {
        return KERNEL_STATUS_PARAM_INVALID;
      }
      if ((*(Lu_pivots_working_ptr + id) - 1) == final_order[i]) {
        perm_pivots_id = i;
      }
    }
    std::swap(final_order[perm_id], final_order[perm_pivots_id]);
  }
  // Index_select
  auto output_y0 = reinterpret_cast<T_data *>(ctx.Output(kFirstOutputIndex)->GetData());
  int64_t indices_num = final_order.size();
  int64_t inner_size = Lu_data_dim1;
  int64_t slice_size = inner_size * sizeof(T_data);
  for (int64_t j = 0; j < indices_num; ++j) {
    auto params_idx = final_order[j] * inner_size;
    auto out_idx = j * inner_size;
    memcpy(output_y0 + matrix_index * pivots_stride + out_idx, P_eye + params_idx, slice_size);
  }
  return KERNEL_STATUS_OK;
}

template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpackCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(kFirstInputIndex);
  Tensor *input1_tensor = ctx.Input(kSecondInputIndex);
  auto input_0_Shape = input0_tensor->GetTensorShape();
  auto input_1_Shape = input1_tensor->GetTensorShape();
  int32_t Lu_data_dims = input_0_Shape->GetDims();
  int64_t Lu_data_dim1 = input_0_Shape->GetDimSize(Lu_data_dims - 2);
  int64_t Lu_data_dim2 = input_0_Shape->GetDimSize(Lu_data_dims - 1);
  int32_t Lu_pivots_dims = input_1_Shape->GetDims();
  int64_t Lu_pivots_dim = input_1_Shape->GetDimSize(Lu_pivots_dims - 1);
  auto input_dim_size = input_0_Shape->GetDimSizes();
  auto input_x1 = reinterpret_cast<T_pivots *>(input1_tensor->GetData());

  int32_t block_size = Lu_data_dim1 * Lu_data_dim1;
  T_data *P_eye = new T_data[block_size]{};
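  // Build a Lu_data_dim1 x Lu_data_dim1 identity matrix; rows of it are later gathered
  // through final_order to materialize the permutation matrix P.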
  T_data num = static_cast<T_data>(1);
  for (int32_t i = 0; i < Lu_data_dim1; i++) {
    *(P_eye + (Lu_data_dim1 + 1) * i) = num;
  }
  uint32_t check_status = 0;
  int64_t Lu_data_stride = Lu_data_dim1 * Lu_data_dim2;
  int64_t Lu_pivots_stride = Lu_pivots_dim;
  int64_t batch_num = ctx.Input(0)->NumElements() / Lu_data_stride;
  if (batch_num < kParallelBatchNum || Lu_data_dims == kLuDataMinRank) {
    for (int64_t matrix_index = 0; matrix_index < batch_num; matrix_index++) {
      T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
      check_status = LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye);
      if (check_status == KERNEL_STATUS_PARAM_INVALID) {
        delete[] P_eye;
        return check_status;
      }
    }
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (max_core_num > batch_num) {
      max_core_num = batch_num;
    }
    uint32_t parallel_status = 0;
    auto sharder = [&](int64_t start, int64_t end) {
      for (int64_t matrix_index = start; matrix_index < end; matrix_index++) {
        T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
        if (LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye) == KERNEL_STATUS_OK) {
          parallel_status = KERNEL_STATUS_OK;
        } else {
          parallel_status = KERNEL_STATUS_PARAM_INVALID;
          break;
        }
      }
    };
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      delete[] P_eye;
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
                        "LuUnpack Compute failed.");
    if (parallel_status != KERNEL_STATUS_OK) {
      delete[] P_eye;
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  delete[] P_eye;
  return KERNEL_STATUS_OK;
}
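
// SetMap fills calls_ with one LuUnpackCompute instantiation per (LU_data dtype,
// LU_pivots dtype) pair, so the kernel can dispatch on the runtime type pair directly.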
|
||||||
|
|
||||||
|
void LuUnpackCpuKernel::SetMap() {
  calls_[DT_INT8][DT_INT8] = LuUnpackCompute<int8_t, int8_t>;
  calls_[DT_INT8][DT_UINT8] = LuUnpackCompute<int8_t, uint8_t>;
  calls_[DT_INT8][DT_INT16] = LuUnpackCompute<int8_t, int16_t>;
  calls_[DT_INT8][DT_INT32] = LuUnpackCompute<int8_t, int32_t>;
  calls_[DT_INT8][DT_INT64] = LuUnpackCompute<int8_t, int64_t>;

  calls_[DT_INT16][DT_INT8] = LuUnpackCompute<int16_t, int8_t>;
  calls_[DT_INT16][DT_INT16] = LuUnpackCompute<int16_t, int16_t>;
  calls_[DT_INT16][DT_INT32] = LuUnpackCompute<int16_t, int32_t>;
  calls_[DT_INT16][DT_INT64] = LuUnpackCompute<int16_t, int64_t>;
  calls_[DT_INT16][DT_UINT8] = LuUnpackCompute<int16_t, uint8_t>;

  calls_[DT_INT32][DT_INT8] = LuUnpackCompute<int32_t, int8_t>;
  calls_[DT_INT32][DT_INT16] = LuUnpackCompute<int32_t, int16_t>;
  calls_[DT_INT32][DT_INT32] = LuUnpackCompute<int32_t, int32_t>;
  calls_[DT_INT32][DT_INT64] = LuUnpackCompute<int32_t, int64_t>;
  calls_[DT_INT32][DT_UINT8] = LuUnpackCompute<int32_t, uint8_t>;

  calls_[DT_INT64][DT_INT8] = LuUnpackCompute<int64_t, int8_t>;
  calls_[DT_INT64][DT_INT16] = LuUnpackCompute<int64_t, int16_t>;
  calls_[DT_INT64][DT_INT32] = LuUnpackCompute<int64_t, int32_t>;
  calls_[DT_INT64][DT_INT64] = LuUnpackCompute<int64_t, int64_t>;
  calls_[DT_INT64][DT_UINT8] = LuUnpackCompute<int64_t, uint8_t>;

  calls_[DT_FLOAT16][DT_INT8] = LuUnpackCompute<Eigen::half, int8_t>;
  calls_[DT_FLOAT16][DT_INT16] = LuUnpackCompute<Eigen::half, int16_t>;
  calls_[DT_FLOAT16][DT_INT32] = LuUnpackCompute<Eigen::half, int32_t>;
  calls_[DT_FLOAT16][DT_INT64] = LuUnpackCompute<Eigen::half, int64_t>;
  calls_[DT_FLOAT16][DT_UINT8] = LuUnpackCompute<Eigen::half, uint8_t>;

  calls_[DT_FLOAT][DT_INT8] = LuUnpackCompute<float, int8_t>;
  calls_[DT_FLOAT][DT_INT16] = LuUnpackCompute<float, int16_t>;
  calls_[DT_FLOAT][DT_INT32] = LuUnpackCompute<float, int32_t>;
  calls_[DT_FLOAT][DT_INT64] = LuUnpackCompute<float, int64_t>;
  calls_[DT_FLOAT][DT_UINT8] = LuUnpackCompute<float, uint8_t>;

  calls_[DT_DOUBLE][DT_INT8] = LuUnpackCompute<double, int8_t>;
  calls_[DT_DOUBLE][DT_INT16] = LuUnpackCompute<double, int16_t>;
  calls_[DT_DOUBLE][DT_INT32] = LuUnpackCompute<double, int32_t>;
  calls_[DT_DOUBLE][DT_INT64] = LuUnpackCompute<double, int64_t>;
  calls_[DT_DOUBLE][DT_UINT8] = LuUnpackCompute<double, uint8_t>;

  calls_[DT_UINT8][DT_INT8] = LuUnpackCompute<uint8_t, int8_t>;
  calls_[DT_UINT8][DT_INT16] = LuUnpackCompute<uint8_t, int16_t>;
  calls_[DT_UINT8][DT_INT32] = LuUnpackCompute<uint8_t, int32_t>;
  calls_[DT_UINT8][DT_INT64] = LuUnpackCompute<uint8_t, int64_t>;
  calls_[DT_UINT8][DT_UINT8] = LuUnpackCompute<uint8_t, uint8_t>;
}

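// Compute validates the ranks, pivot length, batch dimensions and dtypes of
// LU_data and LU_pivots, then dispatches through the calls_ table.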
uint32_t LuUnpackCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpack check input and output number failed.");
  Tensor *LU_data_ = ctx.Input(0);
  Tensor *LU_pivots_ = ctx.Input(1);
  std::shared_ptr<TensorShape> LU_data_shape = LU_data_->GetTensorShape();
  std::shared_ptr<TensorShape> LU_pivots_shape = LU_pivots_->GetTensorShape();
  int32_t LU_data_rank = LU_data_shape->GetDims();
  if (LU_data_rank < kLuDataMinRank) {
    KERNEL_LOG_ERROR(
      "The input dim size of LU_data must be at least 2-D, "
      "but got [%d].",
      LU_data_rank);
    return KERNEL_STATUS_PARAM_INVALID;
  }
  int32_t Lu_data_dims = LU_data_shape->GetDims();
  int64_t Lu_data_dim1 = LU_data_shape->GetDimSize(Lu_data_dims - 2);
  int64_t Lu_data_dim2 = LU_data_shape->GetDimSize(Lu_data_dims - 1);
  int32_t Lu_pivots_dims = LU_pivots_shape->GetDims();
  int64_t Lu_pivots_dim = LU_pivots_shape->GetDimSize(Lu_pivots_dims - 1);
  if (Lu_pivots_dim != std::min(Lu_data_dim1, Lu_data_dim2)) {
    KERNEL_LOG_ERROR(
      "The last dimension of LU_pivots must be the same as the minimum value "
      "of the last two dimensions of LU_data, "
      "but got the last dimension of LU_pivots [%d] and the minimum value of "
      "the last two dimensions of LU_data [%d].",
      Lu_pivots_dim, std::min(Lu_data_dim1, Lu_data_dim2));
    return KERNEL_STATUS_PARAM_INVALID;
  }
  for (int32_t i = 0; i < Lu_pivots_dims - 1; i++) {
    if (LU_data_shape->GetDimSize(i) != LU_pivots_shape->GetDimSize(i)) {
      KERNEL_LOG_ERROR(
        "LU_data's batch dimensions do not match LU_pivots's batch "
        "dimensions.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  DataType LU_data_dtype = static_cast<DataType>(LU_data_->GetDataType());
  bool LU_data_dtype_flag = LU_data_dtype != DT_FLOAT16 && LU_data_dtype != DT_FLOAT && LU_data_dtype != DT_DOUBLE &&
                            LU_data_dtype != DT_INT8 && LU_data_dtype != DT_UINT8 && LU_data_dtype != DT_INT16 &&
                            LU_data_dtype != DT_INT32 && LU_data_dtype != DT_INT64;
  if (LU_data_dtype_flag) {
    KERNEL_LOG_ERROR(
      "Op LuUnpack first input LU_data's data type should be one of the "
      "following: "
      "DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, "
      "DT_FLOAT, DT_DOUBLE, "
      "but this type is [%s].",
      DTypeStr(LU_data_dtype).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  DataType LU_pivots_dtype = static_cast<DataType>(LU_pivots_->GetDataType());
  bool LU_pivots_dtype_flag = LU_pivots_dtype != DT_INT8 && LU_pivots_dtype != DT_UINT8 &&
                              LU_pivots_dtype != DT_INT16 && LU_pivots_dtype != DT_INT32 && LU_pivots_dtype != DT_INT64;
  if (LU_pivots_dtype_flag) {
    KERNEL_LOG_ERROR(
      "Op LuUnpack second input LU_pivots's data type should be one of the "
      "following: "
      "DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, "
      "but this type is [%s].",
      DTypeStr(LU_pivots_dtype).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  SetMap();
  std::vector<DataType> LU_data_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32,
                                            DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE};
  std::vector<DataType> LU_pivots_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64};
  for (uint64_t i = 0; i < LU_data_type_vec.size(); i++) {
    for (uint64_t j = 0; j < LU_pivots_type_vec.size(); j++) {
      if (LU_data_dtype == LU_data_type_vec[i] && LU_pivots_dtype == LU_pivots_type_vec[j]) {
        KERNEL_HANDLE_ERROR(calls_[LU_data_type_vec[i]][LU_pivots_type_vec[j]](ctx),
                            "The elements of LU_pivots must be greater than 1 "
                            "and be less than the size of LU_pivots's last dimension.");
      }
    }
  }
  calls_.clear();
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpack, LuUnpackCpuKernel);
} // namespace aicpu
@@ -0,0 +1,40 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
#define AICPU_KERNELS_NORMALIZED_LUUNPACK_H_

#include <functional>
#include <map>

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LuUnpackCpuKernel : public CpuKernel {
 public:
  LuUnpackCpuKernel() = default;
  ~LuUnpackCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T_data, typename T_pivots>
  static uint32_t LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, T_data *P_eye);
  template <typename T_data, typename T_pivots>
  static uint32_t LuUnpackCompute(CpuKernelContext &ctx);
  template <typename T_pivots>
  static uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
  std::map<int, std::map<int, std::function<uint32_t(CpuKernelContext &)>>> calls_;
  void SetMap();
};
} // namespace aicpu
#endif
@@ -0,0 +1,183 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lu_unpack_grad.h"
#include <iostream>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"

namespace {
const char *kLuUnpackGrad = "LuUnpackGrad";
const int64_t kParallelBatchNum = 30;
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 2;
const uint32_t kInputFirst = 0;
const uint32_t kInputSecond = 1;
const uint32_t kInputThird = 2;
} // namespace

namespace aicpu {
uint32_t LuUnpackGradCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpackGrad check input and output number failed.");
  // choose the compute function depending on the data type
  auto input_type = static_cast<DataType>(ctx.Input(kInputThird)->GetDataType());
  switch (input_type) {
    case DT_FLOAT16:
      return LuUnpackGradCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return LuUnpackGradCompute<float>(ctx);
    case DT_DOUBLE:
      return LuUnpackGradCompute<double>(ctx);
    case DT_INT8:
      return LuUnpackGradCompute<int8_t>(ctx);
    case DT_INT16:
      return LuUnpackGradCompute<int16_t>(ctx);
    case DT_INT32:
      return LuUnpackGradCompute<int32_t>(ctx);
    case DT_INT64:
      return LuUnpackGradCompute<int64_t>(ctx);
    case DT_UINT8:
      return LuUnpackGradCompute<uint8_t>(ctx);
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

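// TriLU extracts the gradient pieces for matrix `a`: the strictly lower
// triangular part of L_grad and the upper triangular part of U_grad, written
// into the output buffers at that matrix's output_stride offset.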
template <typename T>
uint32_t LuUnpackGradCpuKernel::TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a) {
  Tensor *L_grad = nullptr;
  Tensor *U_grad = nullptr;
  Tensor *LU_data = nullptr;
  L_grad = ctx.Input(kInputFirst);
  U_grad = ctx.Input(kInputSecond);
  LU_data = ctx.Input(kInputThird);
  auto LU_data_shape = LU_data->GetTensorShape();
  int32_t LU_data_dims = LU_data_shape->GetDims();
  int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
  int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
  auto LU_dim_min = std::min(LU_data_height, LU_data_width);
  auto input_U_shape = U_grad->GetTensorShape();
  auto input_U_dim_size = input_U_shape->GetDimSizes();
  auto input_U_dims = input_U_shape->GetDims();
  int64_t matrix_U_width = input_U_dim_size[input_U_dims - 2];
  int64_t matrix_U_height = input_U_dim_size[input_U_dims - 1];
  int64_t matrix_U_size = matrix_U_width * matrix_U_height;
  auto input_L_shape = L_grad->GetTensorShape();
  auto input_L_dim_size = input_L_shape->GetDimSizes();
  using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
  auto input_L_dims = input_L_shape->GetDims();
  int64_t matrix_L_width = input_L_dim_size[input_L_dims - 2];
  int64_t matrix_L_height = input_L_dim_size[input_L_dims - 1];
  int64_t matrix_L_size = matrix_L_width * matrix_L_height;
  int64_t output_stride = LU_data_height * LU_data_width;

  MatrixMap input_L(reinterpret_cast<T *>(L_grad->GetData()) + a * matrix_L_size, matrix_L_width, matrix_L_height);
  MatrixMap input_U(reinterpret_cast<T *>(U_grad->GetData()) + a * matrix_U_size, matrix_U_width, matrix_U_height);
  if (LU_data_width > LU_data_height) {
    MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
                       LU_data_width);
    T *MiddlePtr = new T[matrix_L_size];
    MatrixMap MiddleData(MiddlePtr, matrix_L_width, matrix_L_height);
    MiddleData = input_L.template triangularView<Eigen::StrictlyLower>();
    for (auto i = 0; i < LU_data_height; i++) {
      for (auto j = 0; j < LU_dim_min; j++) {
        output_L(i, j) = MiddleData(i, j);
      }
    }
    delete[] MiddlePtr;
  } else {
    MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
                       LU_data_width);
    output_L = input_L.template triangularView<Eigen::StrictlyLower>();
  }
  if (LU_data_height > LU_data_width) {
    MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
                       LU_data_width);
    T *MiddlePtr = new T[matrix_U_size];
    MatrixMap MiddleData(MiddlePtr, matrix_U_width, matrix_U_height);
    MiddleData = input_U.template triangularView<Eigen::Upper>();
    for (auto i = 0; i < LU_dim_min; i++) {
      for (auto j = i; j < LU_data_width; j++) {
        output_U(i, j) = MiddleData(i, j);
      }
    }
    delete[] MiddlePtr;
  } else {
    MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
                       LU_data_width);
    output_U = input_U.template triangularView<Eigen::Upper>();
  }
  return KERNEL_STATUS_OK;
}

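// LuUnpackGradCompute zero-fills both gradient outputs, then runs TriLU once
// per matrix: serially for small batches, via ParallelFor for large ones.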
template <typename T>
uint32_t LuUnpackGradCpuKernel::LuUnpackGradCompute(CpuKernelContext &ctx) {
  Tensor *LU_data = nullptr;
  Tensor *L_grad_output = nullptr;
  Tensor *U_grad_output = nullptr;
  LU_data = ctx.Input(kInputThird);
  L_grad_output = ctx.Output(0);
  U_grad_output = ctx.Output(1);

  auto LU_data_shape = LU_data->GetTensorShape();
  int32_t LU_data_dims = LU_data_shape->GetDims();
  int64_t LU_data_elem_num = LU_data->NumElements();

  int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
  int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
  int64_t LU_data_stride = LU_data_height * LU_data_width;
  int64_t matrix_num = LU_data_elem_num / LU_data_stride;

  auto L_grad_output_data = reinterpret_cast<T *>(L_grad_output->GetData());
  auto U_grad_output_data = reinterpret_cast<T *>(U_grad_output->GetData());
  for (int64_t i = 0; i < LU_data_elem_num; i++) {
    *(L_grad_output_data + i) = static_cast<T>(0);
    *(U_grad_output_data + i) = static_cast<T>(0);
  }
  if (matrix_num < kParallelBatchNum) {
    for (int64_t i = 0; i < matrix_num; i++) {
      TriLU<T>(ctx, L_grad_output, U_grad_output, i);
    }
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (max_core_num > matrix_num) {
      max_core_num = matrix_num;
    }
    auto sharder = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        TriLU<T>(ctx, L_grad_output, U_grad_output, i);
      }
    };
    if (max_core_num == 0) {
      // max_core_num is used as a divisor below; bail out instead of dividing by zero.
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder),
                        "LuUnpackGrad Compute failed.");
  }

  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpackGrad, LuUnpackGradCpuKernel);
} // namespace aicpu
@@ -0,0 +1,40 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_LU_UNPACK_GRAD_H_
#define AICPU_KERNELS_LU_UNPACK_GRAD_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LuUnpackGradCpuKernel : public CpuKernel {
 public:
  ~LuUnpackGradCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  /**
   * @brief compute for all supported types
   * @param ctx cpu kernel context
   * @return status code, KERNEL_STATUS_OK on success
   */
  template <typename T>
  uint32_t LuUnpackGradCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a);
};
} // namespace aicpu
#endif
@@ -0,0 +1,179 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "matmul.h"

#include <complex>
#include "unsupported/Eigen/CXX11/Tensor"

#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"

namespace {
const char *kMatmul = "MatMul";
} // namespace

namespace aicpu {
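// AddCompute adds the broadcast bias (input x3) element-wise onto the matmul
// result already stored in the output tensor.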
template <typename T>
uint32_t MatMulCpuKernel::AddCompute(CpuKernelContext &ctx, Bcast &bcast) {
  auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();

  for (int64_t i = 0; i < data_num; i++) {
    auto input1 = in2 + bcast.GetBroadcastXIndex(i);  // i-th value of the bias input[x3]
    auto input2 = out + bcast.GetBroadcastYIndex(i);  // i-th value of the matmul result
    *(out + i) = (*input1) + (*input2);
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MatMulCpuKernel::BiasCompute(CpuKernelContext &ctx) {
  auto input0_tensor = ctx.Input(0);
  auto input2_tensor = ctx.Input(2);
  auto input2_shape = input2_tensor->GetTensorShape()->GetDimSizes();
  auto output_tensor = ctx.Output(kFirstOutputIndex);
  auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();

  KERNEL_CHECK_FALSE(input2_tensor->GetTensorShape()->GetDims() == 1, KERNEL_STATUS_PARAM_INVALID,
                     "Input[x3] must be a 1D tensor")

  DataType input0_data_type = input0_tensor->GetDataType();
  DataType input2_data_type = input2_tensor->GetDataType();
  KERNEL_CHECK_FALSE((input0_data_type == input2_data_type), KERNEL_STATUS_PARAM_INVALID,
                     "Input[x1] data type[%s] and input[x3] data type[%s] must be same",
                     DTypeStr(input0_data_type).c_str(), DTypeStr(input2_data_type).c_str())

  Bcast bcast(input2_shape, output_shape);
  if (!bcast.IsValid()) {
    KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return AddCompute<T>(ctx, bcast);
}

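// MatMulCompute maps both operands as row-major Eigen matrices, checks that the
// contracted dimensions agree under the transpose attributes, multiplies, and
// finally applies the optional bias input.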
template <typename T>
uint32_t MatMulCpuKernel::MatMulCompute(CpuKernelContext &ctx) {
  auto input0_tensor = ctx.Input(0);
  auto input0_tensor_shape = input0_tensor->GetTensorShape();
  KERNEL_CHECK_FALSE((IsMatrix(input0_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
                     "Input[x1] must be a matrix")

  auto input1_tensor = ctx.Input(1);
  auto input1_tensor_shape = input1_tensor->GetTensorShape();
  KERNEL_CHECK_FALSE((IsMatrix(input1_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
                     "Input[x2] must be a matrix")

  auto transpose_x1 = ctx.GetAttr("transpose_x1")->GetBool();
  auto transpose_x2 = ctx.GetAttr("transpose_x2")->GetBool();
  KERNEL_LOG_DEBUG(
    "%s Attr[transpose_x1] value[%d], "
    "Attr[transpose_x2] value[%d].",
    kMatmul, transpose_x1, transpose_x2);
  int32_t x1_dim = transpose_x1 ? 0 : 1;
  int32_t x2_dim = transpose_x2 ? 1 : 0;
  KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)),
                     KERNEL_STATUS_PARAM_INVALID,
                     "Matrix size incompatible, input[x1] dim[%d] value[%lld], "
                     "input[x2] dim[%d] value[%lld]",
                     x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim))

  auto input0_shape = input0_tensor_shape->GetDimSizes();
  using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
  MatrixMap input0(reinterpret_cast<T *>(input0_tensor->GetData()), input0_shape[0], input0_shape[1]);

  auto input1_shape = input1_tensor_shape->GetDimSizes();
  MatrixMap input1(reinterpret_cast<T *>(input1_tensor->GetData()), input1_shape[0], input1_shape[1]);

  auto output_tensor = ctx.Output(kFirstOutputIndex);
  auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
  MatrixMap output(reinterpret_cast<T *>(output_tensor->GetData()), output_shape[0], output_shape[1]);
  if (transpose_x1) {
    if (transpose_x2) {
      output = input0.transpose() * input1.transpose();
    } else {
      output = input0.transpose() * input1;
    }
  } else {
    if (transpose_x2) {
      output = input0 * input1.transpose();
    } else {
      output = input0 * input1;
    }
  }
  if (ctx.GetInputsSize() == 3) {
    return BiasCompute<T>(ctx);
  }
  return KERNEL_STATUS_OK;
}

uint32_t MatMulCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  uint32_t input_num = ctx.GetInputsSize();
  uint32_t output_num = ctx.GetOutputsSize();
  if ((input_num != 2 && input_num != 3) || output_num != 1) {
    KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  auto input0_tensor = ctx.Input(0);
  KERNEL_CHECK_NULLPTR(input0_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x1] data failed",
                       ctx.GetOpType().c_str())

  auto input1_tensor = ctx.Input(1);
  KERNEL_CHECK_NULLPTR(input1_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x2] data failed",
                       ctx.GetOpType().c_str())

  DataType input0_data_type = input0_tensor->GetDataType();
  DataType input1_data_type = input1_tensor->GetDataType();
  KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID,
                     "Input[x1] data type[%s] and input[x2] data type[%s] must be same",
                     DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str())
  KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kMatmul, DTypeStr(input0_data_type).c_str());
  uint32_t ret = KERNEL_STATUS_OK;
  switch (input0_data_type) {
    case DT_FLOAT:
      ret = MatMulCompute<float>(ctx);
      break;
    case DT_DOUBLE:
      ret = MatMulCompute<double>(ctx);
      break;
    case DT_FLOAT16:
      ret = MatMulCompute<Eigen::half>(ctx);
      break;
    case DT_INT32:
      ret = MatMulCompute<int32_t>(ctx);
      break;
    case DT_COMPLEX64:
      ret = MatMulCompute<std::complex<float>>(ctx);
      break;
    case DT_COMPLEX128:
      ret = MatMulCompute<std::complex<double>>(ctx);
      break;
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(input0_data_type).c_str());
      ret = KERNEL_STATUS_PARAM_INVALID;
  }
  return ret;
}

REGISTER_CPU_KERNEL(kMatmul, MatMulCpuKernel);
} // namespace aicpu
@@ -0,0 +1,39 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_HOST_MATMUL_H_
#define AICPU_KERNELS_HOST_MATMUL_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MatMulCpuKernel : public CpuKernel {
 public:
  MatMulCpuKernel() = default;
  ~MatMulCpuKernel() = default;

  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t AddCompute(CpuKernelContext &ctx, Bcast &bcast);
  template <typename T>
  uint32_t BiasCompute(CpuKernelContext &ctx);
  template <typename T>
  uint32_t MatMulCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,320 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "matrix_exp.h"

#include <array>
#include <cmath>
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"

namespace {
constexpr uint32_t kMatrixExpInputNum = 1;
constexpr uint32_t kMatrixExpOutputNum = 1;
constexpr uint32_t kIndexTwo = 2;
const int64_t parallel_data_size = 8 * 1024;
const char *kMatrixExp = "MatrixExp";
constexpr int total_n_degs = 6;

// Coefficients for computing the Taylor approximant of order 8.
constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.;
constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / 352.);
constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3);
constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3);
constexpr double y2 = (857. - 58. * sqrt_177) / 630.;

template <typename T, int ROW, int COL>
using array2d = std::array<std::array<T, COL>, ROW>;

// Coefficients for computing the Taylor approximant of order 12.
constexpr int num_prods_12 = 4;
array2d<double, num_prods_12, num_prods_12> b12 = {
  {{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740},
   {5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989},
   {0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230},
   {-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}};

// Coefficients for computing the Taylor approximant of order 18.
constexpr int num_prods_18 = 5;
array2d<double, num_prods_18, num_prods_18> b18 = {
  {{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
   {0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01,
    -6.37898194594723280150e-04},
   {-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
    3.34975017086070470649e-05},
   {-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
    -1.39180257516060693404e-05},
   {0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}};

// Norm thresholds for choosing the order of the Taylor approximant (float).
constexpr std::array<float, total_n_degs> thetas_float = {1.192092800768788e-07, 5.978858893805233e-04,
                                                          5.116619363445086e-02, 5.800524627688768e-01,
                                                          1.461661507209034e+00, 3.010066362817634e+00};

// Norm thresholds for choosing the order of the Taylor approximant (double).
constexpr std::array<double, total_n_degs> thetas_double = {2.220446049250313e-16, 2.580956802971767e-08,
                                                            3.397168839976962e-04, 4.991228871115323e-02,
                                                            2.996158913811580e-01, 1.090863719290036e+00};

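// Dispatch helpers: instantiate the typed compute routine for DTYPE and return
// early if it fails.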
#define MATRIX_EXP_COMPUTE_CASE(DTYPE, TYPE, CTX)           \
  case (DTYPE): {                                           \
    uint32_t result = MatrixExpCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }

#define MATRIX_EXP_COMPUTE_DIFF_CASE(DTYPE, TYPE, CTX)      \
  case (DTYPE): {                                           \
    uint32_t result = MatrixExpDiffTypeCompute<TYPE>(CTX);  \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
uint32_t MatrixExpCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMatrixExpInputNum, kMatrixExpOutputNum),
                      "[%s] check input and output number failed.", kMatrixExp);
  KERNEL_HANDLE_ERROR(MatrixExpCheck(ctx), "[%s] check params failed.", kMatrixExp);
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    MATRIX_EXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
    MATRIX_EXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    MATRIX_EXP_COMPUTE_DIFF_CASE(DT_FLOAT16, Eigen::half, ctx)
    default:
      KERNEL_LOG_ERROR("MatrixExp kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MatrixExpCpuKernel::MatrixExpCheck(CpuKernelContext &ctx) {
  auto input_0 = ctx.Input(0);
  std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
  size_t shape_size_x = shape_x.size();
  KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].",
                     shape_size_x)
  KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
                     "Input x's last dimension must be at least 1.")
  KERNEL_CHECK_FALSE((shape_x[shape_size_x - kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID,
                     "Input x's last two dimensions must be equal, but are [%lld] and [%lld].",
                     shape_x[shape_size_x - kIndexTwo], shape_x[shape_size_x - 1])
  return KERNEL_STATUS_OK;
}

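// MTaylorApproximant evaluates a truncated Taylor series of exp(A) of the given
// order; orders 8, 12 and 18 use the precomputed coefficient products
// (x1..x7/y2, b12, b18) to reduce the number of matrix multiplications.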
template <typename Derived1, typename Derived2, typename Derived3>
void MatrixExpCpuKernel::MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
                                            int order, Eigen::MatrixBase<Derived3> &E) {
  constexpr int expansion_order_1 = 1;
  constexpr int expansion_order_2 = 2;
  constexpr int expansion_order_4 = 4;
  constexpr int expansion_order_8 = 8;
  constexpr int expansion_order_12 = 12;
  auto A2 = A * A;
  auto A3 = A * A2;
  if (order == expansion_order_1) {
    E = I + A;
  } else if (order == expansion_order_2) {
    constexpr int A2_divisor = 2;
    E = I + A + A2 / A2_divisor;
  } else if (order == expansion_order_4) {
    constexpr int I_divisor = 2;
    constexpr int A_divisor = 6;
    constexpr int A2_divisor = 24;
    E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
  } else if (order == expansion_order_8) {
    auto A4 = A2 * (x1 * A + x2 * A2);
    auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4);
    E = I + A + y2 * A2 + A8;
  } else if (order == expansion_order_12) {
    auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3;
    auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3;
    auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3;
    auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3;
    auto q61 = q33 + q34 * q34;
    E = q31 + (q32 + q61) * q61;
  } else {
    auto A6 = A3 * A3;
    auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
    auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
    auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
    auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
    auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
    auto q91 = q31 * q64 + q63;
    E = q61 + (q62 + q91) * q91;
  }
}

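// MexpImpl implements scaling-and-squaring: pick the smallest Taylor order
// whose norm threshold covers the 1-norm of A; for larger norms, scale A by
// 2^-s, apply the order-18 approximant, and square the result s times.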
template <typename Derived1, typename Derived2>
void MatrixExpCpuKernel::MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
                                  Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx) {
  const auto norm = A.cwiseAbs().colwise().sum().maxCoeff();
  constexpr std::array<int, total_n_degs> m_vals = {1, 2, 4, 8, 12, 18};
  constexpr int cut_deg = 2;
  int64_t s = -1;
  auto data_type = ctx.Input(0)->GetDataType();
  if (data_type == DT_FLOAT16 || data_type == DT_FLOAT || data_type == DT_COMPLEX64) {
    for (int i = 0; i < total_n_degs - 1; i++) {
      if (norm <= thetas_float[i]) {
        MTaylorApproximant(A, I, m_vals[i], mexp);
        break;
      }
    }
    if (norm >= thetas_float[total_n_degs - cut_deg]) {
      s = ceil(log2(norm / thetas_float[total_n_degs - 1]));
      if (s <= 0) {
        s = 0;
      }
    }
  } else {
    for (int i = 0; i < total_n_degs - 1; i++) {
      if (norm <= thetas_double[i]) {
        MTaylorApproximant(A, I, m_vals[i], mexp);
        break;
      }
    }
    if (norm >= thetas_double[total_n_degs - cut_deg]) {
      s = ceil(log2(norm / thetas_double[total_n_degs - 1]));
      if (s <= 0) {
        s = 0;
      }
    }
  }
  if (s >= 0) {
    const auto pow2s = pow(2, s);
    const auto A_scaled = A / pow2s;
    MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp);
    for (int k = 0; k < s; k++) {
      mexp = mexp * mexp;
    }
  }
}

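// MatrixExpCompute maps the flat input into matrix_num m x m matrices and runs
// MexpImpl on each, serially for small inputs and via ParallelFor otherwise.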
template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpCompute(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  size_t shape_size = shape_x.size();
  int64_t m = shape_x[shape_size - 1];
  int64_t size_mm = m * m;
  typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  MatrixXd I(m, m);
  I.setIdentity();
  int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
  int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
  if (data_size <= parallel_data_size) {
    for (int64_t i = 0; i < matrix_num; i++) {
      Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
      Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
      if (matrix_x.size() > 0) {
        MexpImpl(matrix_x, I, matrix_y, ctx);
      }
    }
  } else {
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (max_core_num == 0) {
      return KERNEL_STATUS_PARAM_INVALID;
    }
    if (max_core_num > matrix_num) {
      max_core_num = matrix_num;
    }
    auto shard_work = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
        Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
        if (matrix_x.size() > 0) {
          MexpImpl(matrix_x, I, matrix_y, ctx);
        }
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
                        "MatrixExp Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

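// TypeChangeForFp16 widens one float16 matrix to float, computes the matrix
// exponential in float precision, and narrows the result back to float16.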
void MatrixExpCpuKernel::TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y,
                                           CpuKernelContext &ctx) {
  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  MatrixXd I(m, m);
  (void)I.setIdentity();
  MatrixXd matrix_x(m, m);
  MatrixXd matrix_y(m, m);
  int64_t size_mm = m * m;
  for (int p = 0; p < m; p++) {
    for (int q = 0; q < m; q++) {
      matrix_x(p, q) = static_cast<float>(input_x[i * size_mm + p * m + q]);
    }
  }
  if (matrix_x.size() > 0) {
    MexpImpl(matrix_x, I, matrix_y, ctx);
  }
  for (int p = 0; p < m; p++) {
    for (int q = 0; q < m; q++) {
      output_y[i * size_mm + p * m + q] = static_cast<Eigen::half>(matrix_y(p, q));
    }
  }
}

template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpDiffTypeCompute(CpuKernelContext &ctx) {
  T *input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  size_t shape_size = shape_x.size();
  int64_t m = shape_x[shape_size - 1];
  int64_t size_mm = m * m;
  int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
  int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
  if (data_size <= parallel_data_size) {
    for (int64_t i = 0; i < matrix_num; i++) {
      TypeChangeForFp16(i, m, input_x, output_y, ctx);
    }
  } else {
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (max_core_num == 0) {
      return KERNEL_STATUS_PARAM_INVALID;
    }
    if (max_core_num > matrix_num) {
      max_core_num = matrix_num;
    }
    auto shard_work = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        TypeChangeForFp16(i, m, input_x, output_y, ctx);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
                        "MatrixExp Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMatrixExp, MatrixExpCpuKernel);
} // namespace aicpu
@@ -0,0 +1,50 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_

#include "cpu_ops_kernel.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
class MatrixExpCpuKernel : public CpuKernel {
 public:
  MatrixExpCpuKernel() = default;
  ~MatrixExpCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MatrixExpCheck(CpuKernelContext &ctx);

  template <typename Derived1, typename Derived2, typename Derived3>
  void MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I, int order,
                          Eigen::MatrixBase<Derived3> &E);

  template <typename Derived1, typename Derived2>
  void MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
                Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx);

  template <typename T>
  uint32_t MatrixExpCompute(CpuKernelContext &ctx);

  void TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, CpuKernelContext &ctx);

  template <typename T>
  uint32_t MatrixExpDiffTypeCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,460 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "maximum.h"

#include <cmath>
#include <iostream>

#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMaximum = "Maximum";
// when the input data size is more than kParallelDataNum, use the parallel path
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define MAXIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                         \
    uint32_t result = MaximumCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                     \
      KERNEL_LOG_ERROR("Maximum kernel compute failed."); \
      return result;                                      \
    }                                                     \
    break;                                                \
  }
} // namespace

namespace aicpu {
uint32_t MaximumCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Maximum check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaximumParamCheck(ctx), "Maximum check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    MAXIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    MAXIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    MAXIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    MAXIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
    MAXIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Maximum kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MaximumCpuKernel::MaximumParamCheck(CpuKernelContext &ctx) {
  // the non-null check of input_0, input_1 and output has been done in NormalCheck
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] needs to be the same as "
                     "input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "MaximumCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
  return KERNEL_STATUS_OK;
}

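// The three SpecialCompute* helpers below implement element-wise max with the
// same NaN policy: with ignore_nan the non-NaN operand wins, otherwise NaN
// propagates; float16 values are tested with Eigen::numext::isnan.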
template <typename T>
void MaximumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  bool ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input2 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (ignore_nan && !is_float16) {
      if (isnan(*(input1 + i))) {
        *(output + i) = *(input2 + i);
      } else if (isnan(*(input2 + i))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (!ignore_nan && !is_float16) {
      if (isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
  }
}

template <typename T>
void MaximumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  bool ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1))) {
        *(output + i) = *(input2 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input1);
      } else {
        *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (ignore_nan && !is_float16) {
      if (isnan(*(input1))) {
        *(output + i) = *(input2 + i);
      } else if (isnan(*(input2 + i))) {
        *(output + i) = *(input1);
      } else {
        *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1))) {
        *(output + i) = *(input1);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (!ignore_nan && !is_float16) {
      if (isnan(*(input1))) {
        *(output + i) = *(input1);
      } else if (isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
  }
}

template <typename T>
void MaximumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  bool ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input2);
      } else if (Eigen::numext::isnan(*(input2))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
      }
    }
    if (ignore_nan && !is_float16) {
      if (isnan(*(input1 + i))) {
        *(output + i) = *(input2);
      } else if (isnan(*(input2))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
      }
    }
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (Eigen::numext::isnan(*(input2))) {
        *(output + i) = *(input2);
      } else {
        *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
      }
    }
    if (!ignore_nan && !is_float16) {
      if (isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (isnan(*(input2))) {
        *(output + i) = *(input2);
      } else {
        *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
      }
    }
  }
}

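// SpecialCompute dispatches to the matching helper for the detected broadcast
// shape relation between the two inputs.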
template <typename T>
void MaximumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
  bool is_float16 = false;
  if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
      std::is_same<T, double>::value) {
    is_float16 = false;
  } else {
    is_float16 = true;
  }
  switch (type) {
    case BcastShapeType::SAME_SHAPE:
      SpecialComputeSameShape<T>(start, end, ctx, is_float16);
      break;
    case BcastShapeType::X_ONE_ELEMENT:
      SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
      break;
    case BcastShapeType::Y_ONE_ELEMENT:
      SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
      break;
    default:
      KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
      break;
  }
}

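// NoBcastCompute handles the cases that need no index mapping: identical shapes
// or one scalar input; large element counts are split across cores.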
template <typename T>
uint32_t MaximumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();
  BcastShapeType type = in0_elements_nums == in1_elements_nums
                          ? BcastShapeType::SAME_SHAPE
                          : (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_fmax = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
                        "Maximum Compute failed.");
  } else {
    SpecialCompute<T>(type, 0, data_num, ctx);
  }

  return KERNEL_STATUS_OK;
}

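// A minimal standalone sketch (not framework code) of the core-count heuristic
// used above and again in BcastCompute below; PickCoreNum is a hypothetical name.
//
//   #include <algorithm>
//   #include <cstdint>
//
//   uint32_t PickCoreNum(uint32_t cpu_num, uint32_t reserved, int64_t data_num, int64_t mid_threshold) {
//     uint32_t cores = std::max(1U, cpu_num - reserved);  // keep some cores for the system
//     if (data_num <= mid_threshold) {
//       cores = std::min(cores, 4U);                      // mid-sized work: cap at 4 cores
//     }
//     if (static_cast<int64_t>(cores) > data_num) {
//       cores = static_cast<uint32_t>(data_num);          // never more shards than elements
//     }
//     return cores;
//   }
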
template <typename T>
void MaximumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
                                               bool is_float16) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
  }
}

template <typename T>
void MaximumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
  // Serial fallback used when the workload is too small to be worth sharding.
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  int64_t data_num = ctx.Output(0)->NumElements();
  for (int64_t i = 0; i < data_num; ++i) {
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
  }
}

template <typename T>
uint32_t MaximumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  int64_t data_num = ctx.Output(0)->NumElements();
  const bool is_float16 = !(std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value ||
                            std::is_same<T, float>::value || std::is_same<T, double>::value);
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_fmax = [&](int64_t start, int64_t end) {
      BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
    };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
                        "Maximum Compute failed.");
  } else {
    BcastComputeOneKernel<T>(ctx, bcast, is_float16);
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MaximumCpuKernel::MaximumCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  // Broadcast machinery is unnecessary when the shapes already match or when
  // either input is a single element.
  bool no_need_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (no_need_bcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }
}

REGISTER_CPU_KERNEL(kMaximum, MaximumCpuKernel);
}  // namespace aicpu
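Read together, the four branches in each element function above reduce to one small decision rule per element. A minimal standalone sketch of that rule, assuming plain float operands (MaximumElem is a hypothetical name, not part of the kernel):

#include <cmath>

// ignore_nan == true  -> behaves like std::fmax: a NaN operand is skipped.
// ignore_nan == false -> a NaN on either side propagates to the result.
float MaximumElem(float x, float y, bool ignore_nan) {
  if (std::isnan(x)) return ignore_nan ? y : x;
  if (std::isnan(y)) return ignore_nan ? x : y;
  return x > y ? x : y;
}

With ignore_nan set, MaximumElem(NAN, 2.0f, true) yields 2.0f; without it, the NaN propagates, which is why the kernels test the attribute before every comparison.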
@ -0,0 +1,63 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MaximumCpuKernel : public CpuKernel {
 public:
  MaximumCpuKernel() = default;
  ~MaximumCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MaximumParamCheck(CpuKernelContext &ctx);

  template <typename T>
  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);

  template <typename T>
  void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);

  template <typename T>
  void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);

  template <typename T>
  uint32_t MaximumCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
@ -0,0 +1,456 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minimum.h"

#include <cmath>

#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMinimum = "Minimum";
// when the input data size exceeds kParallelDataNum, use the parallel path
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                         \
    uint32_t result = MinimumCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                     \
      KERNEL_LOG_ERROR("Minimum kernel compute failed."); \
      return result;                                      \
    }                                                     \
    break;                                                \
  }
}  // namespace
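// Illustration only: each MINIMUM_COMPUTE_CASE entry used in the switch below
// expands to an ordinary case block. For DT_FLOAT it is equivalent to writing:
//
//   case (DT_FLOAT): {
//     uint32_t result = MinimumCompute<float>(ctx);
//     if (result != KERNEL_STATUS_OK) {
//       KERNEL_LOG_ERROR("Minimum kernel compute failed.");
//       return result;
//     }
//     break;
//   }
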
namespace aicpu {
uint32_t MinimumCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Minimum check input and output number failed.");
  KERNEL_HANDLE_ERROR(MinimumParamCheck(ctx), "Minimum check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();

  switch (data_type) {
    MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
    MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Minimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MinimumCpuKernel::MinimumParamCheck(CpuKernelContext &ctx) {
  // input_0, input_1 and output are guaranteed non-null by NormalCheck
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with "
                     "input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "MinimumCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
  return KERNEL_STATUS_OK;
}

template <typename T>
void MinimumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  // Optional attr: when true, a NaN operand is skipped and the other operand wins;
  // when false (the default), a NaN operand propagates to the output.
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (std::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input2 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(input1 + i))) {
        *(output + i) = *(input2 + i);
      } else if (std::isnan(*(input2 + i))) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
      }
    }
  }
}

template <typename T>
void MinimumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  // x (input1) is a single element compared against every element of y.
  for (int64_t i = start; i < end; ++i) {
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*input1)) {
        *(output + i) = *input1;
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*input1)) {
        *(output + i) = *input1;
      } else if (std::isnan(*(input2 + i))) {
        *(output + i) = *(input2 + i);
      } else {
        *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*input1)) {
        *(output + i) = *(input2 + i);
      } else if (Eigen::numext::isnan(*(input2 + i))) {
        *(output + i) = *input1;
      } else {
        *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*input1)) {
        *(output + i) = *(input2 + i);
      } else if (std::isnan(*(input2 + i))) {
        *(output + i) = *input1;
      } else {
        *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
      }
    }
  }
}

template <typename T>
void MinimumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
  auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (Eigen::numext::isnan(*input2)) {
        *(output + i) = *input2;
      } else {
        *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(input1 + i))) {
        *(output + i) = *(input1 + i);
      } else if (std::isnan(*input2)) {
        *(output + i) = *input2;
      } else {
        *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
      }
    }
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(input1 + i))) {
        *(output + i) = *input2;
      } else if (Eigen::numext::isnan(*input2)) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(input1 + i))) {
        *(output + i) = *input2;
      } else if (std::isnan(*input2)) {
        *(output + i) = *(input1 + i);
      } else {
        *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
      }
    }
  }
}

template <typename T>
void MinimumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
  // Of the registered dtypes, everything that is not int32/int64/float/double
  // is Eigen::half, which needs Eigen::numext::isnan instead of std::isnan.
  const bool is_float16 = !(std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value ||
                            std::is_same<T, float>::value || std::is_same<T, double>::value);
  switch (type) {
    case BcastShapeType::SAME_SHAPE:
      SpecialComputeSameShape<T>(start, end, ctx, is_float16);
      break;
    case BcastShapeType::X_ONE_ELEMENT:
      SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
      break;
    case BcastShapeType::Y_ONE_ELEMENT:
      SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
      break;
    default:
      KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
      break;
  }
}

template <typename T>
uint32_t MinimumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();
  BcastShapeType type = in0_elements_nums == in1_elements_nums
                          ? BcastShapeType::SAME_SHAPE
                          : (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_minimum = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
                        "Minimum Compute failed.");
  } else {
    SpecialCompute<T>(type, 0, data_num, ctx);
  }

  return KERNEL_STATUS_OK;
}

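// Example of the fast-path selection above, with illustrative shapes:
//   x: [2, 3] -> 6 elements, y: [1] -> 1 element
//   in0_elements_nums != in1_elements_nums and in1_elements_nums == 1,
//   so type is BcastShapeType::Y_ONE_ELEMENT and y is read as a scalar.
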
template <typename T>
void MinimumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
                                               bool is_float16) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  for (int64_t i = start; i < end; ++i) {
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
  }
}

template <typename T>
void MinimumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
  // Serial fallback used when the workload is too small to be worth sharding.
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
  auto ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
  int64_t data_num = ctx.Output(0)->NumElements();
  for (int64_t i = 0; i < data_num; ++i) {
    if (!ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (!ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && is_float16) {
      if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
    if (ignore_nan && !is_float16) {
      if (std::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
        *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
      } else if (std::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
      } else {
        *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
                       ? *(in0 + bcast.GetBroadcastXIndex(i))
                       : *(in1 + bcast.GetBroadcastYIndex(i));
      }
    }
  }
}

template <typename T>
uint32_t MinimumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  int64_t data_num = ctx.Output(0)->NumElements();
  const bool is_float16 = !(std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value ||
                            std::is_same<T, float>::value || std::is_same<T, double>::value);
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_minimum = [&](int64_t start, int64_t end) {
      BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
                        "Minimum Compute failed.");
  } else {
    BcastComputeOneKernel<T>(ctx, bcast, is_float16);
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MinimumCpuKernel::MinimumCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  // Broadcast machinery is unnecessary when the shapes already match or when
  // either input is a single element.
  bool no_need_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (no_need_bcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }
}

REGISTER_CPU_KERNEL(kMinimum, MinimumCpuKernel);
}  // namespace aicpu
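For readers unfamiliar with Bcast, the index mapping both kernels rely on can be sketched as follows. This assumes NumPy-style right-aligned broadcasting semantics for GetBroadcastXIndex/GetBroadcastYIndex; BroadcastIndex is a hypothetical re-derivation, not the framework implementation:

#include <cstdint>
#include <vector>

// Map a flat output index to the flat index of an input whose shape is
// right-aligned against the output shape; size-1 dims contribute stride 0.
int64_t BroadcastIndex(int64_t out_idx, const std::vector<int64_t> &out_shape,
                       const std::vector<int64_t> &in_shape) {
  const int64_t rank = static_cast<int64_t>(out_shape.size());
  const int64_t pad = rank - static_cast<int64_t>(in_shape.size());
  int64_t in_idx = 0;
  int64_t in_stride = 1;
  for (int64_t d = rank - 1; d >= 0; --d) {  // innermost dimension first
    const int64_t coord = out_idx % out_shape[d];
    out_idx /= out_shape[d];
    if (d >= pad) {
      const int64_t in_dim = in_shape[d - pad];
      in_idx += (in_dim == 1 ? 0 : coord) * in_stride;
      in_stride *= in_dim;
    }
  }
  return in_idx;
}

For example, with out_shape {2, 3} and in_shape {3}, output index 4 (row 1, column 1) maps to input index 1, which is why each loop iteration above can fetch both operands independently.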
@ -0,0 +1,63 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MINIMUM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MinimumCpuKernel : public CpuKernel {
 public:
  MinimumCpuKernel() = default;
  ~MinimumCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MinimumParamCheck(CpuKernelContext &ctx);

  template <typename T>
  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);

  template <typename T>
  void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);

  template <typename T>
  void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);

  template <typename T>
  uint32_t MinimumCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MINIMUM_H_
@ -1,5 +1,5 @@
 /**
- * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
+ * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -171,11 +171,8 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
       mindspore::kKLDivOpName,
       mindspore::kKlDivLossGradOpName,
       mindspore::kLcmOpName,
-      mindspore::kLessEqualOpName,
-      mindspore::kLogicalXorOpName,
       mindspore::kLogitOpName,
       mindspore::kLogitGradOpName,
-      mindspore::kLogNormalReverseOpName,
       mindspore::kLowerBoundOpName,
       mindspore::kLstsqOpName,
       mindspore::kLuUnpackOpName,