aicpu migration from sjx del first 9 ops

This commit is contained in:
lilinjie 2023-01-18 19:07:06 +08:00
parent 08aa1515d3
commit c137e34989
65 changed files with 7276 additions and 1 deletion

View File

@@ -96,6 +96,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "truncLongCastAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"

View File

@@ -129,6 +129,7 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "legal/copyright"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/inheritance"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/int"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/empty_if_body"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/newline"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/operators"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"

View File

@@ -282,6 +282,14 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparseaddmm.cc:aicpu::SparseAddmmCpuKernel::SparseAddmmCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_matrix_transpose.cc:aicpu::SparseMatrixTransposeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_reshape.cc:aicpu::SparseReshapeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/tensor_scatter_update.cc:aicpu::TensorScatterUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute

View File

@@ -35,6 +35,7 @@
namespace mindspore {
// op name. Ops which do not exist in operator/ops.h, so define their names here
constexpr auto kSparseApplyCenteredRMSPropOpName = "SparseApplyCenteredRMSProp";
constexpr auto kSparseApplyMomentumOpName = "SparseApplyMomentum";
constexpr auto kAbsOpName = "Abs";
constexpr auto kAccumulateNV2OpName = "AccumulateNV2";
constexpr auto kAdamApplyOneAssignOpName = "AdamApplyOneAssign";
@@ -49,7 +50,17 @@ constexpr auto kAdaptiveAvgPool2dOpName = "AdaptiveAvgPool2d";
constexpr auto kAdaptiveAvgPool2dGradOpName = "AdaptiveAvgPool2dGrad";
constexpr auto kAdaptiveMaxPool3DGradOpName = "AdaptiveMaxPool3DGrad";
constexpr auto kAddNOpName = "AddN";
constexpr auto kAddV2OpName = "AddV2";
constexpr auto kAddOpName = "Add";
constexpr auto kAdaptiveAvgPool3DOpName = "AdaptiveAvgPool3D";
constexpr auto kAdaptiveMaxPool3DOpName = "AdaptiveMaxPool3D";
constexpr auto kAdaptiveAvgPool3DGradOpName = "AdaptiveAvgPool3DGrad";
constexpr auto kAdaptiveMaxPool2DGradOpName = "AdaptiveMaxPool2DGrad";
constexpr auto kAdjustContrastv2OpName = "AdjustContrastv2";
constexpr auto kAdjustHueOpName = "AdjustHue";
constexpr auto kAdjustSaturationOpName = "AdjustSaturation";
constexpr auto kAngleOpName = "Angle";
constexpr auto kAffineGridGradOpName = "AffineGridGrad";
constexpr auto kApplyAdadeltaDOpName = "ApplyAdadeltaD";
constexpr auto kApplyAdadeltaOpName = "ApplyAdadelta";
constexpr auto kApplyAdagradDADOpName = "ApplyAdagradDAD";
@@ -92,7 +103,10 @@ constexpr auto kArgMinDOpName = "ArgMinD";
constexpr auto kArgminOpName = "Argmin";
constexpr auto kArgMinOpName = "ArgMin";
constexpr auto kArgminV2OpName = "ArgminV2";
constexpr auto kArgMinWithValueOpName = "ArgMinWithValue";
constexpr auto kArgMaxWithValueOpName = "ArgMaxWithValue";
constexpr auto KAsinGradOpName = "AsinGrad";
constexpr auto KAsinhGradOpName = "AsinhGrad";
constexpr auto kAssignAddOpName = "AssignAdd";
constexpr auto kAssignOpName = "Assign";
constexpr auto kAssignSubOpName = "AssignSub";
@@ -103,6 +117,8 @@ constexpr auto kAvgPool3DOpName = "AvgPool3D";
constexpr auto kACosOpName = "ACos";
constexpr auto kACosGradOpName = "ACosGrad";
constexpr auto kAcosGradOpName = "AcosGrad";
constexpr auto kACoshOpName = "ACosh";
constexpr auto kAcoshGradOpName = "ACoshGrad";
constexpr auto kAvgPool3DDOpName = "AvgPool3DD";
constexpr auto kAvgPoolGradOpName = "AvgPoolGrad";
constexpr auto kAvgPoolGradDOpName = "AvgPoolGradD";
@@ -113,10 +129,12 @@ constexpr auto kBasicLSTMCellCStateGradV2OpName = "BasicLSTMCellCStateGradV2";
constexpr auto kBasicLSTMCellInputGradOpName = "BasicLSTMCellInputGrad";
constexpr auto kBasicLSTMCellOpName = "BasicLSTMCell";
constexpr auto kBasicLSTMCellWeightGradOpName = "BasicLSTMCellWeightGrad";
constexpr auto kBartlettWindowOpName = "BartlettWindow";
constexpr auto kBatchMatMulOpName = "BatchMatMul";
constexpr auto kBatchMatMulV2OpName = "BatchMatMulV2";
constexpr auto kBatchNormOpName = "BatchNorm";
constexpr auto kBatchNormGradOpName = "BatchNormGrad";
constexpr auto kBatchNormGradGradOpName = "BatchNormGradGrad";
constexpr auto kBatchNormGradWithActivation = "BatchNormGradWithActivation";
constexpr auto kBatchNormGradWithAddAndActivation = "BatchNormGradWithAddAndActivation";
constexpr auto kBatchNormWithActivation = "BatchNormWithActivation";
@@ -130,7 +148,9 @@ constexpr auto kBiasAddOpName = "BiasAdd";
constexpr auto kBiasAddGradOpName = "BiasAddGrad";
constexpr auto kIndexAddOpName = "IndexAdd";
constexpr auto kBitwiseOrOpName = "BitwiseOr";
constexpr auto kBincountOpName = "Bincount";
constexpr auto kBCEWithLogitsLossOpName = "BCEWithLogitsLoss";
constexpr auto kBlackmanWindowOpName = "BlackmanWindow";
constexpr auto kBN2AddReluOpName = "BN2AddRelu";
constexpr auto kBN2OpName = "BN2";
constexpr auto kBN2ReLUOpName = "BN2Relu";
@@ -214,6 +234,13 @@ constexpr auto kCSRMVOpName = "CSRMV";
constexpr auto kCSRReduceSumOpName = "CSRReduceSum";
constexpr auto kCSRSparseMatrixToDenseOpName = "CSRSparseMatrixToDense";
constexpr auto kCSRSparseMatrixToSparseTensorOpName = "CSRSparseMatrixToSparseTensor";
constexpr auto kSparseMatrixMatMulOpName = "SparseMatrixMatMul";
constexpr auto kSparseMatrixNNZOpName = "SparseMatrixNNZ";
constexpr auto kSparseMatrixTransposeOpName = "SparseMatrixTranspose";
constexpr auto kSparseReshapeOpName = "SparseReshape";
constexpr auto kSparseSegmentSqrtNGradOpName = "SparseSegmentSqrtNGrad";
constexpr auto kSparseSegmentSumWithNumSegmentsOpName = "SparseSegmentSumWithNumSegments";
constexpr auto kSparseSegmentSqrtNWithNumSegmentsOpName = "SparseSegmentSqrtNWithNumSegments";
constexpr auto kCTCGreedyDecoderOpName = "CTCGreedyDecoder";
constexpr auto kCumprodOpName = "Cumprod";
constexpr auto kCumprodDOpName = "CumprodD";
@@ -610,6 +637,7 @@ constexpr auto kRpcSendOpName = "RpcSend";
constexpr auto kRpnProposalsOpName = "RpnProposals";
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
constexpr auto kSqrtGradOpName = "SqrtGrad";
constexpr auto kRsqrtOpName = "Rsqrt";
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
@@ -659,9 +687,11 @@ constexpr auto kSpaceToBatchNDDOpName = "SpaceToBatchNDD";
constexpr auto kSpaceToDepthOpName = "SpaceToDepth";
constexpr auto kSparseApplyAdadeltaOpName = "SparseApplyAdadelta";
constexpr auto kSparseFillEmptyRows = "SparseFillEmptyRows";
constexpr auto kSparseFillEmptyRowsGradOpName = "SparseFillEmptyRowsGrad";
constexpr auto kSparseApplyAdadeltaDOpName = "SparseApplyAdadeltaD";
constexpr auto kSparseApplyAdagradOpName = "SparseApplyAdagrad";
constexpr auto kSparseApplyAdagradDOpName = "SparseApplyAdagradD";
constexpr auto kSparseApplyAdagradDAOpName = "SparseApplyAdagradDA";
constexpr auto kSparseApplyAdagradV2OpName = "SparseApplyAdagradV2";
constexpr auto kSparseApplyAdagradV2DOpName = "SparseApplyAdagradV2D";
constexpr auto kSparseApplyFtrlOpName = "SparseApplyFtrl";
@@ -670,9 +700,15 @@ constexpr auto kSparseApplyFtrlV2OpName = "SparseApplyFtrlV2";
constexpr auto kSparseApplyFtrlV2DOpName = "SparseApplyFtrlV2D";
constexpr auto kSparseApplyProximalAdagradDOpName = "SparseApplyProximalAdagradD";
constexpr auto kSparseApplyProximalAdagradOpName = "SparseApplyProximalAdagrad";
constexpr auto kSparseApplyProximalGradientDescentOpName = "SparseApplyProximalGradientDescent";
constexpr auto kSparseApplyRMSPropOpName = "SparseApplyRMSProp";
constexpr auto kSparseApplyRMSPropDOpName = "SparseApplyRMSPropD";
constexpr auto kSparseAddmmOpName = "SparseAddmm";
constexpr auto kSparseCrossOpName = "SparseCross";
constexpr auto kSparseDenseCwiseMulOpName = "SparseDenseCwiseMul";
constexpr auto kSparseDenseCwiseDivOpName = "SparseDenseCwiseDiv";
constexpr auto kSparseDenseCwiseAddOpName = "SparseDenseCwiseAdd";
constexpr auto kSparseConcatOpName = "SparseConcat";
constexpr auto kSparseGatherV2OpName = "SparseGatherV2";
constexpr auto kSparseSliceOpName = "SparseSlice";
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
@@ -711,6 +747,7 @@ constexpr auto kSubAndFilterOpName = "SubAndFilter";
constexpr auto kSubOpName = "Sub";
constexpr auto kSubscalarOpName = "Subscalar";
constexpr auto kSwitchOpName = "Switch";
constexpr auto kTanhOpName = "Tanh";
constexpr auto kTensorAddOpName = "Add";
constexpr auto kTensorCopySlicesOpName = "TensorCopySlices";
constexpr auto kTensorMoveOpName = "TensorMove";
@@ -725,6 +762,10 @@ constexpr auto kTransposeDOpName = "TransposeD";
constexpr auto kTruncatedNormal = "TruncatedNormal";
constexpr auto kTruncateDivOpName = "TruncateDiv";
constexpr auto kTruncOpName = "Trunc";
constexpr auto kTridiagonalMatMulOpName = "TridiagonalMatMul";
constexpr auto kTrilIndicesOpName = "TrilIndices";
constexpr auto kTriuIndicesOpName = "TriuIndices";
constexpr auto kTripletMarginLossOpName = "TripletMarginLoss";
constexpr auto kUniformCandidateSamplerOpName = "UniformCandidateSampler";
constexpr auto kLogUniformCandidateSamplerOpName = "LogUniformCandidateSampler";
constexpr auto kUniformIntOpName = "UniformInt";
@@ -743,8 +784,12 @@ constexpr auto kUnsortedSegmentProdOpName = "UnsortedSegmentProd";
constexpr auto kUnsortedSegmentProdDOpName = "UnsortedSegmentProdD";
constexpr auto kUnsortedSegmentSumOpName = "UnsortedSegmentSum";
constexpr auto kUnsortedSegmentSumDOpName = "UnsortedSegmentSumD";
constexpr auto kUnravelIndexOpName = "UnravelIndex";
constexpr auto kUpdateCacheOpName = "UpdateCache";
constexpr auto kUpdateStateOpName = "UpdateState";
constexpr auto kUpperBoundOpName = "UpperBound";
constexpr auto kXlogyOpName = "Xlogy";
constexpr auto kXdivyOpName = "Xdivy";
constexpr auto kDynamicBroadcastToOpName = "DynamicBroadcastTo";
constexpr auto kCheckValidOpName = "CheckValid";
constexpr auto kSoftmaxGradFusionOpName = "SoftmaxGradFusion";

View File

@@ -0,0 +1,283 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_mat_mul.h"
#include <securec.h>
#include <complex>
#include <numeric>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/allocator_utils.h"
#include "utils/kernel_util.h"
using namespace std;
namespace aicpu {
const char *SparseMatrixMatMul = "SparseMatrixMatMul";
const int INPUT_PARAMS_NUM = 6;
const int OUTPUT_PARAMS_NUM = 1;
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("valid sparse matrix mat mul param error.");
return KERNEL_STATUS_PARAM_INVALID;
}
DataType indice_type = ctx.Input(0)->GetDataType();
DataType value_type = ctx.Input(4)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
switch (value_type) {
case DT_FLOAT:
status = DoCompute<int32_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = DoCompute<int32_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = DoCompute<int32_t, complex<float_t> >(ctx);
break;
case DT_COMPLEX128:
status = DoCompute<int32_t, complex<double_t> >(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (value_type) {
case DT_FLOAT:
status = DoCompute<int64_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = DoCompute<int64_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = DoCompute<int64_t, complex<float_t> >(ctx);
break;
case DT_COMPLEX128:
status = DoCompute<int64_t, complex<double_t> >(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> >
SparseMatrixMatMulCpuKernel::CreateEigenSparseMatrix(indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers,
indiceT *col_indices, valueT *values, bool transpose,
bool adjoint) {
Eigen::Map<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > sparse_matrix(rows, cols, nnz, row_pointers,
col_indices, values);
// The transpose/adjoint expressions are not actually evaluated until
// necessary. Hence we don't create copies or modify the input matrix
// inplace.
if (transpose) {
return sparse_matrix.transpose();
}
if (adjoint) {
return sparse_matrix.adjoint();
}
return sparse_matrix;
}
uint32_t SparseMatrixMatMulCpuKernel::ValidParam(CpuKernelContext &ctx) {
KERNEL_LOG_DEBUG("Start to execute ValidParam.");
// valid input and output nullptr
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
// check if the two matrices can be multiplied
DataType dt = ctx.Input(0)->GetDataType(); // dense shape x1
uint32_t checkStatus;
switch (dt) {
case DT_INT32:
checkStatus = CheckMatMul<int32_t>(ctx);
break;
case DT_INT64:
checkStatus = CheckMatMul<int64_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (checkStatus != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("the two input matrixs cannot mul cause their dim!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseMatrixMatMulCpuKernel::CheckMatMul(CpuKernelContext &ctx) {
KERNEL_LOG_DEBUG("check if the matrix can mul");
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
const int row_dim = (rank == 2) ? 0 : 1;
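// dense_shape of x1 holds two entries for a single matrix or three for a batched matrix,
// so row_dim skips the leading batch dimension when it is present.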
Tensor *dense_shape_x1 = ctx.Input(0);
T *shape_x1 = static_cast<T *>(dense_shape_x1->GetData());
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
bool transpose_a = false;
bool transpose_b = false;
bool adjoint_a = false;
bool adjoint_b = false;
if (ctx.GetAttr("transpose_x1") != nullptr) {
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
}
if (ctx.GetAttr("transpose_x2") != nullptr) {
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
}
if (ctx.GetAttr("adjoint_x1") != nullptr) {
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
}
if (ctx.GetAttr("adjoint_x2") != nullptr) {
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
}
T x1_col = (transpose_a || adjoint_a) ? shape_x1[row_dim] : shape_x1[row_dim + 1];
T x2_row = (transpose_b || adjoint_b) ? shape_x2[row_dim + 1] : shape_x2[row_dim];
if (x1_col != x2_row) {
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixMatMulCpuKernel::DoCompute(CpuKernelContext &ctx) {
using Matrix = Eigen::Matrix<valueT, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
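// x1 arrives in batched CSR form: dense_shape (Input 0), batch_pointers (Input 1), row_pointers (Input 2),
// col_indices (Input 3) and values (Input 4); x2 (Input 5) is an ordinary dense matrix.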
indiceT batch_size = ctx.Input(1)->NumElements() - 1;
std::vector<Matrix> results(batch_size);
int shift = (ctx.Input(0)->NumElements() == 2) ? 0 : 1;
indiceT row_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift);
indiceT col_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift + 1);
indiceT *batch_pointers_x1 = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *row_pointers_x1 = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *col_indices_x1 = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *value_x1 = static_cast<valueT *>(ctx.Input(4)->GetData());
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
const int row_dim = (rank == 2) ? 0 : 1;
indiceT row_x2 = shape_x2[row_dim];
indiceT col_x2 = shape_x2[row_dim + 1];
valueT *value_x2 = static_cast<valueT *>(ctx.Input(5)->GetData());
bool transpose_a = false;
bool transpose_b = false;
bool adjoint_a = false;
bool adjoint_b = false;
bool transpose_output = false;
bool conjugate_output = false;
if (ctx.GetAttr("transpose_x1") != nullptr) {
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
}
if (ctx.GetAttr("transpose_x2") != nullptr) {
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
}
if (ctx.GetAttr("adjoint_x1") != nullptr) {
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
}
if (ctx.GetAttr("adjoint_x2") != nullptr) {
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
}
if (ctx.GetAttr("transpose_output") != nullptr) {
transpose_output = ctx.GetAttr("transpose_output")->GetBool();
}
if (ctx.GetAttr("conjugate_output") != nullptr) {
conjugate_output = ctx.GetAttr("conjugate_output")->GetBool();
}
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core_num = std::min(max_core_num, (uint32_t)batch_size);
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max core num cannot be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
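// Process the batches in parallel: each batch multiplies its CSR block of x1 with the matching dense slice of x2.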
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, batch_size, batch_size / max_core_num,
[&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
int64_t nnz_x1 = batch_pointers_x1[i + 1] - batch_pointers_x1[i];
indiceT *row_pointers_x1_batch_i = row_pointers_x1 + (row_x1 + 1) * i;
indiceT *col_indices_x1_batch_i = col_indices_x1 + batch_pointers_x1[i];
valueT *value_x1_batch_i = value_x1 + batch_pointers_x1[i];
auto x1_sparse_matrix = CreateEigenSparseMatrix<indiceT, valueT>(
row_x1, col_x1, nnz_x1, row_pointers_x1_batch_i, col_indices_x1_batch_i,
value_x1_batch_i, transpose_a, adjoint_a);
Eigen::Map<Matrix> x2_dense_matrix(value_x2 + col_x2 * row_x2 * i, row_x2, col_x2);
Matrix temp;
if (transpose_b) {
temp = x1_sparse_matrix * x2_dense_matrix.transpose();
} else if (adjoint_b) {
temp = x1_sparse_matrix * x2_dense_matrix.adjoint();
} else {
temp = x1_sparse_matrix * x2_dense_matrix;
}
if (transpose_output) {
results[i] = temp.transpose();
} else if (conjugate_output) {
results[i] = temp.conjugate();
} else {
results[i] = temp;
}
}
}),
"SparseMatrixMatMul Compute failed.");
// compute result_row_pointers|result_col_indices|result_values data
indiceT row_output, col_output;
row_output = results[0].rows();
col_output = results[0].cols();
for (int i = 0; i < batch_size; i++) {
valueT *output_values_data = static_cast<valueT *>(ctx.Output(0)->GetData());
std::copy(results[i].data(), results[i].data() + row_output * col_output,
output_values_data + i * row_output * col_output);
}
KERNEL_LOG_DEBUG("DoCompute end!!");
return KERNEL_STATUS_OK;
}
// register the operator
REGISTER_CPU_KERNEL(SparseMatrixMatMul, SparseMatrixMatMulCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,46 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
#include "Eigen/Core"
#include "Eigen/SparseCore"
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseMatrixMatMulCpuKernel : public CpuKernel {
public:
~SparseMatrixMatMulCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t ValidParam(CpuKernelContext &ctx);
// check if the two matrices can be multiplied
template <typename T>
uint32_t CheckMatMul(CpuKernelContext &ctx);
// create eigen sparsematrix with eigen::map
template <typename indiceT, typename valueT>
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > CreateEigenSparseMatrix(
indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers, indiceT *col_indices, valueT *values,
bool transpose, bool adjoint);
// do the actual compute
template <typename indiceT, typename valueT>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,86 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_nnz.h"
#include <securec.h>
#include <complex>
#include <numeric>
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/allocator_utils.h"
#include "utils/kernel_util.h"
using namespace std;
namespace aicpu {
const char *SparseMatrixNNZ = "SparseMatrixNNZ";
const int INPUT_PARAMS_NUM = 5;
const int OUTPUT_PARAMS_NUM = 1;
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixNNZCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
DataType indice_type = ctx.Input(1)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
status = DoCompute<int32_t>(ctx);
break;
case DT_INT64:
status = DoCompute<int64_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of batch pointers is not int32 or int64");
status = KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT>
uint32_t SparseMatrixNNZCpuKernel::DoCompute(CpuKernelContext &ctx) {
const indiceT batch_size = ctx.Input(1)->NumElements() - 1;
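// batch_pointers (Input 1) has batch_size + 1 entries; the nnz of each batch is the difference
// between consecutive batch pointers.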
// define some temp arrays to store the output tensor data
int32_t result_nnz[batch_size];
// do compute
indiceT *batch_pointers_x = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT curr = 0;
for (int i = 1; i < batch_size + 1; i++) {
result_nnz[i - 1] = batch_pointers_x[i] - curr;
// update curr
curr = batch_pointers_x[i];
}
// write result
int32_t *output_y = static_cast<int32_t *>(ctx.Output(0)->GetData());
std::copy(result_nnz, result_nnz + (int32_t)batch_size, output_y);
KERNEL_LOG_DEBUG("DoCompute end!!");
return KERNEL_STATUS_OK;
}
// register the operator
REGISTER_CPU_KERNEL(SparseMatrixNNZ, SparseMatrixNNZCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseMatrixNNZCpuKernel : public CpuKernel {
public:
~SparseMatrixNNZCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
// do the actual compute
template <typename indiceT>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,337 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_transpose.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <numeric>
#include <iostream>
using namespace std;
namespace aicpu {
const uint32_t kInputNum = 5;
const uint32_t kOutputNum = 5;
const uint32_t kzero = 0;
const uint32_t kone = 1;
const uint32_t ktwo = 2;
const uint32_t kthree = 3;
const uint32_t kfour = 4;
const uint32_t krankwithbatch = 3;
const char *SPARSEMATRIXTRANSPOSE = "SparseMatrixTranspose";
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixTransposeCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseMatrixTranspose normal check failed.");
DataType indice_type = ctx.Input(0)->GetDataType();
DataType value_type = ctx.Input(4)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
switch (value_type) {
case DT_UINT8:
status = SparseMatrixTransposeCompute<int32_t, uint8_t>(ctx);
break;
case DT_UINT16:
status = SparseMatrixTransposeCompute<int32_t, uint16_t>(ctx);
break;
case DT_UINT32:
status = SparseMatrixTransposeCompute<int32_t, uint32_t>(ctx);
break;
case DT_UINT64:
status = SparseMatrixTransposeCompute<int32_t, uint64_t>(ctx);
break;
case DT_INT8:
status = SparseMatrixTransposeCompute<int32_t, int8_t>(ctx);
break;
case DT_INT16:
status = SparseMatrixTransposeCompute<int32_t, int16_t>(ctx);
break;
case DT_INT32:
status = SparseMatrixTransposeCompute<int32_t, int32_t>(ctx);
break;
case DT_INT64:
status = SparseMatrixTransposeCompute<int32_t, int64_t>(ctx);
break;
case DT_FLOAT16:
status = SparseMatrixTransposeCompute<int32_t, Eigen::half>(ctx);
break;
case DT_FLOAT:
status = SparseMatrixTransposeCompute<int32_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = SparseMatrixTransposeCompute<int32_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = SparseMatrixTransposeComputecomplex<int32_t, complex<float_t>>(ctx);
break;
case DT_COMPLEX128:
status = SparseMatrixTransposeComputecomplex<int32_t, complex<double_t>>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of x_value is not required");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (value_type) {
case DT_UINT8:
status = SparseMatrixTransposeCompute<int64_t, uint8_t>(ctx);
break;
case DT_UINT16:
status = SparseMatrixTransposeCompute<int64_t, uint16_t>(ctx);
break;
case DT_UINT32:
status = SparseMatrixTransposeCompute<int64_t, uint32_t>(ctx);
break;
case DT_UINT64:
status = SparseMatrixTransposeCompute<int64_t, uint64_t>(ctx);
break;
case DT_INT8:
status = SparseMatrixTransposeCompute<int64_t, int8_t>(ctx);
break;
case DT_INT16:
status = SparseMatrixTransposeCompute<int64_t, int16_t>(ctx);
break;
case DT_INT32:
status = SparseMatrixTransposeCompute<int64_t, int32_t>(ctx);
break;
case DT_INT64:
status = SparseMatrixTransposeCompute<int64_t, int64_t>(ctx);
break;
case DT_FLOAT16:
status = SparseMatrixTransposeCompute<int64_t, Eigen::half>(ctx);
break;
case DT_FLOAT:
status = SparseMatrixTransposeCompute<int64_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = SparseMatrixTransposeCompute<int64_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = SparseMatrixTransposeComputecomplex<int64_t, complex<float_t>>(ctx);
break;
case DT_COMPLEX128:
status = SparseMatrixTransposeComputecomplex<int64_t, complex<double_t>>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of x_value is not required");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeCompute(CpuKernelContext &ctx) {
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
auto rank = ctx.Input(0)->NumElements();
if (rank == krankwithbatch) {
y_dense_shape[0] = x_dense_shape[0];
y_dense_shape[1] = x_dense_shape[ktwo];
y_dense_shape[ktwo] = x_dense_shape[1];
} else {
y_dense_shape[0] = x_dense_shape[1];
y_dense_shape[1] = x_dense_shape[0];
}
auto batch_pointers = ctx.Input(1)->NumElements();
for (int i = 0; i < batch_pointers; ++i) {
y_batch_pointers[i] = x_batch_pointers[i];
}
auto num_rows = x_dense_shape[rank - 2];
auto num_cols = x_dense_shape[rank - 1];
auto num_batch = ctx.Input(1)->NumElements() - 1;
int y_part_row_pointers[num_cols + 1];
int part_row_pointers[num_rows + 1];
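// For every batch: count the entries falling into each column to build the transposed row pointers
// (via a prefix sum), then scatter the values and their row indices into the transposed CSR arrays.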
for (int j = 0; j < num_batch; ++j) {
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
valueT part_values[n];
indiceT part_col_indices[n];
indiceT y_part_col_indices[n];
valueT y_part_values[n];
for (int i = 0; i < num_cols + 1; ++i) {
y_part_row_pointers[i] = 0;
}
for (int k = 0; k < num_rows + 1; ++k) {
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
for (int64_t i = 0; i < n; ++i) {
y_part_row_pointers[part_col_indices[i] + 1] += 1;
}
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
for (int k = 0; k < num_cols + 1; ++k) {
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
std::vector<int> current_col_count(num_cols);
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
const int64_t row_begin = part_row_pointers[row_idx];
const int64_t row_end = part_row_pointers[row_idx + 1];
for (int64_t i = row_begin; i < row_end; ++i) {
const int col_idx = part_col_indices[i];
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
y_part_col_indices[offset] = row_idx;
y_part_values[offset] = part_values[i];
current_col_count[col_idx] += 1;
}
}
for (int k = 0; k < n; ++k) {
y_values[x_batch_pointers[j] + k] = y_part_values[k];
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
}
}
// conjugation is a no-op for non-complex value types, so there is nothing more to do here
auto output = ctx.Output(2);
auto output_shape = output->GetTensorShape();
if (rank == ktwo) {
output_shape->SetDimSizes({num_cols + 1});
} else {
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
}
output->SetTensorShape(output_shape.get());
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx) {
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
auto rank = ctx.Input(0)->NumElements();
if (rank == krankwithbatch) {
y_dense_shape[0] = x_dense_shape[0];
y_dense_shape[1] = x_dense_shape[ktwo];
y_dense_shape[ktwo] = x_dense_shape[1];
} else {
y_dense_shape[0] = x_dense_shape[1];
y_dense_shape[1] = x_dense_shape[0];
}
auto batch_pointers = ctx.Input(1)->NumElements();
for (int i = 0; i < batch_pointers; ++i) {
y_batch_pointers[i] = x_batch_pointers[i];
}
auto num_rows = x_dense_shape[rank - 2];
auto num_cols = x_dense_shape[rank - 1];
auto num_batch = ctx.Input(1)->NumElements() - 1;
int y_part_row_pointers[num_cols + 1];
int part_row_pointers[num_rows + 1];
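// Same per-batch CSR transpose as the real-typed path; the values are additionally conjugated
// afterwards when the "conjugate" attribute is set.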
for (int j = 0; j < num_batch; ++j) {
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
valueT part_values[n];
indiceT part_col_indices[n];
indiceT y_part_col_indices[n];
valueT y_part_values[n];
for (int i = 0; i < num_cols + 1; ++i) {
y_part_row_pointers[i] = 0;
}
for (int k = 0; k < num_rows + 1; ++k) {
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
for (int64_t i = 0; i < n; ++i) {
y_part_row_pointers[part_col_indices[i] + 1] += 1;
}
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
for (int k = 0; k < num_cols + 1; ++k) {
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
std::vector<int> current_col_count(num_cols);
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
const int64_t row_begin = part_row_pointers[row_idx];
const int64_t row_end = part_row_pointers[row_idx + 1];
for (int64_t i = row_begin; i < row_end; ++i) {
const int col_idx = part_col_indices[i];
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
y_part_col_indices[offset] = row_idx;
y_part_values[offset] = part_values[i];
current_col_count[col_idx] += 1;
}
}
for (int k = 0; k < n; ++k) {
y_values[x_batch_pointers[j] + k] = y_part_values[k];
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
}
}
if (conjugate == true) {
for (int i = 0; i < ctx.Input(kfour)->GetTensorShape()->NumElements(); ++i) {
y_values[i] = std::conj(y_values[i]);
}
}
auto output = ctx.Output(2);
auto output_shape = output->GetTensorShape();
if (rank == ktwo) {
output_shape->SetDimSizes({num_cols + 1});
} else {
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
}
output->SetTensorShape(output_shape.get());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SPARSEMATRIXTRANSPOSE, SparseMatrixTransposeCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
#include "Eigen/SparseCore"
namespace aicpu {
class SparseMatrixTransposeCpuKernel : public CpuKernel {
public:
~SparseMatrixTransposeCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SparseMatrixTransposeParamCheck(CpuKernelContext &ctx);
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCompute(CpuKernelContext &ctx);
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,180 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_reshape.h"
#include <vector>
#include "cpu_kernel_utils.h"
#include "securec.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kSparseReshapeInputNum = 3;
constexpr uint32_t kSparseReshapeOutputNum = 2;
const char *kSparseReshape = "SparseReshape";
// when the input data size is more than kParallelDataNumSameShape, use the parallel path
const int64_t kParallelDataNumSameShape = 24 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
} // namespace
namespace aicpu {
void SparseReshapeCpuKernel::SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0,
const int64_t *input_strides, const int64_t *output_strides,
const int64_t input_rank, const int64_t output_rank) {
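// Flatten every sparse index into a linear offset with the input strides, then decompose that
// offset back into coordinates of the requested shape with the output strides.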
for (int i = start; i < end; i++) {
int64_t id = 0;
for (int j = 0; j < input_rank; j++) {
id += *(in0 + i * input_rank + j) * input_strides[j];
}
for (int j = 0; j < output_rank; j++) {
*(out0 + i * output_rank + j) = id / output_strides[j];
id %= output_strides[j];
}
}
}
uint32_t SparseReshapeCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSparseReshapeInputNum, kSparseReshapeOutputNum), "[%s] check params failed.",
kSparseReshape);
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *input_2 = ctx.Input(2);
Tensor *output_0 = ctx.Output(0);
Tensor *output_1 = ctx.Output(1);
KERNEL_CHECK_FALSE(
(input_0->GetDataType() == DT_INT64 && input_1->GetDataType() == DT_INT64 && input_2->GetDataType() == DT_INT64 &&
output_0->GetDataType() == DT_INT64 && output_1->GetDataType() == DT_INT64),
KERNEL_STATUS_INNER_ERROR, "the data of SparseReshape kernel must be DT_INT64.");
KERNEL_CHECK_FALSE((input_0->GetTensorShape()->GetDimSize(1) == input_1->GetTensorShape()->GetDimSize(0)),
KERNEL_STATUS_INNER_ERROR, "Input tensor rank must match input shape length.");
int64_t *in0 = reinterpret_cast<int64_t *>(input_0->GetData());
int64_t *in1 = reinterpret_cast<int64_t *>(input_1->GetData());
int64_t *in2 = reinterpret_cast<int64_t *>(input_2->GetData());
int64_t *out0 = reinterpret_cast<int64_t *>(output_0->GetData());
int64_t *out1 = reinterpret_cast<int64_t *>(output_1->GetData());
const int64_t input_rank = input_1->NumElements();
const int64_t output_rank = input_2->NumElements();
const int64_t nnz = input_0->GetTensorShape()->GetDimSize(0);
int64_t dense_size = 1;
int64_t product = 1;
int64_t out_num = 1;
int unknown_index = -1;
for (int i = 0; i < input_rank; i++) {
dense_size *= *(in1 + i);
}
for (int d = 0; d < output_rank; d++) {
const int64_t size = *(in2 + d);
if (size == -1) {
KERNEL_CHECK_FALSE((unknown_index == -1), KERNEL_STATUS_INNER_ERROR,
"only one output dimension may be -1, "
"not both [%d] and [%d]",
unknown_index, d);
unknown_index = d;
} else {
KERNEL_CHECK_FALSE((size >= 0), KERNEL_STATUS_INNER_ERROR, "size [%d] must be non-negative, not [%ld]", d, size);
product *= size;
*(out1 + d) = size;
out_num *= size;
}
}
if (unknown_index != -1) {
KERNEL_CHECK_FALSE((product >= 0), KERNEL_STATUS_INNER_ERROR,
"reshape cannot infer the missing "
"input size for an empty tensor unless all "
"specified input sizes are non-zero");
const int64_t missing = dense_size / product;
KERNEL_CHECK_FALSE((product * missing == dense_size), KERNEL_STATUS_INNER_ERROR,
"Input to reshape is a SparseTensor with [%ld]"
" dense values, but the requested shape requires"
" a multiple of [%ld].",
dense_size, product);
out_num *= missing;
*(out1 + unknown_index) = missing;
}
KERNEL_CHECK_FALSE((out_num == dense_size), KERNEL_STATUS_INNER_ERROR,
"Input to reshape is a tensor with [%ld]"
" dense values, but the requested shape has [%ld].",
dense_size, out_num);
int64_t input_size = input_0->GetDataSize();
int64_t output_size = output_0->GetDataSize();
if (input_size == output_size && input_rank == output_rank) {
bool flag = true;
for (int64_t i = 0; i < input_rank; ++i) {
if (*(in1 + i) != *(out1 + i)) {
flag = false;
break;
}
}
if (flag) {
auto mem_ret = memcpy_s(out0, output_size, in0, input_size);
KERNEL_CHECK_FALSE(mem_ret == EOK, KERNEL_STATUS_INNER_ERROR,
"[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kSparseReshape, output_size,
input_size);
return KERNEL_STATUS_OK;
}
}
if (nnz <= 0) return KERNEL_STATUS_OK;
int64_t *input_strides = new int64_t[input_rank];
int64_t *output_strides = new int64_t[output_rank];
if (input_rank > 0) {
input_strides[input_rank - 1] = 1;
for (int d = input_rank - 2; d >= 0; d--) {
input_strides[d] = input_strides[d + 1] * *(in1 + d + 1);
}
}
if (output_rank > 0) {
output_strides[output_rank - 1] = 1;
for (int d = output_rank - 2; d >= 0; d--) {
output_strides[d] = output_strides[d + 1] * *(out1 + d + 1);
}
}
if (nnz * input_rank >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
KERNEL_CHECK_FALSE(max_core_num != 0, KERNEL_STATUS_INNER_ERROR, "core num should not be 0.");
if (nnz * input_rank <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > nnz) {
max_core_num = nnz;
}
auto sharder_sparse_reshape = [&](int64_t start, int64_t end) {
SpecialCompute(start, end, in0, out0, input_strides, output_strides, input_rank, output_rank);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, nnz, nnz / max_core_num, sharder_sparse_reshape),
"SparseReshape Compute failed.");
} else {
SpecialCompute(0, nnz, in0, out0, input_strides, output_strides, input_rank, output_rank);
}
delete[] input_strides;
delete[] output_strides;
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseReshape, SparseReshapeCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseReshapeCpuKernel : public CpuKernel {
public:
~SparseReshapeCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
void SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0, const int64_t *input_strides,
const int64_t *output_strides, const int64_t input_rank, const int64_t output_rank);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,136 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sqrt_n_grad.h"
#include "Eigen/Core"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSqrtNGrad = "SparseSegmentSqrtNGrad";
} // namespace
namespace aicpu {
uint32_t SparseSegmentSqrtNGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseSegmentSqrtNGrad check input and output number failed.");
Tensor *inputx = ctx.Input(0);
Tensor *input_indices = ctx.Input(1);
Tensor *input_segment_ids = ctx.Input(2);
Tensor *input_output_dim = ctx.Input(3);
auto data_type0 = inputx->GetDataType();
auto data_type1 = input_indices->GetDataType();
auto data_type2 = input_segment_ids->GetDataType();
auto data_type3 = input_output_dim->GetDataType();
if (data_type0 != DT_FLOAT && data_type0 != DT_DOUBLE && data_type0 != DT_FLOAT16) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type1 != data_type2 || data_type1 != data_type3 || data_type1 != DT_INT32) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.", data_type1);
return KERNEL_STATUS_PARAM_INVALID;
}
auto shape0 = inputx->GetTensorShape();
auto shape1 = input_indices->GetTensorShape();
auto shape2 = input_segment_ids->GetTensorShape();
auto scalarshape = input_output_dim->GetTensorShape();
if (shape0->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape1->NumElements() != shape2->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type0 == DT_FLOAT) {
return ComputeKernal<float>(ctx);
} else if (data_type0 == DT_DOUBLE) {
return ComputeKernal<double>(ctx);
} else {
return ComputeKernal<Eigen::half>(ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSegmentSqrtNGradCpuKernel::ComputeKernal(CpuKernelContext &ctx) {
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
int l = ctx.Output(0)->GetTensorShape()->GetDimSize(0);
auto x_addr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto indices_addr = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
auto segment_ids_addr = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
int k = *reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
auto y_addr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = k;
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
const size_t tensor_dim = 2;
Eigen::TensorMap<Eigen::Tensor<T, tensor_dim>, Eigen::Aligned> res_map(y_addr, l, n);
res_map.setZero();
for (size_t i = 1; i < m; i++) {
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
KERNEL_LOG_ERROR("Segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < m; i++) {
if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("Indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_addr[i] >= k) {
KERNEL_LOG_ERROR("Segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
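// Distribute the incoming gradient: each entry of a finished segment adds x[segment_id] / sqrt(segment_size) into the output row selected by its index.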
int beginindex = segment_ids_addr[0];
size_t countnum = 1;
for (size_t i = 1; i < m; i++) {
if (segment_ids_addr[i] == beginindex) {
countnum++;
continue;
}
for (size_t j = 1; j <= countnum; j++) {
for (size_t l = 0; l < n; l++) {
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
}
}
beginindex = segment_ids_addr[i];
countnum = 1;
}
int i = m;
for (size_t j = 1; j <= countnum; j++) {
for (size_t l = 0; l < n; l++) {
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
}
}
return KERNEL_STATUS_OK;
};
REGISTER_CPU_KERNEL(SparseSegmentSqrtNGrad, SparseSegmentSqrtNGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentSqrtNGradCpuKernel : public CpuKernel {
public:
SparseSegmentSqrtNGradCpuKernel() = default;
~SparseSegmentSqrtNGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t ComputeKernal(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,186 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sqrt_n_with_num_segments.h"
#include <math.h>
#include "Eigen/Core"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSqrtNWithNumSegments = "SparseSegmentSqrtNWithNumSegments";
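// Dispatch on the value type plus the three index-like inputs (indices, segment_ids, num_segments), each of which may be int32 or int64.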
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, DTYPE_3, CTX) \
case (DTYPE): \
if ((DTYPE_1) == DT_INT32) { \
if ((DTYPE_2) == DT_INT32) { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int32_t, int32_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int32_t, int32_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int32_t, int64_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int32_t, int64_t, int64_t>(CTX); \
} \
} \
} else { \
if ((DTYPE_2) == DT_INT32) { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int64_t, int32_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int64_t, int32_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int64_t, int64_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int64_t, int64_t, int64_t>(CTX); \
} \
} \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtNWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices->NumElements() != segment_ids->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(segment_ids_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (num_segments_data_type != DT_INT32 && num_segments_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(num_segments_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
default:
KERNEL_LOG_ERROR(
"SparseSegmentSqrtNWithNumSegments kernel data type [%s] not "
"support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(SparseSegmentSqrtNWithNumSegments, SparseSegmentSqrtNWithNumSegmentsCpuKernel);
template <typename T1, typename T2, typename T3, typename T4>
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Computekernel(CpuKernelContext &ctx) {
int n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int m = ctx.Input(2)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<T4 *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (int64_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < ctx.Output(0)->GetTensorShape()->NumElements(); i++) {
y_ptr[i] = (T1)0;
}
int oldindex = -1;
int countnum = 0;
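// Accumulate the x rows selected by indices into their segment's output row, then scale each finished segment by 1 / sqrt(segment length).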
for (int64_t i = 0; i < m; i++) {
if (oldindex == segment_ids_ptr[i]) {
countnum++;
} else if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
}
countnum = 1;
oldindex = segment_ids_ptr[i];
} else {
countnum = 1;
oldindex = segment_ids_ptr[i];
}
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSegmentSqrtNWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentSqrtNWithNumSegmentsCpuKernel() = default;
~SparseSegmentSqrtNWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2, typename T3, typename T4>
uint32_t Computekernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,152 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sum_with_num_segments.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSumWithNumSegments = "SparseSegmentSumWithNumSegments";
#define COMPUTE_CASE(DTYPE, TYPE, ITYPE, CTX) \
case (DTYPE): \
if ((ITYPE) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t>(CTX); \
} else { \
return ComputeKernel<TYPE, int64_t>(CTX); \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSumWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
if (x->GetDataSize() == 0 || indices->GetDataSize() == 0 || segment_ids->GetDataSize() == 0 ||
num_segments->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != indices_data_type || num_segments_data_type != indices_data_type) {
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type mismatch.");
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_INT8, int8_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT16, int16_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT32, int32_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT64, int64_t, indices_data_type, ctx)
COMPUTE_CASE(DT_UINT8, uint8_t, indices_data_type, ctx)
COMPUTE_CASE(DT_UINT16, uint16_t, indices_data_type, ctx)
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, ctx)
default:
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseSegmentSumWithNumSegments, SparseSegmentSumWithNumSegmentsCpuKernel);
template <typename dataT, typename indicesT>
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
size_t num_elements = ctx.Output(0)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<dataT *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<indicesT *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<indicesT *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<indicesT *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<dataT *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (size_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < num_elements; i++) {
y_ptr[i] = (dataT)0;
}
int oldindex = -1;
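// Sum the x rows selected by indices into the output row given by the current segment id.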
for (size_t i = 0; i < m; i++) {
if (oldindex != segment_ids_ptr[i]) {
oldindex = segment_ids_ptr[i];
for (size_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] = (dataT)0;
}
}
for (size_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
return KERNEL_STATUS_OK;
};
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentSumWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentSumWithNumSegmentsCpuKernel() = default;
~SparseSegmentSumWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename dataT, typename indicesT>
static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,253 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_softmax_cross_entropy_with_logits.h"
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseSoftmaxCrossEntropyWithLogits = "SparseSoftmaxCrossEntropyWithLogits";
const uint32_t kOutputNum{2};
const uint32_t kInputNum{2};
const uint32_t kDimSizeTwo{2};
const uint32_t kDimSizeOne{1};
const uint32_t kParallelDataNum{2048};
} // namespace
namespace aicpu {
template <typename data_type, typename label_type>
void SparseSoftmaxCrossEntropyWithLogitsSingleOp(data_type *input_features, label_type *input_labels,
data_type *output_loss, data_type *output_backprop, int64_t batch_size,
int64_t classes_num, size_t features_total) {
double_t *dims_exp_sum = static_cast<double_t *>(malloc(batch_size * sizeof(double_t)));
data_type *dims_maximum = static_cast<data_type *>(malloc(batch_size * sizeof(data_type)));
memset(dims_exp_sum, 0, batch_size * sizeof(double_t));
Eigen::TensorMap<Eigen::Tensor<data_type, kDimSizeTwo>, Eigen::Aligned> logits(input_features, batch_size,
classes_num);
Eigen::TensorMap<Eigen::Tensor<double_t, 1>, Eigen::Aligned> dims_sum(dims_exp_sum, batch_size);
Eigen::TensorMap<Eigen::Tensor<data_type, 1>, Eigen::Aligned> dims_max(dims_maximum, batch_size);
Eigen::array<int, 1> axes{{1}};
// compute softmax
dims_max = logits.maximum(axes);
const data_type constant_one(1.0);
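// Exponentiate with the per-row maximum subtracted (log-sum-exp trick) to avoid overflow while accumulating the per-row sums.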
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
output_backprop[index] = Eigen::numext::exp(input_features[index] - dims_maximum[batch_idx]);
dims_exp_sum[batch_idx] += static_cast<double_t>(output_backprop[index]);
if ((index + 1) % classes_num == 0) {
batch_idx++;
}
}
dims_sum = dims_sum.inverse();
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
*(output_backprop + index) =
static_cast<data_type>(static_cast<double_t>(*(output_backprop + index)) * dims_exp_sum[batch_idx]);
if ((index + 1) % classes_num == 0) {
batch_idx++;
}
}
label_type offset = 0;
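// Per-example loss is -log(softmax[label]); the backprop buffer becomes softmax minus the one-hot label.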
for (int64_t index = 0, batch_base = 0; index < batch_size; ++index, batch_base += classes_num) {
offset = input_labels[index];
*(output_loss + index) = -Eigen::numext::log(*(output_backprop + batch_base + offset));
*(output_backprop + batch_base + offset) = *(output_backprop + batch_base + offset) - constant_one;
}
free(dims_exp_sum);
free(dims_maximum);
}
template <typename data_type, typename label_type>
void SparseSoftmaxCrossEntropyWithLogitsMultiOp(data_type *input_features, label_type *input_labels,
data_type *output_loss, data_type *output_backprop, size_t begin,
size_t end, int64_t classes_num, size_t features_total) {
for (size_t index = begin; index < end; index++) {
size_t batch_begin = index * classes_num;
size_t batch_end = batch_begin + classes_num;
data_type max_value = input_features[batch_begin];
double_t sum_value{0};
data_type constant_one{1};
for (size_t idx = batch_begin; idx < batch_end; idx++) {
if (max_value < input_features[idx]) {
max_value = input_features[idx];
}
}
for (size_t idx = batch_begin; idx < batch_end; idx++) {
output_backprop[idx] = Eigen::numext::exp(input_features[idx] - max_value);
sum_value += static_cast<double_t>(output_backprop[idx]);
}
sum_value = double_t(1.0) / sum_value;
for (size_t idx = batch_begin; idx < batch_end; idx++) {
output_backprop[idx] = static_cast<data_type>(static_cast<double_t>(output_backprop[idx]) * sum_value);
if (idx % classes_num == static_cast<size_t>(input_labels[index])) {
output_loss[index] = -Eigen::numext::log(output_backprop[idx]);
output_backprop[idx] = output_backprop[idx] - constant_one;
}
}
}
}
std::uint32_t SparseSoftmaxCrossEntropyWithLogitsExtraCheck(CpuKernelContext &ctx) {
Tensor *input_features = ctx.Input(0);
Tensor *input_labels = ctx.Input(1);
Tensor *output_loss = ctx.Output(0);
Tensor *output_backprop = ctx.Output(1);
std::vector<int64_t> features_dims = input_features->GetTensorShape()->GetDimSizes();
std::vector<int64_t> labels_dims = input_labels->GetTensorShape()->GetDimSizes();
std::vector<int64_t> loss_dims = output_loss->GetTensorShape()->GetDimSizes();
std::vector<int64_t> backprop_dims = output_backprop->GetTensorShape()->GetDimSizes();
if ((input_features->GetDataSize() == 0) || (input_labels->GetDataSize() == 0)) {
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_features->GetDataType() != output_loss->GetDataType() ||
input_features->GetDataType() != output_backprop->GetDataType()) {
KERNEL_LOG_ERROR(
"The data type of the input features [%s], output loss [%s], output "
"backprop [%s] must be the same type.",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Output(1)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_labels->GetDataType() != DT_INT32 && input_labels->GetDataType() != DT_INT64) {
KERNEL_LOG_ERROR(
"The data type of the input labels [%s], must be the int32 or int64 "
"type.",
DTypeStr(ctx.Input(1)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (features_dims.size() != kDimSizeTwo || labels_dims.size() != kDimSizeOne || loss_dims.size() != kDimSizeOne ||
backprop_dims.size() != kDimSizeTwo) {
KERNEL_LOG_ERROR(
"The dims of the input features [%d], output backprop [%d] must be "
"[batch_size x num_classes]. the dims of input labels [%d], output "
"loss [%d] must be [batch_size].",
features_dims.size(), backprop_dims.size(), labels_dims.size(), loss_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t batch_size = features_dims[0];
int64_t num_classes = features_dims[1];
if (labels_dims[0] != batch_size) {
KERNEL_LOG_ERROR("the size of label must be equal with batch_size[%d]", batch_size);
return KERNEL_STATUS_PARAM_INVALID;
}
if (loss_dims[0] != batch_size) {
KERNEL_LOG_ERROR("the size of loss must be equal with batch_size[%d]", batch_size);
return KERNEL_STATUS_PARAM_INVALID;
}
if (backprop_dims[0] != batch_size || backprop_dims[1] != num_classes) {
KERNEL_LOG_ERROR("the size of label must be equal with [%d x %d], but get [%d x %d]", batch_size, num_classes,
backprop_dims[0], backprop_dims[1]);
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename data_type, typename label_type>
inline uint32_t SparseSoftmaxCrossEntropyWithLogitsCompute(const CpuKernelContext &ctx) {
size_t features_total = static_cast<size_t>(ctx.Input(0)->NumElements());
uint64_t total_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
auto *input_features = static_cast<data_type *>(ctx.Input(0)->GetData());
auto *input_labels = static_cast<label_type *>(ctx.Input(1)->GetData());
auto *output_loss = static_cast<data_type *>(ctx.Output(0)->GetData());
auto *output_backprop = static_cast<data_type *>(ctx.Output(1)->GetData());
bool multi_core_flag = false;
if (total_size > kParallelDataNum * sizeof(data_type)) {
multi_core_flag = true;
}
std::vector<std::int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<std::int64_t> labels_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
for (int64_t idx = 0; idx < labels_dims[0]; idx++) {
if (input_labels[idx] >= dims[1]) {
KERNEL_LOG_ERROR(
"Received a label value of [%d] which is outside the valid range of "
"[0, %d).",
input_labels[idx], dims[1]);
return KERNEL_STATUS_PARAM_INVALID;
}
}
// Determine whether to enable multi-core parallel computing
size_t pivot, classes_num;
int64_t batch_size{1};
pivot = dims.size() - 1;
classes_num = dims[pivot];
for (size_t index = 0; index < dims.size(); index++) {
if (index < pivot) {
batch_size *= dims[index];
}
}
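// batch_size is the product of all leading dimensions; the trailing dimension holds the class count.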
// Eigen::Array
if (multi_core_flag) {
std::int64_t per_unit_size{batch_size / std::min(std::max(1L, cores - 2L), batch_size)};
auto shard = [&](size_t begin, size_t end) {
SparseSoftmaxCrossEntropyWithLogitsMultiOp(input_features, input_labels, output_loss, output_backprop, begin, end,
classes_num, features_total);
};
CpuKernelUtils::ParallelFor(ctx, batch_size, per_unit_size, shard);
} else if (cores != 0) {
SparseSoftmaxCrossEntropyWithLogitsSingleOp<data_type, label_type>(
input_features, input_labels, output_loss, output_backprop, batch_size, classes_num, features_total);
} else {
KERNEL_LOG_ERROR("SparseSoftmaxCrossEntropyWithLogits compute failed.");
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSoftmaxCrossEntropyWithLogitsCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (SparseSoftmaxCrossEntropyWithLogitsExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
// choose compute function depend on dataType
auto data_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
auto labels_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
switch (data_type) {
case DT_FLOAT16: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int64_t>(ctx);
}
}
case DT_FLOAT: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int64_t>(ctx);
}
}
case DT_DOUBLE: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int64_t>(ctx);
}
}
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kSparseSoftmaxCrossEntropyWithLogits, SparseSoftmaxCrossEntropyWithLogitsCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseSoftmaxCrossEntropyWithLogitsCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,241 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_sparse_maximum.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseSparseMaximum = "SparseSparseMaximum";
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 6;
constexpr int64_t kIndex0 = 0;
constexpr int64_t kIndex1 = 1;
constexpr int64_t kIndex2 = 2;
constexpr int64_t kIndex3 = 3;
constexpr int64_t kIndex4 = 4;
constexpr int64_t kIndex5 = 5;
bool isMatrix(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 2; }
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
} // namespace
// Define namespace aicpu
namespace aicpu {
uint32_t SparseMaximumCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
databank.a_indices_t = ctx.Input(kIndex0);
databank.a_values_t = ctx.Input(kIndex1);
databank.a_shape_t = ctx.Input(kIndex2);
databank.b_indices_t = ctx.Input(kIndex3);
databank.b_values_t = ctx.Input(kIndex4);
databank.b_shape_t = ctx.Input(kIndex5);
databank.output_indices_t = ctx.Output(kIndex0);
databank.output_values_t = ctx.Output(kIndex1);
KERNEL_CHECK_FALSE(
isMatrix(databank.a_indices_t->GetTensorShape()) && isMatrix(databank.b_indices_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID,
"Inputs a_indices and b_indices should be "
"matrices but received shapes: [%d], [%d]",
databank.a_indices_t->GetTensorShape()->GetDims(), databank.b_indices_t->GetTensorShape()->GetDims());
KERNEL_CHECK_FALSE(isVector(databank.a_values_t->GetTensorShape()) && isVector(databank.b_values_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID,
"Inputs a_values and b_values should be vectors "
"but received shapes: [%d] and [%d]",
databank.a_values_t->GetTensorShape()->GetDims(),
databank.b_values_t->GetTensorShape()->GetDims());
KERNEL_CHECK_FALSE(isVector(databank.a_shape_t->GetTensorShape()) && isVector(databank.b_shape_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be a vector but received shapes [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->GetDims(), databank.b_shape_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_OK;
}
inline static int64_t cmp(const TTypes<int64_t>::Matrix &a_idx, const TTypes<int64_t>::Matrix &b_idx,
const int64_t a_row, const int64_t b_row, const int64_t dims) {
for (int d = 0; d < dims; ++d) {
const int64_t a = a_idx(a_row, d);
const int64_t b = b_idx(b_row, d);
if (a < b) {
return -1;
} else if (a > b) {
return 1;
}
}
return 0;
}
template <typename T>
void SparseMaximumCpuKernel::UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
typename TTypes<T>::Flat a_values, int64_t a_nnz,
typename TTypes<int64_t>::Matrix b_indices_mat,
typename TTypes<T>::Flat b_values, int64_t b_nnz,
int64_t num_dims, std::vector<T> *a_augmented_values,
std::vector<T> *b_augmented_values,
std::vector<std::pair<bool, int64_t>> *entries_to_copy) {
entries_to_copy->reserve(a_nnz + b_nnz);
a_augmented_values->reserve(a_nnz);
b_augmented_values->reserve(b_nnz);
int64_t i = 0, j = 0;
const T kZero = T(0);
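// Merge the two lexicographically sorted index lists; an entry present in only one operand contributes an implicit zero for the other.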
while (i < a_nnz && j < b_nnz) {
switch (cmp(a_indices_mat, b_indices_mat, i, j, num_dims)) {
case -1:
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i));
b_augmented_values->push_back(kZero);
++i;
break;
case 0:
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i));
b_augmented_values->push_back(b_values(j));
++i;
++j;
break;
case 1:
entries_to_copy->emplace_back(false, j);
a_augmented_values->push_back(kZero);
b_augmented_values->push_back(b_values(j));
++j;
break;
}
}
// Handles leftovers; at most one loop runs.
while (i < a_nnz) {
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i++));
b_augmented_values->push_back(kZero);
}
while (j < b_nnz) {
entries_to_copy->emplace_back(false, j);
a_augmented_values->push_back(kZero);
b_augmented_values->push_back(b_values(j++));
}
}
template <typename T>
uint32_t SparseMaximumCpuKernel::EigenedSparseMax(DataBank &databank) {
const int64_t a_nnz = databank.a_indices_t->GetTensorShape()->GetDimSize(0);
const int64_t b_nnz = databank.b_indices_t->GetTensorShape()->GetDimSize(0);
EigenTensor a_values_t(databank.a_values_t, databank.a_values_t->GetData());
const auto a_values = a_values_t.vec<T>();
EigenTensor b_values_t(databank.b_values_t, databank.b_values_t->GetData());
const auto b_values = b_values_t.vec<T>();
EigenTensor a_indices_t(databank.a_indices_t, databank.a_indices_t->GetData());
const auto a_indices_mat = a_indices_t.matrix<int64_t>();
EigenTensor b_indices_t(databank.b_indices_t, databank.b_indices_t->GetData());
const auto b_indices_mat = b_indices_t.matrix<int64_t>();
const int64_t num_dims = databank.a_indices_t->GetTensorShape()->GetDimSize(1);
EigenTensor a_shape_t(databank.a_shape_t, databank.a_shape_t->GetData());
const auto a_shape = a_shape_t.flat<int64_t>();
EigenTensor b_shape_t(databank.b_shape_t, databank.b_shape_t->GetData());
const auto b_shape = b_shape_t.flat<int64_t>();
KERNEL_CHECK_FALSE(a_values.size() == a_nnz && b_values.size() == b_nnz, KERNEL_STATUS_PARAM_INVALID,
"Expected [%d] and [%d] non-empty input values, got [%d] and [%d]", a_nnz, b_nnz, a_values.size(),
b_values.size());
KERNEL_CHECK_FALSE(databank.a_shape_t->GetTensorShape()->NumElements() == num_dims, KERNEL_STATUS_PARAM_INVALID,
"Second dimension of a_indices and length of "
"a_shape must match, got [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->NumElements(), num_dims);
KERNEL_CHECK_FALSE(num_dims > 0, KERNEL_STATUS_PARAM_INVALID, "Tensors must not be empty");
KERNEL_CHECK_FALSE(
databank.a_shape_t->GetTensorShape()->NumElements() == databank.b_shape_t->GetTensorShape()->NumElements(),
KERNEL_STATUS_PARAM_INVALID, "Operands do not have the same ranks; got shapes: [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->NumElements(), databank.b_shape_t->GetTensorShape()->NumElements());
for (int i = 0; i < num_dims; ++i) {
KERNEL_CHECK_FALSE(a_shape(i) == b_shape(i), KERNEL_STATUS_PARAM_INVALID,
"Operands' shapes do not match: got [%d] and [%d] for dimension [%d]", a_shape(i), b_shape(i), i)
}
std::vector<T> a_augmented_values, b_augmented_values;
std::vector<std::pair<bool, int64_t>> entries_to_copy; // from_a?, idx
UnionSparseIndicesAndValues(a_indices_mat, a_values, a_nnz, b_indices_mat, b_values, b_nnz, num_dims,
&a_augmented_values, &b_augmented_values, &entries_to_copy);
const int64_t sum_nnz = a_augmented_values.size();
EigenTensor output_values_t(databank.output_values_t, databank.output_values_t->GetData());
EigenTensor output_indices_t(databank.output_indices_t, databank.output_indices_t->GetData());
auto output_indices_mat = output_indices_t.matrix<int64_t>();
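// For each merged entry, copy the index row from whichever operand originally supplied it.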
for (int64_t i = 0; i < sum_nnz; ++i) {
const bool from_a = entries_to_copy[i].first;
const int64_t idx = entries_to_copy[i].second;
output_indices_mat.chip<0>(i) = from_a ? a_indices_mat.chip<0>(idx) : b_indices_mat.chip<0>(idx);
}
using UnalignedTensorMap = Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>, Eigen::Unaligned>;
auto a_augmented_values_t = UnalignedTensorMap(a_augmented_values.data(), sum_nnz);
auto b_augmented_values_t = UnalignedTensorMap(b_augmented_values.data(), sum_nnz);
output_values_t.flat<T>() =
a_augmented_values_t.binaryExpr(b_augmented_values_t, Eigen::internal::scalar_max_op<T, T>());
databank.output_indices_t->GetTensorShape()->SetDimSizes({sum_nnz, num_dims});
databank.output_values_t->GetTensorShape()->SetDimSizes({sum_nnz});
return KERNEL_STATUS_OK;
}
uint32_t SparseMaximumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseSparseMaximum check input and output number failed.");
DataBank databank;
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseSparseMaximum check params failed.");
DataType dt = static_cast<DataType>(databank.output_values_t->GetDataType());
uint32_t KERNEL_STATUS;
switch (dt) {
case DT_INT8:
KERNEL_STATUS = EigenedSparseMax<int8_t>(databank);
break;
case DT_UINT8:
KERNEL_STATUS = EigenedSparseMax<uint8_t>(databank);
break;
case DT_INT16:
KERNEL_STATUS = EigenedSparseMax<int16_t>(databank);
break;
case DT_UINT16:
KERNEL_STATUS = EigenedSparseMax<uint16_t>(databank);
break;
case DT_INT32:
KERNEL_STATUS = EigenedSparseMax<int32_t>(databank);
break;
case DT_INT64:
KERNEL_STATUS = EigenedSparseMax<int64_t>(databank);
break;
case DT_FLOAT16:
KERNEL_STATUS = EigenedSparseMax<Eigen::half>(databank);
break;
case DT_FLOAT:
KERNEL_STATUS = EigenedSparseMax<float>(databank);
break;
case DT_DOUBLE:
KERNEL_STATUS = EigenedSparseMax<double>(databank);
break;
default:
KERNEL_LOG_ERROR("SparseSparseMaximum can't support this data type [%d].", dt);
return KERNEL_STATUS_PARAM_INVALID;
}
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseSparseMaximum failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
// Register the kernel implementation
REGISTER_CPU_KERNEL(kSparseSparseMaximum, SparseMaximumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,59 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_ops_kernel.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
struct DataBank {
DataBank()
: a_indices_t(nullptr),
a_values_t(nullptr),
a_shape_t(nullptr),
b_indices_t(nullptr),
b_values_t(nullptr),
b_shape_t(nullptr) {}
Tensor *a_indices_t;
Tensor *a_values_t;
Tensor *a_shape_t;
Tensor *b_indices_t;
Tensor *b_values_t;
Tensor *b_shape_t;
Tensor *output_indices_t;
Tensor *output_values_t;
};
class SparseMaximumCpuKernel : public CpuKernel {
public:
~SparseMaximumCpuKernel() = default;
SparseMaximumCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static void UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
typename TTypes<T>::Flat a_values, int64_t a_nnz,
typename TTypes<int64_t>::Matrix b_indices_mat,
typename TTypes<T>::Flat b_values, int64_t b_nnz, int64_t num_dims,
std::vector<T> *a_augmented_values, std::vector<T> *b_augmented_values,
std::vector<std::pair<bool, int64_t>> *entries_to_copy);
template <typename T>
uint32_t EigenedSparseMax(DataBank &databank);
static uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
};
} // namespace aicpu

View File

@ -0,0 +1,207 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_sparse_minimum.h"
#include <algorithm>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 6;
const char *kSparseSparseMinimum = "SparseSparseMinimum";
#define SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SparseSparseMinimumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseSparseMinimum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SparseSparseMinimumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSparseMinimum normal check failed.");
const Tensor *x1_indices = ctx.Input(0);
const Tensor *x1_values_t = ctx.Input(1);
const Tensor *x1_shape = ctx.Input(2);
const Tensor *x2_indices = ctx.Input(3);
const Tensor *x2_values_t = ctx.Input(4);
const Tensor *x2_shape = ctx.Input(5);
auto x1_indices_shape = x1_indices->GetTensorShape();
auto x2_indices_shape = x2_indices->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_indices_shape->GetDims() == 2) && (x2_indices_shape->GetDims() == 2)),
KERNEL_STATUS_PARAM_INVALID, "Input indices should be matrices but received dims: %d and %d.",
x1_indices_shape->GetDims(), x2_indices_shape->GetDims())
const int64_t x1_nnz = x1_indices_shape->GetDimSize(0);
const int64_t x2_nnz = x2_indices_shape->GetDimSize(0);
auto x1_values_shape = x1_values_t->GetTensorShape();
auto x2_values_shape = x2_values_t->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_values_shape->GetDims() == 1) && (x2_values_shape->GetDims() == 1)),
KERNEL_STATUS_PARAM_INVALID, "Input values should be vectors but received dims: %d and %d.",
x1_values_shape->GetDims(), x2_values_shape->GetDims())
KERNEL_CHECK_FALSE(((x1_values_t->NumElements() == x1_nnz) && (x2_values_t->NumElements() == x2_nnz)),
KERNEL_STATUS_PARAM_INVALID,
"Expected %d and %d non-empty input values, but received : %d and %d.", x1_nnz, x2_nnz,
x1_values_t->NumElements(), x2_values_t->NumElements())
KERNEL_CHECK_FALSE((x1_values_t->GetDataType() == x2_values_t->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
"Data types of the input values should be the same, but "
"received %d-th and %d-th data type in the DataType enum.",
x1_values_t->GetDataType(), x2_values_t->GetDataType())
auto x1_shape_shape = x1_shape->GetTensorShape();
auto x2_shape_shape = x2_shape->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_shape_shape->GetDims() == 1) && (x2_shape_shape->GetDims() == 1)),
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be vectors but received dims: %d and %d.",
x1_shape_shape->GetDims(), x2_shape_shape->GetDims())
KERNEL_CHECK_FALSE((x1_shape_shape->GetDimSize(0) == x2_shape_shape->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"Operands' should have the same ranks but received: %d and %d.", x1_shape_shape->GetDimSize(0),
x2_shape_shape->GetDimSize(0))
auto shape_x1 = reinterpret_cast<int64_t *>(x1_shape->GetData());
auto shape_x2 = reinterpret_cast<int64_t *>(x2_shape->GetData());
for (int i = 0; i < x1_shape->NumElements(); ++i) {
KERNEL_CHECK_FALSE(shape_x1[i] == shape_x2[i], KERNEL_STATUS_PARAM_INVALID,
"Operands' shapes do not match: got %d and %d for dimension %d", shape_x1[i], shape_x2[i], i)
}
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SparseSparseMinimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
int SparseSparseMinimumCpuKernel::cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row) {
for (int d = 0; d < dims; ++d) {
const int64_t x = x_idx(x_row, d);
const int64_t y = y_idx(y_row, d);
if (x < y) {
return -1;
} else if (x > y) {
return 1;
}
}
return 0;
}
template <typename T>
uint32_t SparseSparseMinimumCpuKernel::SparseSparseMinimumCompute(CpuKernelContext &ctx) {
const EigenTensor x1_indices_ET(ctx.Input(0), ctx.Input(0)->GetData());
const EigenTensor x2_indices_ET(ctx.Input(3), ctx.Input(3)->GetData());
auto x1_indices_mat = x1_indices_ET.matrix<int64_t>();
auto x2_indices_mat = x2_indices_ET.matrix<int64_t>();
const int64_t x1_nnz = x1_indices_mat.dimension(0);
const int64_t x2_nnz = x2_indices_mat.dimension(0);
std::vector<std::pair<bool, int64_t>> entries_to_copy;
entries_to_copy.reserve(x1_nnz + x2_nnz);
std::vector<T> out_values;
const int num_dims = ctx.Input(2)->GetTensorShape()->GetDimSize(0);
EigenTensor x1_values_ET(ctx.Input(1), ctx.Input(1)->GetData());
EigenTensor x2_values_ET(ctx.Input(4), ctx.Input(4)->GetData());
auto x1_values = x1_values_ET.vec<T>();
auto x2_values = x2_values_ET.vec<T>();
int64_t i = 0, j = 0;
T s;
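// Merge the two sorted index lists; a coordinate present in only one input is compared against an implicit zero.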
while (i < x1_nnz && j < x2_nnz) {
switch (cmp(x1_indices_mat, i, num_dims, x2_indices_mat, j)) {
case -1:
s = std::min(x1_values(i), T(0));
entries_to_copy.emplace_back(true, i);
out_values.push_back(s);
++i;
break;
case 0:
s = std::min(x1_values(i), x2_values(j));
entries_to_copy.emplace_back(true, i);
out_values.push_back(s);
++i;
++j;
break;
case 1:
s = std::min(T(0), x2_values(j));
entries_to_copy.emplace_back(false, j);
out_values.push_back(s);
++j;
break;
default:
KERNEL_LOG_ERROR("Some inner error happens in the SparseSparseMinimum computation.");
return KERNEL_STATUS_INNER_ERROR;
}
}
#define HANDLE_LEFTOVERS(X1_OR_X2, IDX, IS_A) \
while ((IDX) < X1_OR_X2##_nnz) { \
entries_to_copy.emplace_back(IS_A, IDX); \
s = std::min((X1_OR_X2##_values)((IDX)), T(0)); \
out_values.push_back(s); \
++(IDX); \
}
HANDLE_LEFTOVERS(x1, i, true);
HANDLE_LEFTOVERS(x2, j, false);
#undef HANDLE_LEFTOVERS
const int64_t y_nnz = out_values.size();
Tensor *out_indices_t = ctx.Output(0);
EigenTensor out_indices_ET(out_indices_t, out_indices_t->GetData());
auto out_indices_mat = out_indices_ET.matrix<int64_t>();
for (int64_t i = 0; i < y_nnz; ++i) {
const bool from_x1 = entries_to_copy[i].first;
const int64_t idx = entries_to_copy[i].second;
out_indices_mat.chip<0>(i) = from_x1 ? x1_indices_mat.chip<0>(idx) : x2_indices_mat.chip<0>(idx);
}
std::vector<int64_t> indices_dims = {y_nnz, num_dims};
auto out_indices_shape = out_indices_t->GetTensorShape();
out_indices_shape->SetDimSizes(indices_dims);
out_indices_t->SetTensorShape(out_indices_shape.get());
Tensor *out_values_t = ctx.Output(1);
EigenTensor out_values_ET(out_values_t, out_values_t->GetData());
auto out_values_flat = out_values_ET.vec<T>();
if (y_nnz > 0) {
std::copy_n(out_values.begin(), y_nnz, &out_values_flat(0));
}
std::vector<int64_t> values_dims = {y_nnz};
auto out_values_shape = out_values_t->GetTensorShape();
out_values_shape->SetDimSizes(values_dims);
out_values_t->SetTensorShape(out_values_shape.get());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSparseMinimum, SparseSparseMinimumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
#include "cpu_ops_kernel.h"
#include "eigen_tensor.h"
namespace aicpu {
class SparseSparseMinimumCpuKernel : public CpuKernel {
public:
SparseSparseMinimumCpuKernel() = default;
~SparseSparseMinimumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t SparseSparseMinimumCompute(CpuKernelContext &ctx);
static int cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,301 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparseaddmm.h"
#include <securec.h>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 7;
const char *kSparseAddmm = "SparseAddmm";
constexpr int64_t kParallelDataNums = 16;
#define SPARSEADDMM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
if (indices_type == DT_INT64) { \
uint32_t result = SparseAddmmCompute<TYPE, int64_t>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
return result; \
} \
break; \
} else { \
uint32_t result = SparseAddmmCompute<TYPE, int32_t>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
return result; \
} \
break; \
} \
}
} // namespace
namespace aicpu {
uint32_t SparseAddmmCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSparseAddmm);
KERNEL_HANDLE_ERROR(SparseAddmmCheck(ctx), "[%s] check params failed.", kSparseAddmm);
DataType data_type = ctx.Input(1)->GetDataType();
DataType data_type1 = ctx.Input(3)->GetDataType();
DataType indices_type = ctx.Input(0)->GetDataType();
if (data_type != data_type1) {
KERNEL_LOG_ERROR(
"sparse data type is no equal dense data type, sparsetype [%d], "
"densetype [%d].",
data_type, data_type1);
return KERNEL_STATUS_PARAM_INVALID;
}
switch (data_type) {
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("SparseAddmm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseAddmmCpuKernel::SparseAddmmCheck(CpuKernelContext &ctx) {
Tensor *indices_tensor = ctx.Input(0);
Tensor *values_tensor = ctx.Input(1);
Tensor *shape_tensor = ctx.Input(2);
Tensor *dense_tensor = ctx.Input(3);
Tensor *alpha_tensor = ctx.Input(5);
Tensor *beta_tensor = ctx.Input(6);
if (alpha_tensor->GetTensorShape()->NumElements() != 1) {
KERNEL_LOG_ERROR(
"alpha_tensor should be a number,but got NumElements "
"[%d].",
alpha_tensor->GetTensorShape()->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
if (beta_tensor->GetTensorShape()->NumElements() != 1) {
KERNEL_LOG_ERROR(
"beta_tensor should be a number,but got NumElements "
"[%d].",
beta_tensor->GetTensorShape()->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
// valid shape nullptr
auto sparse_shape = shape_tensor->GetTensorShape();
auto values_shape = values_tensor->GetTensorShape();
auto dense_tensor_shape = dense_tensor->GetTensorShape();
auto indices_shape = indices_tensor->GetTensorShape();
// sparse_indices
if (indices_shape->GetDims() > 2) {
KERNEL_LOG_ERROR(
"Sparse_indices should be a scalar, vector, or matrix, got dim "
"size [%d].",
indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
const int64_t elems_num = indices_shape->GetDims() > 0 ? indices_shape->GetDimSize(0) : 1;
const int64_t dims_num = indices_shape->GetDims() > 1 ? indices_shape->GetDimSize(1) : 1;
// output_shape
if (sparse_shape->GetDims() != 1) {
KERNEL_LOG_ERROR("Sparse_shape should be a vector, got dim size [%d].", sparse_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_tensor->NumElements() != dims_num) {
KERNEL_LOG_ERROR("Sparse_shape has incorrect number of elements [%lld], should be [%lld]",
shape_tensor->NumElements(), dims_num);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
int32_t IndiceType = indices_tensor->GetDataType();
int32_t ShapeType = shape_tensor->GetDataType();
  bool invalidIndiceType = ((IndiceType != DT_INT32) && (IndiceType != DT_INT64));
  bool invalidShapeType = ((ShapeType != DT_INT32) && (ShapeType != DT_INT64));
  if (invalidShapeType || invalidIndiceType) {
    KERNEL_LOG_ERROR(
      "Indices or sparse shape data type must be DT_INT32 or DT_INT64, got indice type [%d], "
      "shape type [%d].",
      IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_values
int32_t values_dims_size = values_shape->GetDims();
if ((values_dims_size != 0) && (values_dims_size != 1)) {
KERNEL_LOG_ERROR("Values_shape should be a scalar or a vector, got dim size [%d].", values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((values_dims_size == 1) && (values_tensor->NumElements() != elems_num)) {
KERNEL_LOG_ERROR("Values_shape has incorrect number of elements [%lld], should be [%lld]",
values_tensor->NumElements(), elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T1>
uint32_t SparseAddmmCpuKernel::SparseAddmmCompute(CpuKernelContext &ctx) {
auto *indices_tensor = ctx.Input(0);
auto *values_tensor = ctx.Input(1);
auto *shape_tensor = ctx.Input(2);
auto *dense_tensor = ctx.Input(3);
auto *x3_dense_tensor = ctx.Input(4);
auto *alpha_tensor = ctx.Input(5);
auto *beta_tensor = ctx.Input(6);
auto *output_tensor = ctx.Output(0);
auto values = reinterpret_cast<T *>(values_tensor->GetData());
auto dense_data = reinterpret_cast<T *>(dense_tensor->GetData());
auto x3_dense_data = reinterpret_cast<T *>(x3_dense_tensor->GetData());
auto alpha = reinterpret_cast<T *>(alpha_tensor->GetData());
auto beta = reinterpret_cast<T *>(beta_tensor->GetData());
auto y = reinterpret_cast<T *>(output_tensor->GetData());
std::vector<int64_t> temp_shape;
for (int32_t index = 0; index < shape_tensor->GetTensorShape()->GetDimSize(0); ++index) {
if (shape_tensor->GetDataType() == DT_INT32) {
int32_t *temp_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
temp_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
} else {
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
temp_shape.emplace_back(temp_dim[index]);
}
}
const int64_t row_x1 = temp_shape[0];
const int64_t col_x1 = temp_shape[1];
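  // Materialize the sparse COO input x1 (indices/values) into a dense Eigen row-major matrix.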
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> sparse(row_x1, col_x1);
sparse.setZero(row_x1, col_x1);
std::vector<int64_t> temp_indices;
auto indices_one = indices_tensor->GetTensorShape()->GetDimSize(0);
auto indices_two = indices_tensor->GetTensorShape()->GetDimSize(1);
for (int32_t index = 0; index < indices_one; ++index) {
if (indices_tensor->GetDataType() == DT_INT32) {
int32_t *temp_dim = reinterpret_cast<int32_t *>(indices_tensor->GetData());
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 0]));
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 1]));
} else {
int64_t *temp_dim = reinterpret_cast<int64_t *>(indices_tensor->GetData());
temp_indices.emplace_back(temp_dim[index * indices_two + 0]);
temp_indices.emplace_back(temp_dim[index * indices_two + 1]);
}
}
if (indices_one <= kParallelDataNums) {
for (int64_t i = 0; i < indices_one; i++) {
int64_t row = temp_indices[i * indices_two + 0];
int64_t col = temp_indices[i * indices_two + 1];
sparse(row, col) = *(values + i);
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardSparse = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t row = temp_indices[i * indices_two + 0];
int64_t col = temp_indices[i * indices_two + 1];
sparse(row, col) = *(values + i);
}
};
CpuKernelUtils::ParallelFor(ctx, indices_one, indices_one / maxCoreNum, shardSparse);
}
std::vector<int64_t> shape_x2 = dense_tensor->GetTensorShape()->GetDimSizes();
const int64_t row_x2 = shape_x2[0];
const int64_t col_x2 = shape_x2[1];
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense(row_x2, col_x2);
std::vector<int64_t> shape_x3 = x3_dense_tensor->GetTensorShape()->GetDimSizes();
const int64_t row_x3 = shape_x3[0];
const int64_t col_x3 = shape_x3[1];
if (row_x3 != row_x1) {
KERNEL_LOG_ERROR("x1's row is no equal x3's row, cannot do add!");
return KERNEL_STATUS_PARAM_INVALID;
}
if (col_x3 != col_x2) {
KERNEL_LOG_ERROR("x2's col is no equal x3's col, cannot do add!");
return KERNEL_STATUS_PARAM_INVALID;
}
if (row_x2 <= kParallelDataNums) {
for (int64_t i = 0; i < row_x2; i++) {
for (int64_t j = 0; j < col_x2; j++) {
dense(i, j) = *(dense_data + i * col_x2 + j);
}
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardDense = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
for (int64_t j = 0; j < col_x2; j++) {
dense(i, j) = *(dense_data + i * col_x2 + j);
}
}
};
CpuKernelUtils::ParallelFor(ctx, row_x2, row_x2 / maxCoreNum, shardDense);
}
if (col_x1 != row_x2) {
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
return KERNEL_STATUS_PARAM_INVALID;
}
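  // Compute y = alpha * (x1 @ x2) + beta * x3, where x1 has been densified above.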
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> temp;
temp = sparse * dense;
if (row_x1 <= kParallelDataNums) {
for (int64_t i = 0; i < row_x1; i++) {
for (int64_t j = 0; j < col_x2; j++) {
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
}
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardMatMul = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
for (int64_t j = 0; j < col_x2; j++) {
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
}
}
};
CpuKernelUtils::ParallelFor(ctx, row_x1, row_x1 / maxCoreNum, shardMatMul);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseAddmm, SparseAddmmCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseAddmmCpuKernel : public CpuKernel {
public:
~SparseAddmmCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx);
uint32_t SparseAddmmCheck(CpuKernelContext &ctx);
template <typename T, typename T1>
uint32_t SparseAddmmCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,169 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparsefillemptyrowsgrad.h"
#include <algorithm>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *kSparseFillEmptyRowsGrad = "SparseFillEmptyRowsGrad";
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 2;
const int64_t kParallelNum{16384};
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
} // namespace
namespace aicpu {
template <typename T>
uint32_t SparseFillEmptyRowsGradCpuKernel::ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank) {
EigenTensor reverse_index_map_e(databank.reverse_index_map, databank.reverse_index_map->GetData());
EigenTensor grad_values_e(databank.grad_values, databank.grad_values->GetData());
EigenTensor y_value_e(databank.y_value, databank.y_value->GetData());
auto reverse_index_map = reverse_index_map_e.vec<int64_t>();
auto grad_values = grad_values_e.vec<T>();
auto y_value = y_value_e.vec<T>();
const int64_t N = databank.reverse_index_map->GetTensorShape()->GetDimSize(0);
const int64_t N_full = databank.grad_values->GetTensorShape()->GetDimSize(0);
std::vector<bool> visited(N_full, false);
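  // d_values[i] = grad_values[reverse_index_map[i]]; rows of grad_values that are never referenced
  // (i.e. rows filled with the default value in the forward pass) accumulate into d_default_value.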
T *y_default_value = reinterpret_cast<T *>(databank.y_default_value->GetData());
*y_default_value = static_cast<T>(0);
if (N <= kParallelNum) {
for (int64_t i = 0; i < N; ++i) {
int64_t reverse_index = reverse_index_map(i);
KERNEL_CHECK_FALSE(0 <= reverse_index && reverse_index < N_full, KERNEL_STATUS_PARAM_INVALID,
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index)
y_value(i) = grad_values(reverse_index);
visited[reverse_index] = true;
}
} else {
int64_t total = N;
uint32_t cores = CpuKernelUtils::GetCPUNum(ctx);
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, [&](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; ++i) {
int64_t reverse_index = reverse_index_map(i);
KERNEL_CHECK_FALSE_VOID(0 <= reverse_index && reverse_index < N_full,
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index);
y_value(i) = grad_values(reverse_index);
visited[reverse_index] = true;
}
});
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR, "SparseFillEmptyRowsGrad compute failed.");
}
for (int64_t j = 0; j < N_full; ++j) {
if (!visited[j]) {
(*y_default_value) += grad_values(j);
}
}
databank.y_default_value->GetTensorShape()->SetDimSizes({});
databank.y_value->GetTensorShape()->SetDimSizes({N});
return KERNEL_STATUS_OK;
}
uint32_t SparseFillEmptyRowsGradCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
databank.reverse_index_map = ctx.Input(0);
databank.grad_values = ctx.Input(1);
databank.y_value = ctx.Output(0);
databank.y_default_value = ctx.Output(1);
KERNEL_CHECK_FALSE(isVector(databank.reverse_index_map->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
"Inputs reverse_index_map should be vectors")
KERNEL_CHECK_FALSE(isVector(databank.grad_values->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
"Inputs grad_values should be vectors")
return KERNEL_STATUS_OK;
}
uint32_t SparseFillEmptyRowsGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseFillEmptyRowsGrad check input and output number failed.");
DataBank databank;
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseFillEmptyRowsGrad check params failed.");
DataType dt = static_cast<DataType>(databank.y_value->GetDataType());
uint32_t KERNEL_STATUS;
switch (dt) {
case DT_INT8:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int8_t>(ctx, databank);
break;
case DT_UINT8:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint8_t>(ctx, databank);
break;
case DT_INT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int16_t>(ctx, databank);
break;
case DT_UINT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint16_t>(ctx, databank);
break;
case DT_INT32:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int32_t>(ctx, databank);
break;
case DT_UINT32:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint32_t>(ctx, databank);
break;
case DT_INT64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int64_t>(ctx, databank);
break;
case DT_UINT64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint64_t>(ctx, databank);
break;
case DT_BOOL:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<bool>(ctx, databank);
break;
case DT_STRING:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::string>(ctx, databank);
break;
case DT_FLOAT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<Eigen::half>(ctx, databank);
break;
case DT_FLOAT:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<float>(ctx, databank);
break;
case DT_DOUBLE:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<double>(ctx, databank);
break;
case DT_COMPLEX64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<float>>(ctx, databank);
break;
case DT_COMPLEX128:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<double>>(ctx, databank);
break;
default:
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad can't support this data type [%s].", DTypeStr(dt).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad failed.");
return KERNEL_STATUS;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseFillEmptyRowsGrad, SparseFillEmptyRowsGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,45 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <set>
#include "cpu_ops_kernel.h"
#include "utils/sparse_group.h"
#include "utils/sparse_tensor.h"
// Define namespace aicpu
namespace aicpu {
struct DataBank {
DataBank() : reverse_index_map(nullptr), grad_values(nullptr), y_value(nullptr), y_default_value(nullptr) {}
Tensor *reverse_index_map;
Tensor *grad_values;
Tensor *y_value;
Tensor *y_default_value;
};
// The operator class inherits from the CpuKernel base class
class SparseFillEmptyRowsGradCpuKernel : public CpuKernel {
public:
~SparseFillEmptyRowsGradCpuKernel() = default;
SparseFillEmptyRowsGradCpuKernel() = default;
  // Declare the Compute function, which must be overridden
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
template <typename T>
uint32_t ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank);
};
} // namespace aicpu

View File

@ -0,0 +1,190 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "split.h"
#include "utils/kernel_util.h"
namespace {
const char *kSplit = "Split";
constexpr uint32_t kSplitInputNum = 2;
std::vector<std::string> attr_names;
} // namespace
namespace aicpu {
uint32_t SplitCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
// check params
  AttrValue *num_split_ptr = ctx.GetAttr("num_split");
  KERNEL_CHECK_NULLPTR(num_split_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr num_split failed.");
  num_split_ = num_split_ptr->GetInt();
  uint32_t kSplitOutputNum = static_cast<uint32_t>(num_split_ptr->GetInt());
attr_names.emplace_back("num_split");
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSplitInputNum, kSplitOutputNum, attr_names), "[%s] check params failed.",
kSplit);
KERNEL_CHECK_FALSE((num_split_ >= 1), KERNEL_STATUS_PARAM_INVALID,
"Attr num_split must >= 1, but got attr num_split[%lld]", num_split_);
Tensor *split_dim_ptr = ctx.Input(0);
auto split_dim_shape_ptr = split_dim_ptr->GetTensorShape();
KERNEL_CHECK_FALSE((split_dim_shape_ptr->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
"Input split_dim should be a scalar integer, but got rank[%lld]", split_dim_shape_ptr->GetDims());
KERNEL_CHECK_FALSE((split_dim_ptr->GetDataType() == DT_INT32), KERNEL_STATUS_PARAM_INVALID,
"Input split_dim data type must be DT_INT32, but got data type[%s]",
DTypeStr(split_dim_ptr->GetDataType()).c_str());
auto split_dim_data_ptr = split_dim_ptr->GetData();
KERNEL_CHECK_NULLPTR(split_dim_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input split_dim data failed.");
split_dim_ = *(reinterpret_cast<int32_t *>(split_dim_data_ptr));
Tensor *value_ptr = ctx.Input(1);
value_data_ptr_ = value_ptr->GetData();
auto value_shape_ptr = value_ptr->GetTensorShape();
int64_t value_dim = value_shape_ptr->GetDims();
if (split_dim_ < 0) {
split_dim_ += value_dim;
}
  KERNEL_CHECK_FALSE(value_dim > split_dim_, KERNEL_STATUS_PARAM_INVALID,
                     "Dim of input value must be greater than split_dim, value dim is [%d], split_dim is [%d].",
                     value_dim, split_dim_);
value_shape_vec_ = value_shape_ptr->GetDimSizes();
data_type_ = value_ptr->GetDataType();
value_num_ = value_ptr->NumElements();
KERNEL_CHECK_FALSE((value_shape_ptr->GetDimSize(split_dim_) % num_split_ == 0), KERNEL_STATUS_PARAM_INVALID,
"Number of ways to split should evenly divide the split "
"dimension, but got split_dim [%d] (size = [%lld]) and num_split is [%lld]",
split_dim_, value_shape_ptr->GetDimSize(split_dim_), num_split_);
output_ptr_vec_.resize(num_split_);
for (int64_t i = 0; i < num_split_; i++) {
Tensor *output_ptr = ctx.Output(i);
auto output_data_ptr = output_ptr->GetData();
output_ptr_vec_[i] = output_data_ptr;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::DoCompute(CpuKernelContext &ctx) {
T *input_data_ptr = static_cast<T *>(value_data_ptr_);
std::vector<T *> output_data_vec;
output_data_vec.resize(num_split_);
for (int64_t i = 0; i < num_split_; i++) {
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec_[i]);
}
if (num_split_ == 1) {
KERNEL_CHECK_FALSE((SplitWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "SplitWithOneOutput failed.");
return KERNEL_STATUS_OK;
}
if (split_dim_ == 0) {
KERNEL_CHECK_FALSE((SplitWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "SplitWithDimZero failed.");
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_FALSE((SplitCompute<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "Split Compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
int64_t copy_size = value_num_ * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
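  // Splitting along dim 0 keeps each output contiguous in memory, so every output
  // is produced with a single memcpy of copy_num * split_dim_output_size elements.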
int64_t copy_num = value_num_ / value_shape_vec_[0];
T *input_copy_ptr = input_data_ptr;
const int64_t split_dim_output_size = value_shape_vec_[0] / num_split_;
for (int32_t i = 0; i < num_split_; i++) {
int64_t copy_size_per = copy_num * split_dim_output_size;
int64_t copy_size = copy_size_per * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += copy_size_per;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec) {
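  // View the input as [prefix, midfix, subfix]: prefix = product of dims before split_dim_,
  // midfix = size of split_dim_, subfix = product of dims after it. Each output copies one
  // contiguous block of (midfix / num_split_) * subfix elements from every prefix slice.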
int64_t prefix = 1;
for (int32_t i = 0; i < split_dim_; ++i) {
prefix *= value_shape_vec_[i];
}
int64_t midfix = value_shape_vec_[split_dim_];
int64_t subfix = 1;
for (size_t i = split_dim_ + 1; i < value_shape_vec_.size(); i++) {
subfix *= value_shape_vec_[i];
}
const int64_t split_dim_output_size = midfix / num_split_;
int64_t offset = 0;
for (int64_t i = 0; i < num_split_; ++i) {
T *output_data_ptr = output_data_vec[i];
T *input_copy_ptr = input_data_ptr + offset;
int64_t copy_num = subfix * split_dim_output_size;
int64_t copy_size = copy_num * sizeof(T);
for (int64_t j = 0; j < prefix; j++) {
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += (subfix * midfix);
output_data_ptr += copy_num;
}
offset += copy_num;
}
return KERNEL_STATUS_OK;
}
uint32_t SplitCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
switch (data_type_) {
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_BOOL:
return DoCompute<bool>(ctx);
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_UINT32:
return DoCompute<uint32_t>(ctx);
case DT_UINT64:
return DoCompute<uint64_t>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kSplit, SplitCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,84 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPLIT_H_
#define AICPU_KERNELS_NORMALIZED_SPLIT_H_
#include <memory>
#include <vector>
#include "unsupported/Eigen/CXX11/Tensor"
#include "securec.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
namespace aicpu {
class SplitCpuKernel : public CpuKernel {
public:
SplitCpuKernel() = default;
~SplitCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief Init params
* @param ctx cpu kernel context
* @return status if success
*/
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
/**
* @brief split data when split num is 1
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);
/**
* @brief split data when split dim is 0
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);
/**
* @brief split data
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
private:
DataType data_type_;
int32_t split_dim_;
int64_t num_split_;
int64_t value_num_;
void *value_data_ptr_;
std::vector<void *> output_ptr_vec_;
std::vector<int64_t> value_shape_vec_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,142 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sqrt.h"
#include <complex>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kSqrtInputNum{1};
const std::uint32_t kSqrtOutputNum{1};
const std::uint32_t Parallel4ThreadNum{4096};
const std::uint32_t Parallel6ThreadNum{8192};
const std::uint32_t ParallelNum{16384};
const char *kSqrt{"Sqrt"};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeSqrtKernel(const CpuKernelContext &ctx) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
std::int64_t total = ctx.Input(0)->NumElements();
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
bool parallel_flag = false;
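  // Choose the degree of parallelism from the payload size: small inputs run on the current
  // thread, mid-sized inputs are capped at a few cores, large inputs use all available cores.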
if (total_size > ParallelNum * sizeof(T)) {
parallel_flag = true;
} else if (total_size > Parallel6ThreadNum * sizeof(T)) {
parallel_flag = true;
cores = 8;
} else if (total_size > Parallel4ThreadNum * sizeof(T)) {
parallel_flag = true;
cores = 6;
}
if (parallel_flag) {
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::transform(input + begin, input + end, output + begin, Eigen::numext::sqrt<T>);
});
} else if (cores != 0) {
std::transform(input, input + total, output, Eigen::numext::sqrt<T>);
} else {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeSqrt(const CpuKernelContext &ctx) {
uint32_t result = ComputeSqrtKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("Sqrt compute failed.");
}
return result;
}
inline std::uint32_t SqrtExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != output_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim of the input size [%llu] need be the same as the output "
"size [%llu].",
input_dims.size(), output_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != output_dims[index]) {
KERNEL_LOG_ERROR(
"The data dim[%llu]=%lld of the input need be the same as the output "
"dim[%llu]=%lld.",
index, input_dims[index], index, output_dims[index]);
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t SqrtCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SqrtExtraCheck(ctx);
}
std::uint32_t SqrtCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeSqrt<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeSqrt<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeSqrt<std::double_t>(ctx);
case DT_COMPLEX64:
return ComputeSqrt<std::complex<std::float_t> >(ctx);
case DT_COMPLEX128:
return ComputeSqrt<std::complex<std::double_t> >(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t SqrtCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::SqrtCheck(ctx, kSqrtInputNum, kSqrtOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SqrtCompute(ctx);
}
REGISTER_CPU_KERNEL(kSqrt, SqrtCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SQRT_H_
#define AICPU_KERNELS_NORMALIZED_SQRT_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SqrtCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,248 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sqrtgrad.h"
#include <complex>
#include <cstdint>
#include <typeinfo>
#include "Eigen/Dense"
#include <iostream>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kSqrtGrad = "SqrtGrad";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define SQRTGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SqrtGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
return result; \
} \
break; \
}
#define SQRTGRAD_COMPUTE_COMPLEX_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SqrtGradComputeComplex<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSqrtGrad);
KERNEL_HANDLE_ERROR(SqrtGradParamCheck(ctx), "[%s] check params failed.", kSqrtGrad);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX128, std::complex<double>, ctx)
SQRTGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SQRTGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
SQRTGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SqrtGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SqrtGradCpuKernel::SqrtGradParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"SqrtGradCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/**
 special compute is used when input1 and input2 have the same number of
 elements, i.e. no broadcasting is required (see NoBcastCompute).
 */
template <typename T>
void SqrtGradCpuKernel::SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output) {
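  // SqrtGrad: input1 = y = sqrt(x), input2 = dy; the gradient is dx = dy * 0.5 / y.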
int flag = 0;
for (int64_t i = start; i < end; ++i) {
    if (*(input1 + i) == static_cast<T>(0)) {
flag = 1;
break;
}
}
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / *(input1 + i);
}
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
}
template <typename T>
void SqrtGradCpuKernel::SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output) {
int flag = 0;
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == static_cast<T>(0)) {
flag = 1;
break;
}
}
for (int64_t i = start; i < end; ++i) {
T in1 = *(input1 + i);
T in1_conj = std::conj(in1);
if (in1_conj == static_cast<T>(0)) {
*(output + i) = INFINITY;
} else {
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / in1_conj;
}
}
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
}
template <typename T>
uint32_t SqrtGradCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t data_num = in0_elements_nums;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialCompute<T>(start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
"SqrtGrad Compute failed.");
} else {
SpecialCompute<T>(0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::NoBcastComputeComplex(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t data_num = in0_elements_nums;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialComputeComplex<T>(start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
"SqrtGrad Compute failed.");
} else {
SpecialComputeComplex<T>(0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::SqrtGradCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
if (input0_elements_nums != input1_elements_nums) {
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
static_cast<int32_t>(input1_elements_nums));
return KERNEL_STATUS_PARAM_INVALID;
} else {
return NoBcastCompute<T>(ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::SqrtGradComputeComplex(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
if (input0_elements_nums != input1_elements_nums) {
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
static_cast<int32_t>(input1_elements_nums));
return KERNEL_STATUS_PARAM_INVALID;
} else {
return NoBcastComputeComplex<T>(ctx);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSqrtGrad, SqrtGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
#define AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SqrtGradCpuKernel : public CpuKernel {
public:
SqrtGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SqrtGradParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
void SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SqrtGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SqrtGradComputeComplex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,85 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tanh.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "cmath"
#include <complex>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kTanh = "Tanh";
constexpr int64_t kParallelDataNums = 128 * 1024;
#define Tanh_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = TanhCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Tanh kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TanhCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTanh);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
Tanh_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    Tanh_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT, float, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    Tanh_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Tanh kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TanhCpuKernel::TanhCompute(CpuKernelContext &ctx) {
Eigen::internal::scalar_tanh_op<T> tanh_op;
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
size_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
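  // Small payloads are computed serially; larger ones are sharded across the available CPU cores.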
if (data_size <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
auto x_idx = input_x + i; // i-th value of input0
*(output_y + i) = tanh_op((*x_idx));
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
auto shard_Tanh = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x_idx = input_x + i; // i-th value of input0
*(output_y + i) = tanh_op((*x_idx));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_Tanh),
"Tanh Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTanh, TanhCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TANH_H_
#define AICPU_KERNELS_NORMALIZED_TANH_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TanhCpuKernel : public CpuKernel {
public:
TanhCpuKernel() = default;
~TanhCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t TanhCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,156 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <algorithm>
#include <tuple>
#include <utility>
#include "tile.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "Eigen/Core"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTile = "Tile";
#define TILE_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = TileCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Tile kernel compute failed."); \
return result; \
} \
break; \
}
#define TILE_COMPUTE_CASE_ALL(TYPE, CTX) \
TILE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t TileCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Tile check input and output number failed.");
Tensor *input_x0 = ctx.Input(0);
Tensor *input_x1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
auto size_0 = ctx.Input(0)->GetTensorShape()->GetDims();
auto size_1 = ctx.Input(1)->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((size_0 >= 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of x must be 1 or higher, but got[%zu].",
size_0);
KERNEL_CHECK_FALSE((size_1 == 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of multiples must be 1, but got[%zu].",
size_1);
KERNEL_CHECK_FALSE((size_0 == input_x1->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Multiples length must be the same as the number of dimensions in x.");
KERNEL_LOG_DEBUG(
"TileCpuKernel[%s], inputx0: size[%llu];"
"inputx1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
DataType data_type = ctx.Input(0)->GetDataType();
DataType multiples_type = ctx.Input(1)->GetDataType();
switch (multiples_type) {
case DT_INT32:
switch (data_type) {
TILE_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (data_type) {
TILE_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(multiples_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename M>
void TileCpuKernel::CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data) {
for (M i = 0; i < multiplier; ++i) {
const T *in_end = in_data + in_size;
T *new_out_data = std::copy(in_data, in_end, out_data);
in_data = out_data;
out_data = new_out_data;
}
}
template <typename T, typename M>
std::pair<int64_t, int64_t> TileCpuKernel::TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
const M *multipliers, T *out_data, int64_t dimension) {
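  // Recursively tile one dimension at a time: tile the sub-tensor for each slice of this
  // dimension, then replicate the concatenated result (multiplier - 1) more times.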
if (in_dimensions.size() == 0) {
// If input tensor is a scalar, then just copy it to output (no need to
// multiply).
*out_data = *in_data;
return std::make_pair(0, 0);
}
const int64_t dimension_size = in_dimensions[dimension];
if (dimension == static_cast<int64_t>(in_dimensions.size() - 1)) {
CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
return std::make_pair(dimension_size, dimension_size * static_cast<int64_t>(multipliers[dimension]));
}
int64_t total_stride_size = 0, total_tiled_stride_size = 0;
const T *copy_from_data = in_data;
T *copy_to_data = out_data;
for (int64_t i = 0; i < dimension_size; ++i) {
int64_t stride_size = 0, tiled_stride_size = 0;
std::tie(stride_size, tiled_stride_size) =
TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
copy_from_data += stride_size;
copy_to_data += tiled_stride_size;
total_stride_size += stride_size;
total_tiled_stride_size += tiled_stride_size;
}
CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, out_data + total_tiled_stride_size);
return std::make_pair(total_stride_size, static_cast<int64_t>(total_tiled_stride_size * multipliers[dimension]));
}
template <typename T, typename M>
uint32_t TileCpuKernel::TileCompute(CpuKernelContext &ctx) {
auto x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto multiples = reinterpret_cast<M *>(ctx.Input(1)->GetData());
auto y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> in_dimensions = ctx.Input(0)->GetTensorShape()->GetDimSizes();
TileOneDimension(in_dimensions, x, multiples, y, 0);
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTile, TileCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TILE_H_
#define AICPU_KERNELS_NORMALIZED_TILE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TileCpuKernel : public CpuKernel {
public:
TileCpuKernel() = default;
~TileCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename M>
void CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data);
template <typename T, typename M>
std::pair<int64_t, int64_t> TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
const M *multipliers, T *out_data, int64_t dimension);
template <typename T, typename M>
uint32_t TileCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,220 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "transpose.h"
#include "cpu_kernel_utils.h"
#include "securec.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTranspose = "Transpose";
#define TRANSPOSE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = TransposeCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Transpose kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TransposeCpuKernel::GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value) {
auto type = tensor->GetDataType();
if (type == DT_INT32) {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (unsigned int i = 0; i < tensor->NumElements(); i++) {
value.push_back(static_cast<int64_t>(*(data + i)));
}
} else if (type == DT_INT64) {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (unsigned int i = 0; i < tensor->NumElements(); i++) {
value.push_back(*(data + i));
}
} else {
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TransposeCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTranspose);
KERNEL_HANDLE_ERROR(TransposeParamCheck(ctx), "[%s] check params failed.", kTranspose);
auto x_type = ctx.Input(0)->GetDataType();
switch (x_type) {
TRANSPOSE_COMPUTE_CASE(DT_BOOL, bool, ctx)
TRANSPOSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
TRANSPOSE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Transpose kernel data type [%s] not support.", DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TransposeCpuKernel::TransposeParamCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_perm = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto perm_tensor = ctx.Input(1);
auto y_tensor = ctx.Output(0);
  KERNEL_CHECK_FALSE((shape_perm.size() == 1), KERNEL_STATUS_PARAM_INVALID,
                     "Expected perm to be a 1-D tensor, but got a [%zu]-D tensor.", shape_perm.size())
KERNEL_CHECK_FALSE((perm_tensor->NumElements() == (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
"Expected the size of perm to be [%zu], but got [%zu].", shape_x.size(),
perm_tensor->NumElements())
KERNEL_CHECK_FALSE((GetTransposeValue(perm_tensor, perm) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"perm must be either int32 or int64, but got [%s].", DTypeStr(perm_tensor->GetDataType()).c_str())
KERNEL_CHECK_FALSE((shape_x.size() > 1), KERNEL_STATUS_PARAM_INVALID,
"Expected the dimension of x to be greater than 1-D, but got [%zu].", shape_x.size())
std::vector<int64_t> shape_y;
for (size_t i = 0; i < shape_x.size(); ++i) {
int64_t perm_value = perm.at(i);
if (shape_x.at(i) == 0) {
KERNEL_CHECK_FALSE((perm_value == 0), KERNEL_STATUS_PARAM_INVALID,
"Expected perm[%zu] == 0 (got %zu), when x shape[%zu] == 0.", i, perm_value, i)
} else {
KERNEL_CHECK_FALSE((0 <= perm_value && perm_value <= (unsigned int)shape_x.size() - 1),
KERNEL_STATUS_PARAM_INVALID, "Expected perm[%zu] in [0, %zu], but got %zu.", i, shape_x.size(),
perm_value)
}
int64_t temp_value = 0;
for (size_t j = 0; j < shape_x.size(); ++j) {
if ((unsigned int)perm.at(j) == i) {
break;
} else {
temp_value = j + 1;
KERNEL_CHECK_FALSE((temp_value < (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
"Expected perm value is unique.")
}
}
shape_y.push_back(shape_x.at(perm_value));
}
y_tensor->GetTensorShape()->SetDimSizes(shape_y);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TransposeCpuKernel::TransposeCompute(CpuKernelContext &ctx) {
auto x_data = ctx.Input(0)->GetData();
auto y_data = ctx.Output(0)->GetData();
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_y = ctx.Output(0)->GetTensorShape()->GetDimSizes();
auto input_data = reinterpret_cast<T *>(x_data);
auto output_data = reinterpret_cast<T *>(y_data);
int64_t input_dims = shape_x.size();
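  // Eigen's shuffle() needs the tensor rank as a compile-time constant, so dispatch on the
  // runtime rank and handle 2-D through 7-D inputs explicitly.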
switch (input_dims) {
case 2: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_2D;
Eigen_Tensor_2D input_2D(input_data, shape_x.at(0), shape_x.at(1));
Eigen_Tensor_2D output_2D(output_data, shape_y.at(0), shape_y.at(1));
Eigen::array<Eigen::DenseIndex, 2> perm_2D;
for (size_t i = 0; i < 2; ++i) {
perm_2D[i] = perm.at(i);
}
output_2D = input_2D.shuffle(perm_2D);
break;
}
case 3: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 3, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_3D;
Eigen_Tensor_3D input_3D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2));
Eigen_Tensor_3D output_3D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2));
Eigen::array<Eigen::DenseIndex, 3> perm_3D;
for (size_t i = 0; i < 3; ++i) {
perm_3D[i] = perm.at(i);
}
output_3D = input_3D.shuffle(perm_3D);
break;
}
case 4: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_4D;
Eigen_Tensor_4D input_4D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3));
Eigen_Tensor_4D output_4D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3));
Eigen::array<Eigen::DenseIndex, 4> perm_4D;
for (size_t i = 0; i < 4; ++i) {
perm_4D[i] = perm.at(i);
}
output_4D = input_4D.shuffle(perm_4D);
break;
}
case 5: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_5D;
Eigen_Tensor_5D input_5D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4));
Eigen_Tensor_5D output_5D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4));
Eigen::array<Eigen::DenseIndex, 5> perm_5D;
for (size_t i = 0; i < 5; ++i) {
perm_5D[i] = perm.at(i);
}
output_5D = input_5D.shuffle(perm_5D);
break;
}
case 6: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 6, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_6D;
Eigen_Tensor_6D input_6D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
shape_x.at(5));
Eigen_Tensor_6D output_6D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
shape_y.at(5));
Eigen::array<Eigen::DenseIndex, 6> perm_6D;
for (size_t i = 0; i < 6; ++i) {
perm_6D[i] = perm.at(i);
}
output_6D = input_6D.shuffle(perm_6D);
break;
}
case 7: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 7, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_7D;
Eigen_Tensor_7D input_7D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
shape_x.at(5), shape_x.at(6));
Eigen_Tensor_7D output_7D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
shape_y.at(5), shape_y.at(6));
Eigen::array<Eigen::DenseIndex, 7> perm_7D;
for (size_t i = 0; i < 7; ++i) {
perm_7D[i] = perm.at(i);
}
output_7D = input_7D.shuffle(perm_7D);
break;
}
default:
KERNEL_LOG_ERROR("[%s] : Unhandled input dimensions [%zu].", kTranspose, input_dims);
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTranspose, TransposeCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
#define AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class TransposeCpuKernel : public CpuKernel {
public:
~TransposeCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
std::vector<int64_t> perm;
uint32_t TransposeParamCheck(CpuKernelContext &ctx);
uint32_t GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value);
template <typename T>
uint32_t TransposeCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_

View File

@ -0,0 +1,127 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tridiagonal_matmul.h"
#include <complex>
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kInputNum = 4;
constexpr uint32_t kOutputNum = 1;
const char *kTridiagonalMatMul = "TridiagonalMatMul";
} // namespace
namespace aicpu {
uint32_t TridiagonalMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "TridiagonalMatMul check input and output num failed.");
KERNEL_HANDLE_ERROR(TridiagonalMatMulDataAndTypeCheck(ctx),
"TridiagonalMatMul check input and output params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
return TridiagonalMatMulCompute<Eigen::half>(ctx);
case DT_FLOAT:
return TridiagonalMatMulCompute<float>(ctx);
case DT_DOUBLE:
return TridiagonalMatMulCompute<double>(ctx);
case DT_COMPLEX64:
return TridiagonalMatMulCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return TridiagonalMatMulCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx) {
DataType superdiag_type = ctx.Input(0)->GetDataType();
DataType maindiag_type = ctx.Input(1)->GetDataType();
DataType subdiag_type = ctx.Input(2)->GetDataType();
DataType rhs_type = ctx.Input(3)->GetDataType();
KERNEL_CHECK_FALSE((superdiag_type == maindiag_type && maindiag_type == subdiag_type && subdiag_type == rhs_type),
KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s], input1 [%s],input2 [%s] and input3 [%s] "
"need be same.",
DTypeStr(superdiag_type).c_str(), DTypeStr(maindiag_type).c_str(), DTypeStr(subdiag_type).c_str(),
DTypeStr(rhs_type).c_str())
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulCompute(CpuKernelContext &ctx) {
auto superdiag_tensor = ctx.Input(0);
auto superdiag_tensor_shape = superdiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(superdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[superdiag]")
auto maindiag_tensor = ctx.Input(1);
auto maindiag_tensor_shape = maindiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(maindiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[maindiag]")
auto subdiag_tensor = ctx.Input(2);
auto subdiag_tensor_shape = subdiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(subdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[subdiag]")
auto rhs_tensor = ctx.Input(3);
auto rhs_tensor_shape = rhs_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(rhs_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, "invalid Input[rhs]")
auto superdiag_shape = superdiag_tensor_shape->GetDimSizes();
auto maindiag_shape = maindiag_tensor_shape->GetDimSizes();
auto subdiag_shape = subdiag_tensor_shape->GetDimSizes();
auto rhs_shape = rhs_tensor_shape->GetDimSizes();
int32_t superdiag_dims = superdiag_tensor_shape->GetDims();
int32_t maindiag_dims = maindiag_tensor_shape->GetDims();
int32_t subdiag_dims = subdiag_tensor_shape->GetDims();
int32_t rhs_dims = rhs_tensor_shape->GetDims();
int64_t length = rhs_shape[rhs_dims - 2];
KERNEL_CHECK_FALSE((superdiag_shape[superdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input superdiag length")
KERNEL_CHECK_FALSE((maindiag_shape[maindiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input maindiag length")
KERNEL_CHECK_FALSE((subdiag_shape[subdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input subdiag length")
using VectorMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
VectorMap superdiag(reinterpret_cast<T *>(superdiag_tensor->GetData()), superdiag_shape[superdiag_dims - 1], 1);
VectorMap maindiag(reinterpret_cast<T *>(maindiag_tensor->GetData()), maindiag_shape[maindiag_dims - 1], 1);
VectorMap subdiag(reinterpret_cast<T *>(subdiag_tensor->GetData()), subdiag_shape[subdiag_dims - 1], 1);
MatrixMap rhs(reinterpret_cast<T *>(rhs_tensor->GetData()), rhs_shape[rhs_dims - 2], rhs_shape[rhs_dims - 1]);
auto y_tensor = ctx.Output(0);
auto y_shape = y_tensor->GetTensorShape()->GetDimSizes();
int32_t y_dims = y_tensor->GetTensorShape()->GetDims();
MatrixMap y(reinterpret_cast<T *>(y_tensor->GetData()), y_shape[y_dims - 2], y_shape[y_dims - 1]);
y.array() = rhs.array().colwise() * maindiag.array();
for (int64_t i = 0; i < length - 1; i++) {
y.array().row(i) += rhs.array().row(i + 1) * superdiag(i);
y.array().row(i + 1) += rhs.array().row(i) * subdiag(i + 1);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTridiagonalMatMul, TridiagonalMatMulCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
#define AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TridiagonalMatMulCpuKernel : public CpuKernel {
public:
TridiagonalMatMulCpuKernel() = default;
~TridiagonalMatMulCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t TridiagonalMatMulCompute(CpuKernelContext &ctx);
uint32_t TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_

View File

@ -0,0 +1,93 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tril_indices.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kTrilIndices = "TrilIndices";
#define TRIL_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("TrilIndices kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TrilIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
TRIL_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRIL_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("TrilIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
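// tril_size counts the lower-triangle entries as a trapezoid (the partially filled rows)
// plus col entries for every row below it that is completely filled.
// Example (row = 3, col = 3, offset = 0): tril_size = 6 and the flat output is
// [0, 1, 1, 2, 2, 2, 0, 0, 1, 0, 1, 2] -- row indices first, then column indices.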
template <typename T>
uint32_t TrilIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
AttrValue *row_ptr = ctx.GetAttr("row");
AttrValue *col_ptr = ctx.GetAttr("col");
  AttrValue *offset_ptr = ctx.GetAttr("offset");
  KERNEL_CHECK_NULLPTR(row_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr [row] failed.")
  KERNEL_CHECK_NULLPTR(col_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr [col] failed.")
  int64_t row = row_ptr->GetInt();
  int64_t col = col_ptr->GetInt();
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
auto m_first_row = offset > 0 ? std::min<int64_t>(col, 1 + offset) : row + offset > 0;
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
auto diff_row = n_row_all - n_row_trapezoid;
if (diff_row > 0) {
tril_size += diff_row * col;
}
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
int64_t i = 0;
int64_t r = std::max<int64_t>(0, -offset), c = 0;
while (i < tril_size) {
output[i] = r;
output[tril_size + i++] = c;
c += 1;
if (c > r + offset || c >= col) {
r += 1;
c = 0;
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTrilIndices, TrilIndicesCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
#define AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class TrilIndicesCpuKernel : public CpuKernel {
public:
TrilIndicesCpuKernel() = default;
~TrilIndicesCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
int32_t offset = 0;
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_

View File

@ -0,0 +1,874 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "triplet_margin_loss.h"
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/broadcast_iterator.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const int64_t kNoBroadcastValue = 1;
const char *kTripletMarginLoss = "TripletMarginLoss";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 28 * 1024;
const int64_t kParallelDataNumMid = 56 * 1024;
} // namespace
namespace aicpu {
uint32_t TripletMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
" TripletMarginLoss check input and output number failed.");
auto data_type_x = static_cast<DataType>(ctx.Input(0)->GetDataType());
auto data_type_positive = static_cast<DataType>(ctx.Input(1)->GetDataType());
auto data_type_negative = static_cast<DataType>(ctx.Input(2)->GetDataType());
if (data_type_x != data_type_negative || data_type_positive != data_type_negative ||
data_type_x != data_type_positive) {
    KERNEL_LOG_ERROR(
      "[%s] Data types of the inputs are required to be the same, but got "
      "[%s], [%s] and [%s].",
      ctx.GetOpType().c_str(), DTypeStr(data_type_x).c_str(), DTypeStr(data_type_positive).c_str(),
      DTypeStr(data_type_negative).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
AttrValue *Attr_p = ctx.GetAttr("p");
int p_value = (Attr_p == nullptr) ? 2 : Attr_p->GetInt();
float margin_value = *(reinterpret_cast<float *>(ctx.Input(3)->GetData()));
AttrValue *Attr_eps = ctx.GetAttr("eps");
float eps_value = (Attr_eps == nullptr) ? 1e-6 : Attr_eps->GetFloat();
AttrValue *Attr_swap = ctx.GetAttr("swap");
bool swap_value = (Attr_swap == nullptr) ? false : Attr_swap->GetBool();
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction_value = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
int64_t num_elements = 1;
for (size_t i = 0; i < broadcast_shape.size(); i++) {
num_elements *= broadcast_shape[i];
}
int64_t data_num_output_reduction_none = (num_elements) / (broadcast_shape[1]);
int64_t data_num_each_batch_input = (num_elements) / (broadcast_shape[0]);
int64_t data_num_each_batch_output_reduction_none = data_num_output_reduction_none / (broadcast_shape[0]);
int64_t batch_size = broadcast_shape[0];
int64_t once_compute_size = broadcast_shape[1];
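  // When the input shapes differ, each shape is reversed, padded with 1s up to the common rank
  // and reversed back, so the distance corrections below can tell along which dimensions an
  // input was broadcast.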
bool broadcast = false;
std::vector<int64_t> x_reshape_vector = shape_x;
std::vector<int64_t> positive_reshape_vector = shape_positive;
std::vector<int64_t> negative_reshape_vector = shape_negative;
if (shape_x != shape_positive || shape_x != shape_negative || shape_positive != shape_negative) {
broadcast = true;
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
int64_t dim_num_x = input_x->GetTensorShape()->GetDims();
int64_t dim_num_positive = input_positive->GetTensorShape()->GetDims();
int64_t dim_num_negative = input_negative->GetTensorShape()->GetDims();
auto dims = std::max(dim_num_x, std::max(dim_num_positive, dim_num_negative));
if (dim_num_x < dims) x_reshape_vector.resize(dims, kNoBroadcastValue);
if (dim_num_positive < dims) positive_reshape_vector.resize(dims, kNoBroadcastValue);
if (dim_num_negative < dims) negative_reshape_vector.resize(dims, kNoBroadcastValue);
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
}
switch (data_type_x) {
case DT_FLOAT16:
return TripletMarginLossComputeRealTypeFloat16<Eigen::half>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_FLOAT:
return TripletMarginLossComputeRealType<float>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_DOUBLE:
return TripletMarginLossComputeRealType<double>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT8:
return TripletMarginLossComputeRealType<int8_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT16:
return TripletMarginLossComputeRealType<int16_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT32:
return TripletMarginLossComputeRealType<int32_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT64:
return TripletMarginLossComputeRealType<int64_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT8:
return TripletMarginLossComputeRealType<uint8_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT16:
return TripletMarginLossComputeRealType<uint16_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT32:
return TripletMarginLossComputeRealType<uint32_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT64:
return TripletMarginLossComputeRealType<uint64_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_COMPLEX128:
return TripletMarginLossComputeComplexType<std::complex<double>>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_COMPLEX64:
return TripletMarginLossComputeComplexType<std::complex<float>>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type_x).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<float *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
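  // Per sample: d(a, p) and d(a, n) are p-norm distances of the eps-shifted differences along
  // dim 1; with swap enabled the smaller of d(a, n) and d(p, n) is used, and the unreduced
  // loss is max(d(a, p) + margin - d(a, n), 0).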
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = (end - start);
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < once_compute_thread_size / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value +
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance =
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
negative_distance =
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance =
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
start += data_num_each_batch_input;
}
};
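  // Inputs above kParallelDataNum bytes are sharded across the CPU cores; the shard length is
  // a multiple of data_num_each_batch_input so each shard processes whole batches.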
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value +
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance =
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
negative_distance =
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance =
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = *(output_reduction_none_data + i);
}
}
if (reduction_value == "mean") {
*(output_data) = (output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = (output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<float *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
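  // For complex inputs the elementwise modulus sqrt(z * conj(z)) is taken before the p-norm
  // reduction, so the resulting distances are real-valued.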
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = end - start;
float positive_distance;
float negative_distance;
float swap_distance;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
static_cast<T>(eps_value) +
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
auto calculate_positive_distance_float =
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
auto calculate_negative_distance_float =
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
positive_distance =
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
negative_distance =
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
auto calculate_swap_distance_float =
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
start += data_num_each_batch_input;
}
};
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
static_cast<T>(eps_value) +
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
float positive_distance;
float negative_distance;
float swap_distance;
auto calculate_positive_distance_float =
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
auto calculate_negative_distance_float =
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
positive_distance =
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
negative_distance =
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
auto calculate_swap_distance_float =
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? positive_distance + margin_value - negative_distance
: 0;
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = *(output_reduction_none_data + i);
}
}
if (reduction_value == "mean") {
*(output_data) = (output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = (output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<T *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
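  // Half-precision inputs are widened to float for the distance accumulation and cast back to
  // Eigen::half only when the reduced output is written, presumably to limit rounding error.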
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = end - start;
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
*(calculate_negative_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
negative_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
        if (broadcast == true) {
          if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
            positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
          }
          if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
            negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
          }
        }
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
          if (broadcast == true) {
            if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
              swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
            }
          }
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
? ((positive_distance + margin_value - negative_distance))
: static_cast<float>(0);
}
start += data_num_each_batch_input;
}
};
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
float positive_distance;
float negative_distance;
float swap_distance;
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
*(calculate_negative_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
float temp1;
float temp2;
float temp3;
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
negative_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
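// Hinge: the per-sample loss is max(d(anchor, positive) + margin - d(anchor, negative), 0).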
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
? ((positive_distance + margin_value - negative_distance))
: static_cast<float>(0);
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = static_cast<T>(*(output_reduction_none_data + i));
}
}
if (reduction_value == "mean") {
*(output_data) = static_cast<T>(output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = static_cast<T>(output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTripletMarginLoss, TripletMarginLossCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TripletMarginLossCpuKernel : public CpuKernel {
public:
TripletMarginLossCpuKernel() = default;
~TripletMarginLossCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t TripletMarginLossComputeRealType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
template <typename T>
static uint32_t TripletMarginLossComputeRealTypeFloat16(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
template <typename T>
static uint32_t TripletMarginLossComputeComplexType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_

View File

@ -0,0 +1,95 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "triu_indices.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kTriuIndices = "TriuIndices";
#define TRIU_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("TriuIndices kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TriuIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
TRIU_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRIU_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("TriuIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TriuIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
AttrValue *row_ptr = ctx.GetAttr("row");
AttrValue *col_ptr = ctx.GetAttr("col");
AttrValue *offset_ptr = ctx.GetAttr("offset");
int64_t row = row_ptr->GetInt();
int64_t col = col_ptr->GetInt();
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
auto offset1 = offset - 1;
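// Count the lower-triangle elements (diagonal offset - 1) as a trapezoid of row lengths from
// m_first_row to m_last_row, plus any full rows below it; the upper triangle is the remainder.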
auto m_first_row = offset1 > 0 ? std::min<int64_t>(col, 1 + offset1) : row + offset1 > 0;
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset1));
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset1));
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
auto diff_row = n_row_all - n_row_trapezoid;
if (diff_row > 0) {
tril_size += diff_row * col;
}
auto triu_size = row * col - tril_size;
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
int64_t i = 0;
int64_t c = std::max<int64_t>(0, offset), r = 0;
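// Walk the upper triangle in row-major order: the first triu_size outputs hold row indices,
// the next triu_size outputs hold the matching column indices.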
while (i < triu_size) {
output[i] = r;
output[triu_size + i++] = c;
c += 1;
if (c >= col) {
r += 1;
c = std::max<int64_t>(0, r + offset);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTriuIndices, TriuIndicesCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_
#define AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class TriuIndicesCpuKernel : public CpuKernel {
public:
TriuIndicesCpuKernel() = default;
~TriuIndicesCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
int32_t offset = 0;
int32_t offset1 = 0;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,209 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unpack.h"
#include "utils/kernel_util.h"
namespace {
const char *kUnpack = "Unpack";
}
namespace aicpu {
uint32_t UnpackCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
Tensor *value_ptr = ctx.Input(0);
KERNEL_CHECK_NULLPTR(value_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value failed.");
value_data_ptr = value_ptr->GetData();
KERNEL_CHECK_NULLPTR(value_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value data failed.");
auto value_shape_ptr = value_ptr->GetTensorShape();
KERNEL_CHECK_NULLPTR(value_shape_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value shape failed.");
int64_t value_dim = value_shape_ptr->GetDims();
AttrValue *unpack_axis_ptr = ctx.GetAttr("axis");
int64_t real_unpack_axis = 0;
KERNEL_CHECK_FALSE(unpack_axis_ptr, KERNEL_STATUS_PARAM_INVALID, "get axis failed!");
unpack_axis = unpack_axis_ptr->GetInt();
real_unpack_axis = unpack_axis >= 0 ? unpack_axis : unpack_axis + value_dim;
KERNEL_CHECK_FALSE(value_dim > real_unpack_axis, KERNEL_STATUS_PARAM_INVALID,
"The axis value range should be [-value_dim, value_dim), "
"value dim is [%d], axis is [%d].",
value_dim, unpack_axis);
unpack_axis = real_unpack_axis;
AttrValue *unpack_num_ptr = ctx.GetAttr("num");
KERNEL_CHECK_FALSE(unpack_num_ptr, KERNEL_STATUS_PARAM_INVALID, "get num failed!");
int64_t axis_size = value_shape_ptr->GetDimSize(unpack_axis);
unpack_num = unpack_num_ptr->GetInt();
KERNEL_CHECK_FALSE(unpack_num == axis_size, KERNEL_STATUS_PARAM_INVALID,
"The num you want to unpack to should be equal to the "
"size of the specified dimension. "
"The num you want to unpack to is [%d], while the [%d] "
"dim's size is [%d].",
unpack_num, unpack_axis, axis_size);
value_shape_vec = value_shape_ptr->GetDimSizes();
data_type = value_ptr->GetDataType();
value_num = value_ptr->NumElements();
output_ptr_vec.resize(unpack_num);
for (int64_t i = 0; i < unpack_num; i++) {
Tensor *output_ptr = ctx.Output(i);
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output [%d] failed.", i);
auto output_data_ptr = output_ptr->GetData();
KERNEL_CHECK_NULLPTR(output_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output data [%d] failed.", i);
output_ptr_vec[i] = output_data_ptr;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
int64_t copy_size = value_num * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
KERNEL_CHECK_FALSE(value_shape_vec[0] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
int64_t copy_num = value_num / value_shape_vec[0];
T *input_copy_ptr = input_data_ptr;
for (int64_t i = 0; i < unpack_num; i++) {
int64_t copy_size_per = copy_num;
int64_t copy_size = copy_size_per * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += copy_size_per;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx) {
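// View the input as a 3-D tensor [prefix, midfix, subfix], where midfix is the size of the
// unpack axis; output i receives every slice taken at index i along that axis.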
int64_t prefix = 1;
for (uint64_t i = 0; i < unpack_axis; i++) {
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
prefix *= value_shape_vec[i];
}
KERNEL_CHECK_FALSE(unpack_axis < value_shape_vec.size(), KERNEL_STATUS_PARAM_INVALID,
"input attr axis is invalid.");
int64_t midfix = value_shape_vec[unpack_axis];
int64_t subfix = 1;
for (size_t i = unpack_axis + 1; i < value_shape_vec.size(); i++) {
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
subfix *= value_shape_vec[i];
}
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > unpack_num) {
max_core_num = unpack_num;
}
auto shard_unpack = [&](size_t start, size_t end) {
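// Each output gathers prefix contiguous blocks of subfix elements, stepping over the other
// midfix - 1 slices (stride subfix * midfix) in the input between copies.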
int64_t offset = 0;
for (uint64_t i = start; i < end; i++) {
offset = i * subfix;
T *output_data_ptr = output_data_vec[i];
T *input_copy_ptr = input_data_ptr + offset;
int64_t copy_size = subfix * sizeof(T);
for (int64_t j = 0; j < prefix; j++) {
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += (subfix * midfix);
output_data_ptr += subfix;
}
}
return KERNEL_STATUS_OK;
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, unpack_num, unpack_num / max_core_num, shard_unpack),
"Unpack Compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::DoCompute(CpuKernelContext &ctx) {
T *input_data_ptr = reinterpret_cast<T *>(value_data_ptr);
std::vector<T *> output_data_vec;
output_data_vec.resize(unpack_num);
for (int64_t i = 0; i < unpack_num; i++) {
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec[i]);
}
if (unpack_num == 1) {
KERNEL_CHECK_FALSE((UnpackWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "UnpackWithOneOutput failed.");
return KERNEL_STATUS_OK;
}
if (unpack_axis == 0) {
KERNEL_CHECK_FALSE((UnpackWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "UnpackWithDimZero failed.");
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_FALSE((UnpackCompute<T>(input_data_ptr, output_data_vec, ctx) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "Unpack Compute failed.");
return KERNEL_STATUS_OK;
}
uint32_t UnpackCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
switch (data_type) {
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_BOOL:
return DoCompute<bool>(ctx);
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_UINT32:
return DoCompute<uint32_t>(ctx);
case DT_UINT64:
return DoCompute<uint64_t>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported data type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kUnpack, UnpackCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,65 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNPACK_H_
#define AICPU_KERNELS_NORMALIZED_UNPACK_H_
#include <memory>
#include <vector>
#include "cpu_types.h"
#include "utils/bcast.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "securec.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
namespace aicpu {
class UnpackCpuKernel : public CpuKernel {
public:
UnpackCpuKernel() : data_type(DT_DOUBLE), unpack_axis(0), unpack_num(0), value_num(0), value_data_ptr(nullptr) {
output_ptr_vec.clear();
value_shape_vec.clear();
}
~UnpackCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
template <typename T>
uint32_t UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
private:
DataType data_type;
uint64_t unpack_axis;
int64_t unpack_num;
int64_t value_num;
void *value_data_ptr;
std::vector<void *> output_ptr_vec;
std::vector<int64_t> value_shape_vec;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,120 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unravel_index.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const char *KUnravelIndex = "UnravelIndex";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 1000;
} // namespace
namespace aicpu {
uint32_t UnravelIndexCpuKernel::Compute(CpuKernelContext &ctx) {
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_INT32: {
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int32_t>(ctx), " data or type check failed.");
UnravelCompute<int32_t>(ctx);
break;
}
case DT_INT64: {
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int64_t>(ctx), " data or type check failed.");
UnravelCompute<int64_t>(ctx);
break;
}
default: {
KERNEL_LOG_ERROR("UnravelIndex kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnravelIndexCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Unravel_Index check input and output number failed.");
Tensor *indices = ctx.Input(0);
Tensor *dims = ctx.Input(1);
auto dims_number = ctx.Input(1)->NumElements();
auto indices_number = ctx.Input(0)->NumElements();
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto indices_type = indices->GetDataType();
auto dims_type = dims->GetDataType();
T dims_multi = 1;
KERNEL_CHECK_FALSE((indices_type == dims_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(indices_type).c_str(), DTypeStr(dims_type).c_str())
for (auto i = 0; i < dims_number; i++) {
KERNEL_CHECK_FALSE((*(dims_data + i) > 0), KERNEL_STATUS_PARAM_INVALID, "Dimension number must be more than 0.")
dims_multi = dims_multi * (*(dims_data + i));
}
for (auto i = 0; i < indices_number; i++) {
KERNEL_CHECK_FALSE((*(indices_data + i) >= 0), KERNEL_STATUS_PARAM_INVALID, "Indices must be non-negative.")
KERNEL_CHECK_FALSE((*(indices_data + i) < dims_multi), KERNEL_STATUS_PARAM_INVALID,
"Index is out of bounds with respect to the given dims.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnravelIndexCpuKernel ::UnravelCompute(CpuKernelContext &ctx) {
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto dims_number = ctx.Input(1)->NumElements();
auto indices_number = ctx.Input(0)->NumElements();
auto data_num = indices_number;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_unravel_index = [&](size_t start, size_t end) {
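// Convert each flat index to coordinates by peeling dimensions from the innermost one outward:
// the remainder is the coordinate, the quotient carries on to the next dimension.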
for (auto j = start; j < end; j++) {
T Quotient = *(indices_data + j);
for (auto i = (dims_number - 1); i >= 0; i--) {
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
Quotient = Quotient / *(dims_data + i);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_unravel_index),
"Unravel Index Compute failed.");
} else {
for (auto j = 0; j < indices_number; j++) {
T Quotient = *(indices_data + j);
for (auto i = (dims_number - 1); i >= 0; i--) {
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
Quotient = Quotient / *(dims_data + i);
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(KUnravelIndex, UnravelIndexCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
#define AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
#include "cpu_ops_kernel.h"
namespace aicpu {
class UnravelIndexCpuKernel : public CpuKernel {
public:
~UnravelIndexCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t UnravelCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,167 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unsorted_segment_sum.h"
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kUnsortedSegmentSum = "UnsortedSegmentSum";
const uint32_t input_num = 3;
const uint32_t output_num = 1;
constexpr int64_t kParallelDataNums = 64 * 1024;
} // namespace
namespace aicpu {
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentSumCpuKernel::UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num), " node input size should be [%llu], get [%llu]",
input_num, ctx.GetInputsSize(), " node output size should be [%llu], get [%llu]", output_num,
ctx.GetOutputsSize());
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu]",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t data_size = ctx.Input(0)->NumElements();
int64_t id_size = ctx.Input(1)->NumElements();
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")
if (id_size <= 0) {
KERNEL_LOG_ERROR("segment_ids num elements should great than 0");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t reshapesize = data_size / id_size;
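// reshapesize is the number of elements in one inner slice; every segment id scatter-adds its
// slice of the input into the output row selected by that id.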
// Initialized to 0
memset(output_y, 0, ctx.Output(0)->GetDataSize());
if (data_size <= kParallelDataNums) {
// calculation process
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = 0; j < reshapesize; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
}
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > reshapesize) {
max_core_num = reshapesize;
}
// calculation process
auto shard_unsorted_segment_sum = [&](int64_t start, int64_t end) {
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = start; j < end; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
}
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_sum),
"CpuKernelUtils::ParallelFor failed.");
}
return KERNEL_STATUS_OK;
}
template <typename input_t, typename segment_ids_t>
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
switch (num_segments_type) {
case DT_INT32:
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
case DT_INT64:
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename input_t>
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
auto num_segments_type = ctx.Input(2)->GetDataType();
switch (segment_ids_type) {
case DT_INT32:
return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
case DT_INT64:
return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t UnsortedSegmentSumCpuKernel::Compute(CpuKernelContext &ctx) {
auto input_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (input_type) {
case DT_INT32:
return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
case DT_INT16:
return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
case DT_FLOAT:
return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
case DT_DOUBLE:
return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
case DT_FLOAT16:
return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
case DT_INT8:
return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
case DT_INT64:
return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
case DT_UINT8:
return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
case DT_UINT16:
return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
case DT_UINT32:
return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
case DT_UINT64:
return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
case DT_COMPLEX64:
return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
case DT_COMPLEX128:
return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid input type [%s]", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kUnsortedSegmentSum, UnsortedSegmentSumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
#include "cpu_ops_kernel.h"
namespace aicpu {
class UnsortedSegmentSumCpuKernel : public CpuKernel {
public:
~UnsortedSegmentSumCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx);
template <typename input_t, typename segment_ids_t>
uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
template <typename input_t>
uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,153 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "upper_bound.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kUpperBound = "UpperBound";
#define UPPERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = UpperBoundCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("UpperBound kernel compute failed."); \
return result; \
} \
break; \
}
#define UPPERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t UpperBoundCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "UpperBound check input and output number failed.");
Tensor *sorted_x_data = ctx.Input(0);
Tensor *values_data = ctx.Input(1);
Tensor *output_data = ctx.Output(0);
auto output_type = output_data->GetDataType();
auto sorted_x_type = sorted_x_data->GetDataType();
auto values_type = values_data->GetDataType();
if (sorted_x_type != values_type) {
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
DTypeStr(values_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (output_type) {
case DT_INT32:
switch (sorted_x_type) {
UPPERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (sorted_x_type) {
UPPERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t UpperBoundCpuKernel::UpperBoundCompute(CpuKernelContext &ctx) {
Tensor *sorted_x_data = ctx.Input(0);
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
Tensor *values_data = ctx.Input(1);
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
auto values_data_shape = values_data->GetTensorShape();
int64_t values_data_num = values_data_shape->NumElements();
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
int64_t values_data_column = values_data_shape_dims[1];
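// For every value, binary-search its row of sorted_x for the first element strictly greater
// than the value; the result is that position relative to the start of the row.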
if (values_data_num < 1024) {
for (int64_t i = 0; i < values_data_num; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > values_data_num) {
sum_core_num = values_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
"UpperBound Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kUpperBound, UpperBoundCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
#define AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class UpperBoundCpuKernel : public CpuKernel {
public:
UpperBoundCpuKernel() = default;
~UpperBoundCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t UpperBoundCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,192 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "xdivy.h"
#include <complex>
#include "cmath"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kXdivy = "Xdivy";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
constexpr double EPSLON = 1e-15;
#define XDIVY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = XdivyCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Xdivy kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t XdivyCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXdivy);
BCalcInfo calc_info;
KERNEL_HANDLE_ERROR(XdivyParamCheck(ctx), "Xdivy check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
XDIVY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
XDIVY_COMPUTE_CASE(DT_FLOAT, float, ctx)
XDIVY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
XDIVY_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
XDIVY_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Xdivy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t XdivyCpuKernel::XdivyParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"XdivyCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
auto input1 = static_cast<T *>(ctx.Input(0)->GetData());
auto input2 = static_cast<T *>(ctx.Input(1)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) / *(input2 + i) + static_cast<T>(EPSLON);
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = (*input1) / *(input2 + i) + static_cast<T>(EPSLON);
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) / (*input2) + static_cast<T>(EPSLON);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
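// Three fast paths: identical shapes do a plain element-wise divide, while a single-element
// x or y is broadcast as a scalar against the other operand.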
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xdivy Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) =
*(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xdivy Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::XdivyCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (noNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kXdivy, XdivyCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_XDIVY_H_
#define AICPU_KERNELS_NORMALIZED_XDIVY_H_
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class XdivyCpuKernel : public CpuKernel {
public:
XdivyCpuKernel() = default;
~XdivyCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t XdivyParamCheck(CpuKernelContext &ctx);
private:
template <typename T>
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t XdivyCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,216 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "xlogy.h"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kXlogy = "Xlogy";
const int64_t kParallelDataNum = 8 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define XLOGY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = XlogyCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Xlogy kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t XlogyCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXlogy);
BCalcInfo calc_info;
KERNEL_HANDLE_ERROR(XlogyParamCheck(ctx), "Xlogy check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
XLOGY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
XLOGY_COMPUTE_CASE(DT_FLOAT, float, ctx)
XLOGY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Xlogy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t XlogyCpuKernel::XlogyParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"XlogyCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
auto zero = T(0);
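// xlogy semantics: x == 0 yields 0 regardless of y, a negative y yields NaN,
// otherwise the result is x * log(y).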
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
if (*(input1 + i) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2 + i) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = *(input1 + i) * log(*(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input1) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2 + i) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = (*input1) * log(*(input2 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input1 + i) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = *(input1 + i) * log(*(input2));
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xlogy Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
auto zero = T(0);
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = *(in1 + i) >= zero
? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
: std::numeric_limits<T>::quiet_NaN();
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xlogy Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(in1 + i) >= zero ? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
: std::numeric_limits<T>::quiet_NaN();
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::XlogyCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (noNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kXlogy, XlogyCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_XLOGY_H_
#define AICPU_KERNELS_NORMALIZED_XLOGY_H_
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class XlogyCpuKernel : public CpuKernel {
public:
XlogyCpuKernel() = default;
~XlogyCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t XlogyParamCheck(CpuKernelContext &ctx);
private:
template <typename T>
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t XlogyCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,61 @@
#ifndef AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
#define AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
struct AddOp {
static std::string Name() { return "Add"; }
};
struct DivOp {
static std::string Name() { return "Div"; }
};
struct MulOp {
static std::string Name() { return "Mul"; }
};
template <typename Op>
class SparseDenseCwiseOpKernel : public CpuKernel {
public:
SparseDenseCwiseOpKernel() = default;
~SparseDenseCwiseOpKernel() override = default;
protected:
virtual uint32_t Compute(CpuKernelContext &ctx) override = 0;
static uint32_t CheckParams(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpSpecialCompute(BcastShapeType type, CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpSpecialComputeComplex(BcastShapeType type, CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeOp(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeOpComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpNoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpNoBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
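For intuition, a small NumPy model of what these sparse-dense cwise kernels compute: the element-wise op is evaluated only at the sparse tensor's stored positions, with the dense operand read at the matching coordinates (shapes and values below are illustrative, not the kernel API):

import numpy as np

# COO sparse operand: indices into a 3x4 tensor plus the stored values.
indices = np.array([[0, 1], [1, 3], [2, 0]])            # one row per non-zero
values = np.array([2.0, 5.0, -1.0], dtype=np.float32)
dense = np.arange(12, dtype=np.float32).reshape(3, 4)   # dense operand of the same shape

# "Add" variant: apply the op only where the sparse tensor has stored values.
out_values = np.array([v + dense[tuple(ix)] for ix, v in zip(indices, values)],
                      dtype=np.float32)
print(out_values)  # values of the resulting sparse tensor; indices are unchanged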

View File

@ -67,6 +67,14 @@ class SparseTensor {
 * sparse eigen tensor indices valid
 * @return uint32_t: 0->success other->failed
 */
int dims() const { return dims_; }
std::shared_ptr<EigenTensor> indices() const { return ix_; }
std::shared_ptr<EigenTensor> values() const { return vals_; }
std::vector<int64_t> shape() const { return shape_; }
  template <typename T>
  uint32_t EigenTensorIndicesValidCheck(int64_t dims_size) const {
    const auto ix_t = ix_->matrix<T>();

View File

@ -78,9 +78,75 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kACosGradOpName,
mindspore::kAcoshGradOpName,
mindspore::kAdaptiveAvgPool3DOpName,
mindspore::kAdaptiveAvgPool3DGradOpName,
mindspore::kAdaptiveMaxPool2DGradOpName,
mindspore::kAdaptiveMaxPool3DOpName,
mindspore::kAdaptiveMaxPool3DGradOpName,
mindspore::kAddNOpName,
mindspore::kAddV2OpName,
mindspore::kAdjustContrastv2OpName,
mindspore::kAdjustHueOpName,
mindspore::kAdjustSaturationOpName,
mindspore::kAffineGridGradOpName,
mindspore::kAngleOpName,
mindspore::kArgmaxOpName,
mindspore::kArgMaxWithValueOpName,
mindspore::kArgMinOpName,
mindspore::kArgMinWithValueOpName,
mindspore::KAsinGradOpName,
mindspore::KAsinhGradOpName,
mindspore::kAvgPoolOpName,
mindspore::kAvgPoolGradOpName,
mindspore::kBartlettWindowOpName,
mindspore::kBatchNormGradGradOpName,
mindspore::kBiasAddOpName,
mindspore::kBiasAddGradOpName,
mindspore::kBincountOpName,
mindspore::kBlackmanWindowOpName,
mindspore::kBroadcastOpName,
mindspore::kMedianGradOpName,
mindspore::kNMSWithMaskOpName,
mindspore::kReduceSumOpName,
mindspore::kSpaceToDepthOpName,
mindspore::kSparseAddmmOpName,
mindspore::kSparseApplyAdagradDAOpName,
mindspore::kSparseApplyCenteredRMSPropOpName,
mindspore::kSparseApplyMomentumOpName,
mindspore::kSparseApplyProximalGradientDescentOpName,
mindspore::kSparseConcatOpName,
mindspore::kSparseDenseCwiseAddOpName,
mindspore::kSparseDenseCwiseDivOpName,
mindspore::kSparseDenseCwiseMulOpName,
mindspore::kSparseMatrixMatMulOpName,
mindspore::kSparseMatrixNNZOpName,
mindspore::kSparseMatrixTransposeOpName,
mindspore::kSparseFillEmptyRowsGradOpName,
mindspore::kSparseReshapeOpName,
mindspore::kSparseSegmentSqrtNGradOpName,
mindspore::kSparseSegmentSqrtNWithNumSegmentsOpName,
mindspore::kSparseSoftmaxCrossEntropyWithLogitsOpName,
mindspore::kSparseSparseMaximumOpName,
mindspore::kSparseSparseMinimumOpName,
mindspore::kSparseSegmentSumWithNumSegmentsOpName,
mindspore::kSplitOpName,
mindspore::kSqrtOpName,
mindspore::kSqrtGradOpName,
mindspore::kTanhOpName,
mindspore::kTileOpName,
mindspore::kTridiagonalMatMulOpName,
mindspore::kTripletMarginLossOpName,
mindspore::kTransposeOpName,
mindspore::kTriuIndicesOpName,
mindspore::kTrilIndicesOpName,
mindspore::kUnpackOpName,
mindspore::kUnravelIndexOpName,
mindspore::kUnsortedSegmentSumOpName,
mindspore::kUpperBoundOpName,
mindspore::kXlogyOpName,
mindspore::kXdivyOpName,
mindspore::kFFTWithSizeOpName,
mindspore::kHistogramDOpName,
mindspore::kIm2colOpName,

View File

@ -410,7 +410,7 @@ class _MindsporeFunctionExecutor:
# Case: If the shape of input args is dynamic, get dynamic shape tensor from context and use it to compile.
compile_args = args_list
# Case: The `set_inputs()` of Cell object has been set, using these dynamic shape args as compile args.
if isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs(): if self.fn.__name__ == 'construct' and isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs():
compile_args = self.obj.get_inputs()
for args in compile_args:
    Validator.check_isinstance("args set in `set_inputs()` of Cell", args, PythonTensor)
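The added `self.fn.__name__ == 'construct'` guard ensures the shapes registered via Cell.set_inputs() are only reused as compile args when `construct` itself is being jit-compiled. A hedged sketch of the set_inputs() pattern this code path serves (shapes illustrative):

import numpy as np
import mindspore as ms
from mindspore import Tensor, nn, ops

ms.set_context(mode=ms.GRAPH_MODE)

class Net(nn.Cell):
    def construct(self, x):
        return ops.ReLU()(x)

net = Net()
# Declare a dynamic first dimension; compilation keys off this symbolic shape.
net.set_inputs(Tensor(shape=[None, 3], dtype=ms.float32))
out = net(Tensor(np.ones((5, 3), np.float32)))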

View File

@ -13,9 +13,48 @@
# limitations under the License.
"""aicpu ops"""
from .adaptive_max_pool_3d_grad import _adaptive_max_pool_3d_grad_aicpu
from .adaptive_max_pool_3d import _adaptive_max_pool_3d_aicpu
from .adaptive_max_pool_2d_grad import _adaptive_max_pool_2d_grad_aicpu
from .adaptive_avg_pool_3d_grad import _adaptiveavgpool3d_grad_aicpu
from .adaptive_avg_pool_3d import _adaptiveavgpool3d_aicpu
from .tile import _tile_aicpu
from .tanh import _tanh_aicpu
from .space_to_depth import _space_to_depth_aicpu
from .sparse_matrix_transpose import _sparse_matrix_transpose_aicpu
from .sparse_matrix_nnz import _sparse_matrix_nnz_aicpu
from .sparse_matrix_mat_mul import _sparse_matrix_mat_mul_aicpu
from .sparse_dense_cwise_mul import _sparse_dense_cwise_mul_aicpu
from .sparse_dense_cwise_div import _sparse_dense_cwise_div_aicpu
from .sparse_dense_cwise_add import _sparse_dense_cwise_add_aicpu
from .sparse_concat import _sparse_concat_aicpu
from .sparse_apply_proximal_gradient_descent import _sparse_apply_proximal_gradient_descent_aicpu
from .sparse_apply_momentum import _sparse_apply_momentum_aicpu
from .sparse_apply_centered_rms_prop import _sparse_apply_centered_rms_prop_aicpu
from .sparse_apply_adagrad_da import _sparse_apply_adagrad_da_aicpu
from .sparseaddmm import _sparse_addmm_aicpu
from .broadcast_to import _broadcast_to_aicpu
from .blackman_window import _blackman_window_aicpu
from .bincount import _bincount_aicpu
from .asinh_grad import _asinh_grad_aicpu
from .unique import _unique_aicpu
from .add_n import _add_n_aicpu
from .add_v2 import _add_v2_aicpu
from .adjust_contrastv2 import _adjust_contrastv2_aicpu
from .adjust_hue import _adjust_hue_aicpu
from .adjust_saturation import _adjust_saturation_aicpu
from .affine_grid_grad import _affine_grid_grad_aicpu
from .angle import _angle_aicpu
from .arg_max import _arg_max_aicpu
from .argmax_with_value import _argmax_with_value_aicpu
from .arg_min import _arg_min_aicpu
from .argmin_with_value import _argmin_with_value_aicpu
from .avgpool_v1 import _avgpool_v1_aicpu
from .avgpool_grad_v1 import _avgpool_grad_v1_aicpu
from .matrix_solve import _matrix_solve_aicpu
from .betainc import _betainc_aicpu
from .bartlett_window import _bartlett_window_aicpu
from .batch_norm_grad_grad import _batch_norm_grad_grad_aicpu
from .no_repeat_ngram import _no_repeat_ngram_aicpu
from .init_data_set_queue import _init_data_set_queue_aicpu
from .embedding_lookup import _embedding_lookup_aicpu
@ -43,6 +82,7 @@ from .topk import _top_k_aicpu
from .tensor_scatter_update import _tensor_scatter_update_aicpu
from .log1p import _log1p_aicpu
from .asin import _asin_aicpu
from .asin_grad import _asin_grad_aicpu
from .is_finite import _is_finite_aicpu
from .is_inf import _is_inf_aicpu
from .is_nan import _is_nan_aicpu
@ -52,14 +92,18 @@ from .cosh import _cosh_aicpu
from .sign import _sign_aicpu
from .squeeze import _squeeze_aicpu
from .acos import _acos_aicpu
from .acos_grad import _acos_grad_aicpu
from .expand import _expand_aicpu
from .expand_dims import _expand_dims_aicpu
from .randperm import _randperm_aicpu
from .random_choice_with_mask import _random_choice_with_mask_aicpu
from .rsqrt import _rsqrt_aicpu
from .sqrt import _sqrt_aicpu
from .sqrt_grad import _sqrt_grad_aicpu
from .search_sorted import _search_sorted_aicpu
from .stack import _stack_aicpu
from .unstack import _unstack_aicpu
from .unsorted_segment_sum import _unsorted_segment_sum_aicpu
from .addcmul import _addcmul_aicpu
from .uniform_candidate_sampler import _uniform_candidate_sampler_aicpu
from .log_uniform_candidate_sampler import _log_uniform_candidate_sampler_aicpu
@ -69,6 +113,7 @@ from .reverse_sequence import _reverse_sequence_aicpu
from .log_matrix_determinant import _log_matrix_determinant_aicpu
from .crop_and_resize import _crop_and_resize_aicpu
from .acosh import _acosh_aicpu
from .acosh_grad import _acosh_grad_aicpu
from .rnnt_loss import _rnnt_loss_aicpu
from .random_categorical import _random_categorical_aicpu
from .tanh_grad import _tanh_grad_aicpu
@ -86,6 +131,7 @@ from .sub import _sub_aicpu
from .not_equal import _not_equal_aicpu
from .poisson import _poisson_aicpu
from .update_cache import _update_cache_aicpu
from .upper_bound import _upper_bound_aicpu
from .cache_swap_table import _cache_swap_table_aicpu
from .uniform_int import _uniform_int_aicpu
from .uniform_real import _uniform_real_aicpu
@ -97,6 +143,23 @@ from .end_of_sequence import _end_of_sequence_aicpu
from .fused_sparse_adam import _fused_sparse_adam_aicpu
from .fused_sparse_lazy_adam import _fused_sparse_lazy_adam_aicpu
from .fused_sparse_ftrl import _fused_sparse_ftrl_aicpu
from .sparse_fill_empty_rows_grad import _sparse_fill_empty_rows_grad_aicpu
from .sparse_reshape import _sparse_reshape_aicpu
from .sparse_segment_sqrt_n_grad import _sparse_segment_sqrt_n_grad_aicpu
from .sparse_segment_mean_with_num_segments import _sparse_segment_mean_with_num_segments_aicpu
from .sparse_segment_sum_with_num_segments import _sparse_segment_sum_with_num_segments_aicpu
from .sparse_softmax_cross_entropy_with_logits_v2 import _sparse_softmax_cross_entropy_with_logits_v2_aicpu
from .sparsesparsemaximum import _sparsesparsemaximum_aicpu
from .sparse_sparse_minimum import _sparse_sparse_minimum_aicpu
from .split import _split_aicpu
from .transpose import _transpose_aicpu
from .tridiagonal_matmul import _tridiagonal_matmul_aicpu
from .tril_indices import _tril_indices_aicpu
from .triu_indices import _triu_indices_aicpu
from .triplet_margin_loss import _triplet_margin_loss_aicpu
from .unravel_index import _unravel_index_aicpu
from .xlogy import _xlogy_aicpu
from .xdivy import _xdivy_aicpu
from .fused_sparse_proximal_adagrad import _fused_sparse_proximal_adagrad_aicpu
from .meshgrid import _meshgrid_aicpu
from .div import _div_aicpu