forked from mindspore-Ecosystem/mindspore
0103 aicpu migration first half
This commit is contained in:
parent
a023825aae
commit
540665dbbc
@@ -100,7 +100,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
@@ -292,30 +292,15 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
|
|||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
|
||||
|
@@ -323,11 +308,26 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
|
|||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_band_part.cc:aicpu::MatrixBandPartCpuKernel::BandCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeSameShape
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeXOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeYOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeMultiKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeOneKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeSameShape
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeXOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeYOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeMultiKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeOneKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc:aicpu::LuUnpackCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool_grad.cc:aicpu::FractionalMaxPoolGradCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool_grad.cc:aicpu::FractionalAvgPoolGradCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool.cc:aicpu::FractionalMaxPoolCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool.cc:aicpu::FractionalAvgPoolCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/densetosparsesetoperation.cc:aicpu::DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse
|
||||
|
|
|
@@ -226,6 +226,7 @@ constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
|
|||
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
|
||||
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
|
||||
constexpr auto kDeadNodeName = "DeadNode";
|
||||
constexpr auto kDenseToCSRSparseMatrixOpName = "DenseToCSRSparseMatrix";
|
||||
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
|
||||
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
|
||||
constexpr auto kDepthwiseConv2dNativeBackpropFilterOpName = "DepthwiseConv2dNativeBackpropFilter";
|
||||
|
@@ -242,7 +243,9 @@ constexpr auto kDiagPartOpName = "DiagPart";
|
|||
constexpr auto kDiagPartDOpName = "DiagPartD";
|
||||
constexpr auto kDiagOpName = "Diag";
|
||||
constexpr auto kDiagDOpName = "DiagD";
|
||||
constexpr auto kDiagonalOpName = "Diagonal";
|
||||
constexpr auto kDivOpName = "Div";
|
||||
constexpr auto kDivNoNanOpName = "DivNoNan";
|
||||
constexpr auto kDropoutDoMaskOpName = "DropoutDoMask";
|
||||
constexpr auto kDropOutDoMaskOpName = "DropOutDoMask";
|
||||
constexpr auto kDropoutDoMaskV3OpName = "DropoutDoMaskV3";
|
||||
|
@@ -257,6 +260,7 @@ constexpr auto kDynamicAtomicAddrCleanOpName = "DynamicAtomicAddrClean";
|
|||
constexpr auto kDynamicGRUV2OpName = "DynamicGRUV2";
|
||||
constexpr auto kDynamicRNNOpName = "DynamicRNN";
|
||||
constexpr auto kDynamicStitchOpName = "DynamicStitch";
|
||||
constexpr auto kEigOpName = "Eig";
|
||||
constexpr auto kEmbeddingLookupCommGradOpName = "EmbeddingLookupCommGrad";
|
||||
constexpr auto kEmbeddingLookupOpName = "EmbeddingLookup";
|
||||
constexpr auto kEmbeddingLookupProxyOpName = "EmbeddingLookupProxy";
|
||||
|
@@ -293,7 +297,12 @@ constexpr auto kFive2FourOpName = "Five2Four";
|
|||
constexpr auto kFlattenGradOpName = "FlattenGrad";
|
||||
constexpr auto kFloorDivOpName = "FloorDiv";
|
||||
constexpr auto kFour2FiveOpName = "Four2Five";
|
||||
constexpr auto kFractionalAvgPoolOpName = "FractionalAvgPool";
|
||||
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
|
||||
constexpr auto kFractionalMaxPoolOpName = "FractionalMaxPool";
|
||||
constexpr auto kFractionalMaxPoolGradOpName = "FractionalMaxPoolGrad";
|
||||
constexpr auto kFractionalMaxPoolGradWithFixedKsizeOpName = "FractionalMaxPoolGradWithFixedKsize";
|
||||
constexpr auto kFractionalMaxPoolWithFixedKsizeOpName = "FractionalMaxPoolWithFixedKsize";
|
||||
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
|
||||
constexpr auto kFusedAdaFactorWithGlobalNormName = "FusedAdaFactorWithGlobalNorm";
|
||||
constexpr auto kFusedAdamName = "FusedAdam";
|
||||
|
@@ -327,10 +336,12 @@ constexpr auto kGatherOpName = "Gather";
|
|||
constexpr auto kGatherNdOpName = "GatherNd";
|
||||
constexpr auto kGatherV2OpName = "GatherV2";
|
||||
constexpr auto kGatherV2DOpName = "GatherV2D";
|
||||
constexpr auto kGcdOpName = "Gcd";
|
||||
constexpr auto kGeLUOpName = "GeLU";
|
||||
constexpr auto kGeluOpName = "Gelu";
|
||||
constexpr auto kGeLUGradOpName = "GeLUGrad";
|
||||
constexpr auto kGeluGradOpName = "GeluGrad";
|
||||
constexpr auto kGeqrfOpName = "Geqrf";
|
||||
constexpr auto kGetNextOpName = "GetNext";
|
||||
constexpr auto kGreaterEqualOpName = "GreaterEqual";
|
||||
constexpr auto kGreaterOpName = "Greater";
|
||||
|
@@ -346,13 +357,21 @@ constexpr auto kHSigmoidOpName = "HSigmoid";
|
|||
constexpr auto kHardSigmoidOpName = "HardSigmoid";
|
||||
constexpr auto kHSigmoidGradOpName = "HSigmoidGrad";
|
||||
constexpr auto kHardSigmoidGradOpName = "HardSigmoidGrad";
|
||||
constexpr auto kHSVToRGBOpName = "HSVToRGB";
|
||||
constexpr auto kHSwishOpName = "HSwish";
|
||||
constexpr auto kHardSwishOpName = "HardSwish";
|
||||
constexpr auto kHistogramDOpName = "HistogramD";
|
||||
constexpr auto kHSwishGradOpName = "HSwishGrad";
|
||||
constexpr auto kHardSwishGradOpName = "HardSwishGrad";
|
||||
constexpr auto kHeavisideOpName = "Heaviside";
|
||||
constexpr auto kHostAllGatherOpName = "HostAllGather";
|
||||
constexpr auto kHostReduceScatterOpName = "HostReduceScatter";
|
||||
constexpr auto kHypotOpName = "Hypot";
|
||||
constexpr auto kIdentityNOpName = "IdentityN";
|
||||
constexpr auto kIgammaOpName = "Igamma";
|
||||
constexpr auto kIgammacOpName = "Igammac";
|
||||
constexpr auto kIgammaGradAOpName = "IgammaGradA";
|
||||
constexpr auto kIndexFillOpName = "IndexFill";
|
||||
constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue";
|
||||
constexpr auto kIOUOpName = "IOU";
|
||||
constexpr auto kIouOpName = "Iou";
|
||||
|
@@ -369,7 +388,6 @@ constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
|
|||
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
|
||||
constexpr auto kInTopKOpName = "InTopK";
|
||||
constexpr auto kInTopKDOpName = "InTopKD";
|
||||
constexpr auto kIsInfOpName = "IsInf";
|
||||
constexpr auto kIsNanOpName = "IsNan";
|
||||
constexpr auto kKLDivLossOpName = "KLDivLoss";
|
||||
constexpr auto kKLDivOpName = "KLDiv";
|
||||
|
@@ -395,6 +413,7 @@ constexpr auto kLayerNormBetaGammaBackpropV2OpName = "LayerNormBetaGammaBackprop
|
|||
constexpr auto kLayerNormGradOpName = "LayerNormGrad";
|
||||
constexpr auto kLayerNormXBackpropOpName = "LayerNormXBackprop";
|
||||
constexpr auto kLayerNormXBackpropV2OpName = "LayerNormXBackpropV2";
|
||||
constexpr auto kLcmOpName = "Lcm";
|
||||
constexpr auto kLessEqualOpName = "LessEqual";
|
||||
constexpr auto kLessOpName = "Less";
|
||||
constexpr auto kLinSpaceOpName = "LinSpace";
|
||||
|
@@ -403,14 +422,21 @@ constexpr auto kListDiffOpName = "ListDiff";
|
|||
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
|
||||
constexpr auto kLogOpName = "Log";
|
||||
constexpr auto kLog1pOpName = "Log1p";
|
||||
constexpr auto kLogicalXorOpName = "LogicalXor";
|
||||
constexpr auto kLogitOpName = "Logit";
|
||||
constexpr auto kLogitGradOpName = "LogitGrad";
|
||||
constexpr auto kLogNormalReverseOpName = "LogNormalReverse";
|
||||
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
|
||||
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
|
||||
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
|
||||
constexpr auto kLowerBoundOpName = "LowerBound";
|
||||
constexpr auto kLpNormOpName = "LpNorm";
|
||||
constexpr auto kLSTMGradOpName = "LSTMGrad";
|
||||
constexpr auto kLSTMInputGradOpName = "LSTMInputGrad";
|
||||
constexpr auto kLSTMOpName = "LSTM";
|
||||
constexpr auto kLstsqOpName = "Lstsq";
|
||||
constexpr auto kLuUnpackOpName = "LuUnpack";
|
||||
constexpr auto kLuUnpackGradOpName = "LuUnpackGrad";
|
||||
constexpr auto kMaskedFillOpName = "MaskedFill";
|
||||
constexpr auto kMaskedSelectOpName = "MaskedSelect";
|
||||
constexpr auto kMaskedSelectGradOpName = "MaskedSelectGrad";
|
||||
|
@@ -423,6 +449,7 @@ constexpr auto kMatrixDiagDOpName = "MatrixDiagD";
|
|||
constexpr auto kMatrixDiagPartOpName = "MatrixDiagPart";
|
||||
constexpr auto kMatrixDiagPartDOpName = "MatrixDiagPartD";
|
||||
constexpr auto kMatrixDiagPartV3OpName = "MatrixDiagPartV3";
|
||||
constexpr auto kMatrixExpOpName = "MatrixExp";
|
||||
constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
|
||||
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
|
||||
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
|
||||
|
|
|
@@ -0,0 +1,150 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dense_to_csr_sparse_matrix.h"
|
||||
#include <complex>
|
||||
#include <numeric>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 5;
|
||||
const char *DenseToCSRSparseMatrix = "DenseToCSRSparseMatrix";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DenseToCSRSparseMatrixCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "DenseToCSRSparseMatrix normal check failed.");
|
||||
DataType value_type = ctx.Input(0)->GetDataType();
|
||||
DataType indice_type = ctx.Input(1)->GetDataType();
|
||||
uint32_t status;
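// Dispatch on the indices dtype (int32 / int64) first, then on the values dtype.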
|
||||
switch (indice_type) {
|
||||
case DT_INT32:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = ComputeKernel<int32_t, float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = ComputeKernel<int32_t, double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = ComputeKernel<int32_t, std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = ComputeKernel<int32_t, std::complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = ComputeKernel<int64_t, float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = ComputeKernel<int64_t, double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = ComputeKernel<int64_t, std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = ComputeKernel<int64_t, std::complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix indices type [%s] not support.", DTypeStr(indice_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(status, "DenseToCSRSparseMatrix kernel compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(DenseToCSRSparseMatrix, DenseToCSRSparseMatrixCpuKernel);
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t DenseToCSRSparseMatrixCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
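// Outputs: y_dense_shape, y_batch_pointers, y_row_pointers, y_col_indices and y_values of the CSR matrix.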
|
||||
auto dense_input_ptr = reinterpret_cast<valueT *>(ctx.Input(0)->GetData());
|
||||
auto indices_ptr = reinterpret_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
auto y_dense_shape_ptr = reinterpret_cast<indiceT *>(ctx.Output(0)->GetData());
|
||||
auto y_batch_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(1)->GetData());
|
||||
auto y_row_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(2)->GetData());
|
||||
auto y_col_indices_ptr = reinterpret_cast<indiceT *>(ctx.Output(3)->GetData());
|
||||
auto y_values_ptr = reinterpret_cast<valueT *>(ctx.Output(4)->GetData());
|
||||
// Copy the CSRSparseMatrix's dense_shape and values from the Dense.
|
||||
const int64_t rank = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
|
||||
const int64_t total_nnz = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t batch_size = (rank == 2) ? 1 : ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t num_rows = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 0 : 1);
|
||||
const int64_t num_cols = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 1 : 2);
|
||||
for (int64_t i = 0; i < rank; i++) {
|
||||
y_dense_shape_ptr[i] = ctx.Input(0)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int64_t i = 0; i < total_nnz; i++) {
|
||||
if (rank == 2) {
|
||||
int64_t cur_idx = indices_ptr[i * rank] * num_cols + indices_ptr[i * rank + 1];
|
||||
y_values_ptr[i] = dense_input_ptr[cur_idx];
|
||||
} else {
|
||||
int64_t cur_idx = indices_ptr[i * rank] * num_rows * num_cols;
|
||||
cur_idx = cur_idx + indices_ptr[i * rank + 1] * num_cols + indices_ptr[i * rank + 2];
|
||||
y_values_ptr[i] = dense_input_ptr[cur_idx];
|
||||
}
|
||||
}
|
||||
for (int64_t i = 0; i < batch_size * (num_rows + 1); i++) {
|
||||
y_row_pointers_ptr[i] = 0;
|
||||
}
|
||||
int prev_batch = -1;
|
||||
if (rank == 2) {
|
||||
// For a single batch, the batch_ptrs are {0, total_nnz}.
|
||||
y_batch_pointers_ptr[0] = 0;
|
||||
++prev_batch;
|
||||
for (int64_t i = 0; i < total_nnz; ++i) {
|
||||
// For now, the rows pointers store the corresponding row counts.
|
||||
y_row_pointers_ptr[indices_ptr[i * rank] + 1] += 1;
|
||||
y_col_indices_ptr[i] = indices_ptr[i * rank + 1];
|
||||
}
|
||||
} else { // rank == 3
|
||||
for (int64_t i = 0; i < total_nnz; ++i) {
|
||||
const int cur_batch = indices_ptr[i * rank];
|
||||
// For now, the rows pointers store the corresponding row counts.
|
||||
y_row_pointers_ptr[cur_batch * (num_rows + 1) + indices_ptr[i * rank + 1] + 1] += 1;
|
||||
y_col_indices_ptr[i] = indices_ptr[i * rank + 2];
|
||||
// We're at a new batch and might have skipped over empty batches.
|
||||
while (prev_batch < cur_batch) {
|
||||
// The previous batch ends at position i.
|
||||
y_batch_pointers_ptr[prev_batch + 1] = i;
|
||||
++prev_batch;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set the last element of batch_ptr and account for trailing empty batches.
|
||||
while (prev_batch < batch_size) {
|
||||
y_batch_pointers_ptr[prev_batch + 1] = total_nnz;
|
||||
++prev_batch;
|
||||
}
|
||||
// Compute the cumulative row counts for each batch.
|
||||
for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
auto *row_ptr_batch = y_row_pointers_ptr + batch_idx * (num_rows + 1);
|
||||
std::partial_sum(row_ptr_batch, row_ptr_batch + num_rows + 1, row_ptr_batch);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class DenseToCSRSparseMatrixCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~DenseToCSRSparseMatrixCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,429 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "densetosparsesetoperation.h"
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const char *kDenseToSparseSetOperation = "DenseToSparseSetOperation";
|
||||
const uint32_t kOutputNum = 3;
|
||||
const uint32_t kInputNum = 4;
|
||||
constexpr int64_t kIndex0 = 0;
|
||||
constexpr int64_t kIndex1 = 1;
|
||||
constexpr int64_t kIndex2 = 2;
|
||||
constexpr int64_t kIndex3 = 3;
|
||||
const int64_t kParallelNum{64};
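// Groups are processed serially when their count does not exceed kParallelNum; larger workloads use ParallelFor.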
|
||||
} // namespace
|
||||
// Define namespace aicpu.
|
||||
namespace aicpu {
|
||||
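// Returns the row-major strides of a dense shape, e.g. {2, 3, 4} -> {12, 4, 1}.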
const std::vector<int64_t> Strides(const std::vector<int64_t> &shape) {
|
||||
std::vector<int64_t> result(shape.size());
|
||||
int64_t product = 1;
|
||||
for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 0; --i) {
|
||||
result[i] = product;
|
||||
product *= shape[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
uint32_t GroupShape(const std::vector<int64_t> input_shape, std::vector<int64_t> &grouped_shape) {
|
||||
if (input_shape.size() < 2) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// grouped_shape is input_shape[:-1]
|
||||
grouped_shape.assign(input_shape.begin(), input_shape.end() - 1);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t CheckShapesMatch(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2) {
|
||||
if (shape1.size() != shape2.size()) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t i = 0; i < shape1.size(); i++) {
|
||||
if (shape1[i] != shape2[i]) return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t GroupShapeFromInputs(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2,
|
||||
std::vector<int64_t> &group_shape) {
|
||||
std::vector<int64_t> group_shape_1;
|
||||
KERNEL_HANDLE_ERROR(GroupShape(shape1, group_shape_1), "X1_Shape rank is less than 2.");
|
||||
std::vector<int64_t> group_shape_2;
|
||||
KERNEL_HANDLE_ERROR(GroupShape(shape2, group_shape_2), "X2_Shape rank is less than 2.");
|
||||
KERNEL_HANDLE_ERROR(CheckShapesMatch(group_shape_1, group_shape_2), "Two shapes mismatch with each other.");
|
||||
group_shape.assign(group_shape_1.begin(), group_shape_1.end());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t GetNumElements(const std::vector<int64_t> input_shape, int64_t &res) {
|
||||
int64_t result = 1;
|
||||
for (uint32_t i = 0; i < input_shape.size(); i++) {
|
||||
KERNEL_CHECK_FALSE(MulWithoutOverflow(input_shape[i], result, result), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Overflow when calculate shape size.");
|
||||
}
|
||||
res = result;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
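// Decomposes a flat group index into per-dimension group indices (row-major order).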
void DenseToSparseSetOperationCpuKernel::PopulateGroupIndices(const int64_t flat_group_index,
|
||||
const std::vector<int64_t> &group_shape,
|
||||
std::vector<int64_t> &group_indices) {
|
||||
group_indices.clear();
|
||||
int64_t running_flat_group_index = flat_group_index;
|
||||
for (int64_t group_dim_index = static_cast<int64_t>(group_shape.size()) - 1; group_dim_index >= 0;
|
||||
--group_dim_index) {
|
||||
const auto group_dim = group_shape[group_dim_index];
|
||||
group_indices.insert(group_indices.begin(), running_flat_group_index % group_dim);
|
||||
running_flat_group_index /= group_dim;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromDenseGroup(Tensor *input_tensor,
|
||||
const std::vector<int64_t> &input_strides,
|
||||
const std::vector<int64_t> &group_indices,
|
||||
std::set<T> &result) {
|
||||
result.clear();
|
||||
EigenTensor input_tensor_eigen(input_tensor, input_tensor->GetData());
|
||||
auto input_flat = input_tensor_eigen.flat<T>();
|
||||
const auto start = std::inner_product(group_indices.begin(), group_indices.end(), input_strides.begin(), 0LL);
|
||||
auto input_shape = input_tensor->GetTensorShape();
|
||||
const auto end = start + input_shape->GetDimSize(input_shape->GetDims() - 1);
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
result.insert(input_flat(i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromSparseGroup(const Group &group,
|
||||
const std::vector<int64_t> &sparse_tensor_shape,
|
||||
std::set<T> &result) {
|
||||
KERNEL_HANDLE_ERROR(CheckGroup<T>(group, sparse_tensor_shape), "PopulateFromSparseGroup check error.");
|
||||
result.clear();
|
||||
const auto &group_values = group.values<T>();
|
||||
for (int64_t i = 0; i < group_values.size(); ++i) {
|
||||
result.insert(group_values(i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::CheckGroup(const Group &group,
|
||||
const std::vector<int64_t> &sparse_tensor_shape) {
|
||||
const auto &indices = group.indices();
|
||||
const auto &values = group.values<T>();
|
||||
const auto num_values = values.dimension(0);
|
||||
|
||||
// Sanity check: valid indices.
|
||||
const uint32_t expected_rank = sparse_tensor_shape.size();
|
||||
for (uint32_t j = 0; j < expected_rank; ++j) {
|
||||
const auto dim_size = sparse_tensor_shape[j];
|
||||
KERNEL_CHECK_FALSE(dim_size > 0, KERNEL_STATUS_PARAM_INVALID, "Invalid dim_size [%d] for index [%d]", dim_size, j);
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
const auto index = indices(i, j);
|
||||
KERNEL_CHECK_FALSE(dim_size > index, KERNEL_STATUS_PARAM_INVALID,
|
||||
"indices index ([%d],[%d]) expected < [%d], got [%d].", i, j, dim_size, index);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DenseToSparseSetOperationCpuKernel::ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2,
|
||||
std::set<T> &result, SetOperation set_operation_) {
|
||||
switch (set_operation_) {
|
||||
case A_MINUS_B:
|
||||
std::set_difference(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case B_MINUS_A:
|
||||
std::set_difference(set2.begin(), set2.end(), set1.begin(), set1.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case INTERSECTION:
|
||||
std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case UNION:
|
||||
std::set_union(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::OutputSparseTensor(
|
||||
DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
|
||||
const std::map<std::vector<int64_t>, std::set<T>> &sets) {
|
||||
Tensor *out_indices, *out_values, *out_shape;
|
||||
out_indices = databank.result_indices;
|
||||
out_values = databank.result_values;
|
||||
out_shape = databank.result_shape;
|
||||
|
||||
EigenTensor out_indices_t(out_indices, out_indices->GetData());
|
||||
auto out_indices_mat = out_indices_t.matrix<int64_t>();
|
||||
EigenTensor out_values_t(out_values, out_values->GetData());
|
||||
auto out_values_flat = out_values_t.vec<T>();
|
||||
EigenTensor out_shape_t(out_shape, out_shape->GetData());
|
||||
auto out_shape_flat = out_shape_t.vec<int64_t>();
|
||||
|
||||
int64_t value_index = 0;
|
||||
for (auto it = sets.begin(); it != sets.end(); ++it) {
|
||||
const auto &group_indices = it->first;
|
||||
KERNEL_CHECK_FALSE(group_indices.size() == output_shape.size() - 1, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid number of indices [%d] expected [%].", group_indices.size(), output_shape.size() - 1)
|
||||
const auto &set = it->second;
|
||||
|
||||
// For each set item, write its indices and value to output tensors.
|
||||
int64_t group_value_index = 0;
|
||||
for (auto value = set.begin(); value != set.end(); ++value, ++value_index, ++group_value_index) {
|
||||
// First n-1 dimensions are the group, last dimension is the position in
|
||||
// the set.
|
||||
for (uint32_t i = 0; i < group_indices.size(); ++i) {
|
||||
out_indices_mat(value_index, i) = group_indices[i];
|
||||
}
|
||||
out_indices_mat(value_index, group_indices.size()) = group_value_index;
|
||||
|
||||
out_values_flat(value_index) = *value;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < output_shape.size(); ++i) {
|
||||
out_shape_flat(i) = output_shape[i];
|
||||
}
|
||||
|
||||
out_indices->GetTensorShape()->SetDimSizes({num_values, static_cast<int64_t>(output_shape.size())});
|
||||
out_values->GetTensorShape()->SetDimSizes({num_values});
|
||||
out_shape->GetTensorShape()->SetDimSizes({static_cast<int64_t>(output_shape.size())});
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
|
||||
databank.set1 = ctx.Input(kIndex0);
|
||||
databank.set2_indices = ctx.Input(kIndex1);
|
||||
databank.set2_values = ctx.Input(kIndex2);
|
||||
databank.set2_shape = ctx.Input(kIndex3);
|
||||
databank.result_indices = ctx.Output(kIndex0);
|
||||
databank.result_values = ctx.Output(kIndex1);
|
||||
databank.result_shape = ctx.Output(kIndex2);
|
||||
databank.ctx = &ctx;
|
||||
AttrValue *validate_indices = ctx.GetAttr("validate_indices");
|
||||
if (validate_indices == nullptr) {
|
||||
databank.validate_indices_ = true;
|
||||
} else {
|
||||
databank.validate_indices_ = validate_indices->GetBool();
|
||||
}
|
||||
AttrValue *set_operation = ctx.GetAttr("set_operation");
|
||||
KERNEL_CHECK_NULLPTR(set_operation, KERNEL_STATUS_PARAM_INVALID, "Missing set_operation.")
|
||||
std::string set_operation_str = set_operation->GetString();
|
||||
std::transform(set_operation_str.begin(), set_operation_str.end(), set_operation_str.begin(), ::tolower);
|
||||
if ("a-b" == set_operation_str) {
|
||||
databank.set_operation_ = A_MINUS_B;
|
||||
} else if ("b-a" == set_operation_str) {
|
||||
databank.set_operation_ = B_MINUS_A;
|
||||
} else if ("intersection" == set_operation_str) {
|
||||
databank.set_operation_ = INTERSECTION;
|
||||
} else if ("union" == set_operation_str) {
|
||||
databank.set_operation_ = UNION;
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("Invalid set_operation.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse(DataBank &databank) {
|
||||
EigenTensor set2_shape_e(databank.set2_shape, databank.set2_shape->GetData());
|
||||
auto set2_shape = set2_shape_e.vec<int64_t>();
|
||||
std::vector<int64_t> shape2(set2_shape.size());
|
||||
for (int64_t i = 0; i < set2_shape.size(); ++i) {
|
||||
shape2[i] = set2_shape(i);
|
||||
}
|
||||
const auto rank = shape2.size();
|
||||
std::vector<int64_t> order(rank);
|
||||
std::iota(order.begin(), order.end(), 0);
|
||||
SparseTensor set2;
|
||||
|
||||
Tensor *set1_t = databank.set1;
|
||||
SparseTensor *set2_st = &set2;
|
||||
KERNEL_HANDLE_ERROR(set2_st->CreateSparseTensor(databank.set2_indices, databank.set2_values, shape2, order),
|
||||
"create sparse tenser fail.");
|
||||
if (databank.validate_indices_) {
|
||||
KERNEL_HANDLE_ERROR(set2_st->IndicesValid(*databank.ctx), "IndicesValid failed.");
|
||||
}
|
||||
std::vector<int64_t> group_shape;
|
||||
const auto shape1 = set1_t->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_HANDLE_ERROR(GroupShapeFromInputs(shape1, shape2, group_shape), "GroupShapeFromInputs error.");
|
||||
const std::vector<int64_t> set1_strides = Strides(shape1);
|
||||
std::map<std::vector<int64_t>, std::set<T>> group_sets;
|
||||
int64_t num_result_values = 0;
|
||||
int64_t max_set_size = 0;
|
||||
int64_t num_elements;
|
||||
KERNEL_HANDLE_ERROR(GetNumElements(group_shape, num_elements), "NumElements error.");
|
||||
if (num_elements <= kParallelNum) {
|
||||
std::set<T> set1_group_set;
|
||||
std::set<T> set2_group_set;
|
||||
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
|
||||
auto set2_grouper = set2_st->group(subspan);
|
||||
auto set2_group_it = set2_grouper.begin();
|
||||
std::vector<int64_t> group_indices;
|
||||
for (int64_t flat_group_index = 0; flat_group_index < num_elements; ++flat_group_index) {
|
||||
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
|
||||
|
||||
// Get values from set1.
|
||||
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
|
||||
// Get values from set2, if applicable.
|
||||
set2_group_set.clear();
|
||||
if (set2_group_it != set2_grouper.end()) {
|
||||
const auto &group = *set2_group_it;
|
||||
const auto set2_group_indices = group.group();
|
||||
bool group_match = true;
|
||||
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
|
||||
if (set2_group_indices[i] != group_indices[i]) {
|
||||
group_match = false;
|
||||
}
|
||||
}
|
||||
if (group_match) {
|
||||
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
|
||||
"PopulateFromSparseGroup error.");
|
||||
++set2_group_it;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<T> group_set;
|
||||
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
|
||||
if (!group_set.empty()) {
|
||||
group_sets[group_indices] = group_set;
|
||||
int64_t set_size = group_set.size();
|
||||
if (set_size > max_set_size) {
|
||||
max_set_size = set_size;
|
||||
}
|
||||
num_result_values += set_size;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::mutex mt;
|
||||
int64_t total = num_elements;
|
||||
uint32_t cores = CpuKernelUtils::GetCPUNum(*databank.ctx);
|
||||
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
|
||||
uint32_t ret =
|
||||
CpuKernelUtils::ParallelFor(*databank.ctx, total, per_unit_size, [&](int64_t begin, int64_t end) -> uint32_t {
|
||||
std::set<T> set1_group_set;
|
||||
std::set<T> set2_group_set;
|
||||
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
|
||||
auto set2_grouper = set2_st->group(subspan);
|
||||
auto set2_group_it = set2_grouper.begin();
|
||||
std::vector<int64_t> group_indices;
|
||||
for (int64_t flat_group_index = begin; flat_group_index < end; ++flat_group_index) {
|
||||
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
|
||||
|
||||
// Get values from set1.
|
||||
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
|
||||
// Get values from set2, if applicable.
|
||||
set2_group_set.clear();
|
||||
if (set2_group_it != set2_grouper.end()) {
|
||||
const auto &group = *set2_group_it;
|
||||
const auto set2_group_indices = group.group();
|
||||
bool group_match = true;
|
||||
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
|
||||
if (set2_group_indices[i] != group_indices[i]) {
|
||||
group_match = false;
|
||||
}
|
||||
}
|
||||
if (group_match) {
|
||||
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
|
||||
"PopulateFromSparseGroup error.");
|
||||
++set2_group_it;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<T> group_set;
|
||||
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
|
||||
if (!group_set.empty()) {
|
||||
std::lock_guard<std::mutex> lck(mt);
|
||||
group_sets[group_indices] = group_set;
|
||||
int64_t set_size = group_set.size();
|
||||
if (set_size > max_set_size) {
|
||||
max_set_size = set_size;
|
||||
}
|
||||
num_result_values += set_size;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
});
|
||||
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR,
|
||||
"DenseToSparseSetOperation compute failed.");
|
||||
}
|
||||
|
||||
group_shape.push_back(max_set_size);
|
||||
return OutputSparseTensor<T>(databank, group_shape, num_result_values, group_sets);
|
||||
}
|
||||
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"DenseToSparseSetOperation check input and output number failed.");
|
||||
DataBank databank;
|
||||
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "DenseToSparseSetOperation check params failed.");
|
||||
DataType dt = static_cast<DataType>(databank.set2_values->GetDataType());
|
||||
|
||||
uint32_t KERNEL_STATUS;
|
||||
switch (dt) {
|
||||
case DT_INT8:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int8_t>(databank);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<uint8_t>(databank);
|
||||
break;
|
||||
case DT_INT16:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int16_t>(databank);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<uint16_t>(databank);
|
||||
break;
|
||||
case DT_INT32:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int32_t>(databank);
|
||||
break;
|
||||
case DT_INT64:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int64_t>(databank);
|
||||
break;
|
||||
case DT_STRING:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<std::string>(databank);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToSparseSetOperation can't support this data type [%s].", DTypeStr(dt).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("DenseToSparseSetOperation failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDenseToSparseSetOperation, DenseToSparseSetOperationCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <set>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/sparse_group.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
// Define namespace aicpu.
|
||||
|
||||
namespace aicpu {
|
||||
enum SetOperation { A_MINUS_B = 0, B_MINUS_A = 1, INTERSECTION = 2, UNION = 3 };
|
||||
struct DataBank {
|
||||
DataBank()
|
||||
: set1(nullptr),
|
||||
set2_indices(nullptr),
|
||||
set2_values(nullptr),
|
||||
set2_shape(nullptr),
|
||||
result_indices(nullptr),
|
||||
result_values(nullptr),
|
||||
result_shape(nullptr) {}
|
||||
Tensor *set1;
|
||||
Tensor *set2_indices;
|
||||
Tensor *set2_values;
|
||||
Tensor *set2_shape;
|
||||
Tensor *result_indices;
|
||||
Tensor *result_values;
|
||||
Tensor *result_shape;
|
||||
SetOperation set_operation_;
|
||||
bool validate_indices_;
|
||||
CpuKernelContext *ctx;
|
||||
};
|
||||
|
||||
// The operator class inherits from the CpuKernel base class.
|
||||
class DenseToSparseSetOperationCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~DenseToSparseSetOperationCpuKernel() = default;
|
||||
DenseToSparseSetOperationCpuKernel() = default;
|
||||
// Declare the Compute function; it must be overridden.
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeDenseToSparse(DataBank &databank);
|
||||
|
||||
template <typename T>
|
||||
uint32_t CheckGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape);
|
||||
|
||||
template <typename T>
|
||||
uint32_t PopulateFromSparseGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape,
|
||||
std::set<T> &result);
|
||||
template <typename T>
|
||||
uint32_t PopulateFromDenseGroup(Tensor *input_tensor, const std::vector<int64_t> &input_strides,
|
||||
const std::vector<int64_t> &group_indices, std::set<T> &result);
|
||||
|
||||
void PopulateGroupIndices(const int64_t flat_group_index, const std::vector<int64_t> &group_shape,
|
||||
std::vector<int64_t> &group_indices);
|
||||
|
||||
template <typename T>
|
||||
void ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2, std::set<T> &result,
|
||||
SetOperation set_operation_);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OutputSparseTensor(DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
|
||||
const std::map<std::vector<int64_t>, std::set<T>> &sets);
|
||||
};
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,121 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "diag.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kDiag = "Diag";
|
||||
constexpr int64_t kParallelDataNums = 80 * 32;
|
||||
constexpr int64_t kParallelDataNumsMid = 8 * 1024;
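// Thresholds in bytes: inputs up to kParallelDataNums run single-threaded; up to kParallelDataNumsMid use at most 4 cores.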
|
||||
|
||||
#define DIAG_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DiagCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Diag kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DiagCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiag);
|
||||
KERNEL_HANDLE_ERROR(DiagCheck(ctx), "[%s] check params failed.", kDiag);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
DIAG_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Diag kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagCpuKernel::DiagCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output tensor shape failed.")
|
||||
|
||||
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input must be at least rank 1, got [%zu].", shape_input.size())
|
||||
KERNEL_CHECK_FALSE((shape_output.size() == shape_input.size() * 2), KERNEL_STATUS_PARAM_INVALID,
                   "The output shape size should be twice the input shape size, "
                   "but the input shape size is [%zu] and the output shape size is [%zu].",
                   shape_input.size(), shape_output.size())
|
||||
for (size_t i = 0; i < shape_output.size(); ++i) {
|
||||
KERNEL_CHECK_FALSE((shape_input[i % shape_input.size()] == shape_output[i]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
|
||||
"the output dimension [%zu] size [%zu].",
|
||||
i % shape_input.size(), shape_input[i % shape_input.size()], i, shape_output[i])
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagCpuKernel::DiagCompute(CpuKernelContext &ctx) {
|
||||
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
int64_t size = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = size * sizeof(T);
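// The output is a size x size matrix: zero-fill it and copy the input onto the main diagonal (stride size + 1).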
|
||||
|
||||
if (data_size <= kParallelDataNums) {
|
||||
std::fill(output, output + size * size, T());
|
||||
for (int64_t index = 0; index < size; index++) {
|
||||
*(output + (1 + size) * index) = *(input + index);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > size) {
|
||||
max_core_num = size;
|
||||
}
|
||||
auto shard_diag = [&](int64_t start, int64_t end) {
|
||||
std::fill(output + size * start, output + size * end, T());
|
||||
for (int64_t index = start; index < end; index++) {
|
||||
*(output + (1 + size) * index) = *(input + index);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, size, size / max_core_num, shard_diag),
|
||||
"Diag Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiag, DiagCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -1,5 +1,5 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,23 +13,25 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_IS_INF_H_
#define AICPU_KERNELS_NORMALIZED_IS_INF_H_
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_H_
#define AICPU_KERNELS_NORMALIZED_DIAG_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class IsInfCpuKernel : public CpuKernel {
class DiagCpuKernel : public CpuKernel {
 public:
  IsInfCpuKernel() = default;
  ~IsInfCpuKernel() override = default;
  DiagCpuKernel() = default;
  ~DiagCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t IsInfCheck(const CpuKernelContext &ctx) const;
  uint32_t DiagCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t IsInfCompute(const CpuKernelContext &ctx);
  uint32_t DiagCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "diag_part.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kDiagPart = "DiagPart";
|
||||
|
||||
#define DIAGPART_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DiagPartCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("DiagPart kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DiagPartCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiagPart);
|
||||
KERNEL_HANDLE_ERROR(DiagPartCheck(ctx), "[%s] check params failed.", kDiagPart);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
DIAGPART_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DiagPart kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagPartCpuKernel::DiagPartCheck(CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE((shape_input.size() % 2 == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The rank of the tensor should be even and positive.");
|
||||
for (size_t i = 0; i < shape_output.size(); i++) {
|
||||
KERNEL_CHECK_FALSE((shape_input[i] == shape_input[i + shape_output.size()]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
|
||||
"the input dimension [%zu] size [%zu].",
|
||||
i, shape_input[i], i + shape_output.size(), shape_input[i + shape_output.size()]);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagPartCpuKernel::DiagPartCompute(CpuKernelContext &ctx) {
|
||||
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
uint64_t size = ctx.Output(0)->NumElements();
|
||||
for (size_t index = 0; index < size; index++) {
|
||||
*(output + index) = *(input + (1 + size) * index);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiagPart, DiagPartCpuKernel);
|
||||
} // namespace aicpu
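
The Diag and DiagPart kernels above are inverses over flattened indices: Diag scatters input element i to flat offset (1 + size) * i of a zero-filled output, and DiagPart gathers the same offsets back, with size taken from the input element count (Diag) or the output element count (DiagPart). A minimal standalone sketch of that mapping, assuming plain std::vector buffers rather than the AICPU Tensor API:

#include <cstdint>
#include <vector>

// Diag: scatter in[i] onto the main diagonal of an n*n zero-filled buffer.
std::vector<double> DiagSketch(const std::vector<double> &in) {
  const int64_t n = static_cast<int64_t>(in.size());
  std::vector<double> out(n * n, 0.0);  // mirrors the std::fill step in the kernel
  for (int64_t i = 0; i < n; ++i) {
    out[(1 + n) * i] = in[i];
  }
  return out;
}

// DiagPart: gather the diagonal back; n is the output element count.
std::vector<double> DiagPartSketch(const std::vector<double> &in, int64_t n) {
  std::vector<double> out(n);
  for (int64_t i = 0; i < n; ++i) {
    out[i] = in[(1 + n) * i];
  }
  return out;
}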
|
|
@@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class DiagPartCpuKernel : public CpuKernel {
|
||||
public:
|
||||
DiagPartCpuKernel() = default;
|
||||
~DiagPartCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t DiagPartCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagPartCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,227 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "diagonal.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#define N2 2
|
||||
#define N3 3
|
||||
#define N4 4
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const size_t kOutputNum = 1;
|
||||
const size_t kInputNum = 1;
|
||||
const char *kDiagonal = "Diagonal";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 400;
|
||||
const int64_t kParallelDataNumMid = 2 * 1024;
|
||||
const uint32_t min_core_num = 1;
|
||||
|
||||
template <typename T>
|
||||
T mul_sum(std::vector<T> v1, std::vector<T> v2) {
|
||||
T output = 0;
|
||||
if (v1.size() != v2.size()) {
|
||||
return static_cast<T>(0);
|
||||
} else {
|
||||
for (unsigned int i = 0; i < v1.size(); i++) {
|
||||
output += v1[i] * v2[i];
|
||||
}
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> construct_stride(std::vector<T> t_shape) {
|
||||
std::vector<T> t_stride(t_shape.size(), 1);
|
||||
int initial = 1;
|
||||
for (unsigned int i = t_shape.size(); i > 0; i--) {
|
||||
t_stride[i - 1] = initial;
|
||||
initial = initial * t_shape[i - 1];
|
||||
}
|
||||
return t_stride;
|
||||
}
|
||||
|
||||
int64_t diag_size(const int64_t &offset, const int64_t &dim1, const int64_t &dim2, std::vector<int64_t> x_shape) {
|
||||
int64_t dsize = 0;
|
||||
if (offset >= 0) {
|
||||
dsize = std::max<int64_t>(std::min(x_shape.at(dim1), x_shape.at(dim2) - offset), 0);
|
||||
} else {
|
||||
dsize = std::max<int64_t>(std::min(x_shape.at(dim1) + offset, x_shape.at(dim2)), 0);
|
||||
}
|
||||
return dsize;
|
||||
}
|
||||
|
||||
int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr) {
|
||||
if (dim < 0) {
|
||||
dim += dim_post_expr;
|
||||
}
|
||||
return dim;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T get_data(int64_t basepos, int64_t offset, int64_t *ar, T *dptr) {
|
||||
if (offset >= 0) {
|
||||
return dptr[basepos + offset * ar[1]];
|
||||
} else {
|
||||
return dptr[basepos - offset * ar[0]];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> construct_index(int num, std::vector<T> &stride) {
|
||||
std::vector<T> idx;
|
||||
int tmp_num = num;
|
||||
for (uint32_t i = 0; i < stride.size(); i++) {
|
||||
idx.push_back(tmp_num / stride[i]);
|
||||
tmp_num = tmp_num % stride[i];
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
void DiagonalCpuKernel::set_output(int64_t *ar, T *dptr, T *y_dptr) {
|
||||
for (int i = 0; i < dsize; i++) {
|
||||
y_dptr[ar[N3] + i] = get_data(ar[N2] + i * (ar[0] + ar[1]), offset_, ar, dptr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagonalCpuKernel::DoComputeType(CpuKernelContext &ctx) {
|
||||
// Get the input and output
|
||||
Tensor *input_x = ctx.Input(0);
|
||||
// Get some information of input
|
||||
int32_t x_NumE = input_x->NumElements();
|
||||
auto x_shape = input_x->GetTensorShape();
|
||||
std::vector<int64_t> x_shape_ = x_shape->GetDimSizes();
|
||||
const int64_t x_dim = x_shape->GetDims();
|
||||
auto dataptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_dataptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
// Compute
|
||||
dsize = diag_size(offset_, dim1_, dim2_, x_shape_);
|
||||
std::vector<int64_t> x_stride = construct_stride<int64_t>(x_shape_);
|
||||
if (x_dim != N2 && x_NumE > 0) {
|
||||
// set the vx_shape and vx_stride, which is x_shape_ and x_stride of
|
||||
// position dim1_ and dim2_ removed.
|
||||
std::vector<int64_t> vx_shape, vx_stride;
|
||||
for (unsigned int tmp_dim = 0; tmp_dim < x_shape_.size(); tmp_dim++) {
|
||||
if (tmp_dim != dim1_ && tmp_dim != dim2_) {
|
||||
vx_shape.push_back(x_shape_[tmp_dim]);
|
||||
vx_stride.push_back(x_stride[tmp_dim]);
|
||||
}
|
||||
}
|
||||
// set the y_shape (the output shape), y_stride(the output stride),
|
||||
// vy_stride(the y_stride without the last dim)
|
||||
std::vector<int64_t> y_shape = vx_shape;
|
||||
y_shape.push_back(dsize);
|
||||
std::vector<int64_t> y_stride = construct_stride<int64_t>(y_shape);
|
||||
std::vector<int64_t> vy_stride = y_stride;
|
||||
vy_stride.pop_back();
|
||||
// diagonal
|
||||
int32_t task_num = x_NumE / x_shape_[dim1_] / x_shape_[dim2_];
|
||||
if (task_num >= kParallelDataNum) {
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (task_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(N4));
|
||||
}
|
||||
max_core_num = max_core_num > task_num ? task_num : max_core_num;
|
||||
auto sharder_diagonal = [&](int64_t start, int64_t end) {
|
||||
for (int64_t j = start; j < end; j++) {
|
||||
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
|
||||
auto p = construct_index<int64_t>(j, v_s_stride);
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
|
||||
mul_sum<int64_t>(p, vy_stride)};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
}
|
||||
};
|
||||
if (max_core_num != 0) {
|
||||
int64_t per_unit = task_num / max_core_num;
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, task_num, per_unit, sharder_diagonal), "Diagonal failed.");
|
||||
}
|
||||
} else {
|
||||
for (int64_t j = 0; j < task_num; j++) {
|
||||
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
|
||||
auto p = construct_index<int64_t>(j, v_s_stride);
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
|
||||
mul_sum<int64_t>(p, vy_stride)};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
}
|
||||
}
|
||||
} else if (x_dim == N2) {
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], 0, 0};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
} else {
|
||||
y_dataptr = dataptr;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagonalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// Check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Diagonal check input and output number failed.");
|
||||
// Get the input
|
||||
Tensor *input_x = ctx.Input(0);
|
||||
auto input_size = input_x->GetTensorShape()->GetDims();
|
||||
// Check the input dims
|
||||
if (input_size < N2) {
|
||||
KERNEL_LOG_ERROR("[Diagonal]: the input tensor must is at least 2-dimensional.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// Get the attr
|
||||
AttrValue *offset = ctx.GetAttr("offset");
|
||||
offset_ = (offset == nullptr) ? 0 : (offset->GetInt());
|
||||
AttrValue *dim1 = ctx.GetAttr("dim1");
|
||||
dim1_ = (dim1 == nullptr) ? 0 : (dim1->GetInt());
|
||||
AttrValue *dim2 = ctx.GetAttr("dim2");
|
||||
dim2_ = (dim2 == nullptr) ? 1 : (dim2->GetInt());
|
||||
int64_t min_d = -input_size;
|
||||
int64_t max_d = input_size - 1;
|
||||
// Check the attr
|
||||
if (dim1_ < min_d || dim1_ > max_d || dim2_ < min_d || dim2_ > max_d) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"[Diagonal]: Dimension out of range (expected to be in range of [%d, "
|
||||
"%d]).",
|
||||
min_d, max_d);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// Represent the dim in uniform standard form and Check the dim
|
||||
dim1_ = maybe_wrap_dim(dim1_, input_size);
|
||||
dim2_ = maybe_wrap_dim(dim2_, input_size);
|
||||
if (dim1_ == dim2_) {
|
||||
KERNEL_LOG_ERROR("[Diagonal]:Diagonal dimensions cannot be identical.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto data_type = input_x->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoComputeType<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoComputeType<double>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Diagonal]: Diagonal kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiagonal, DiagonalCpuKernel);
|
||||
} // namespace aicpu
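
Most of the Diagonal bookkeeping above is stride arithmetic: construct_stride builds row-major strides from a shape, construct_index decomposes a flat task id into per-dimension coordinates, and mul_sum of coordinates with strides maps coordinates back to a flat offset. A self-contained sketch of that round trip (standalone C++ with illustrative helper names, not the kernel helpers themselves):

#include <cstdint>
#include <vector>

// Row-major strides: the last dimension is contiguous.
std::vector<int64_t> StridesOf(const std::vector<int64_t> &shape) {
  std::vector<int64_t> stride(shape.size(), 1);
  for (size_t i = shape.size(); i > 1; --i) {
    stride[i - 2] = stride[i - 1] * shape[i - 1];
  }
  return stride;
}

// Coordinates -> flat offset (what mul_sum(p, stride) computes).
int64_t FlatOffset(const std::vector<int64_t> &coords, const std::vector<int64_t> &stride) {
  int64_t off = 0;
  for (size_t i = 0; i < coords.size(); ++i) off += coords[i] * stride[i];
  return off;
}

// Flat offset -> coordinates (what construct_index computes).
std::vector<int64_t> CoordsOf(int64_t flat, const std::vector<int64_t> &stride) {
  std::vector<int64_t> coords;
  for (int64_t s : stride) {
    coords.push_back(flat / s);
    flat %= s;
  }
  return coords;  // CoordsOf(FlatOffset(c, s), s) == c for valid coordinates
}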
|
|
@@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <array>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class DiagonalCpuKernel final : public CpuKernel {
|
||||
public:
|
||||
DiagonalCpuKernel() = default;
|
||||
~DiagonalCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoComputeType(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void set_output(int64_t *ar, T *dptr, T *y_dptr);
|
||||
|
||||
private:
|
||||
int64_t offset_ = 0;
|
||||
int64_t dim1_ = 0;
|
||||
int64_t dim2_ = 1;
|
||||
int64_t dsize = 0;
|
||||
};
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,95 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "eig.h"
|
||||
#include <Eigen/Dense>
|
||||
#include <Eigen/Eigenvalues>
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
const char *kEig = "Eig";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t EigCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Eig check input and output failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto input_dtype = static_cast<DataType>(input->GetDataType());
|
||||
switch (input_dtype) {
|
||||
case DT_FLOAT:
|
||||
return ComputeKernel<float, std::complex<float>>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeKernel<double, std::complex<double>>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeKernel<std::complex<float>, std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeKernel<std::complex<double>, std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Eig kernel data type [%s] not support.", DTypeStr(input_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kEig, EigCpuKernel);
|
||||
|
||||
template <typename T, typename C>
|
||||
uint32_t EigCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
auto xptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto valptr = reinterpret_cast<C *>(ctx.Output(0)->GetData());
|
||||
auto vecptr = reinterpret_cast<C *>(ctx.Output(1)->GetData());
|
||||
std::vector<int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
int64_t rank = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
int64_t x_dim = ctx.Input(0)->GetTensorShape()->GetDimSize(rank - 1);
|
||||
int64_t batch_size = 1;
|
||||
if (rank > 2) {
|
||||
for (int64_t i = 0; i < rank - 2; i++) {
|
||||
batch_size *= dims[i];
|
||||
}
|
||||
}
|
||||
AttrValue *compute_v = ctx.GetAttr("compute_v");
|
||||
bool compute_v_ = (compute_v == nullptr) ? false : compute_v->GetBool();
|
||||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(x_dim, x_dim);
|
||||
for (int64_t k = 0; k < batch_size; k++) {
|
||||
for (int64_t i = 0; i < x_dim * x_dim; i++) {
|
||||
A.data()[i] = xptr[k * x_dim * x_dim + i];
|
||||
}
|
||||
if (!compute_v_) {
|
||||
Eigen::ComplexEigenSolver<Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A, false);
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
|
||||
for (int64_t i = 0; i < x_dim; i++) {
|
||||
valptr[k * x_dim + i] = D.data()[i];
|
||||
}
|
||||
} else {
|
||||
Eigen::ComplexEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A);
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> V = es.eigenvectors();
|
||||
for (int64_t i = 0; i < x_dim; i++) {
|
||||
valptr[k * x_dim + i] = D.data()[i];
|
||||
}
|
||||
for (int64_t i = 0; i < x_dim * x_dim; i++) {
|
||||
vecptr[k * x_dim * x_dim + i] = V.data()[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
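
The per-matrix step inside ComputeKernel is a direct use of Eigen's dense eigensolver: load one row-major x_dim by x_dim block, run Eigen::ComplexEigenSolver, and copy out eigenvalues, plus eigenvectors when the compute_v attribute is set. A minimal single-matrix sketch, assuming Eigen3 is available and using a hand-filled 2x2 complex matrix:

#include <Eigen/Dense>
#include <Eigen/Eigenvalues>
#include <complex>
#include <iostream>

int main() {
  using CMat = Eigen::Matrix<std::complex<float>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  CMat a(2, 2);
  a << std::complex<float>(0, 0), std::complex<float>(1, 0),
       std::complex<float>(-2, 0), std::complex<float>(-3, 0);

  bool compute_v = true;  // mirrors the "compute_v" attribute
  Eigen::ComplexEigenSolver<CMat> es(a, compute_v);
  std::cout << "eigenvalues:\n" << es.eigenvalues() << "\n";
  if (compute_v) {
    std::cout << "eigenvectors:\n" << es.eigenvectors() << "\n";
  }
  return 0;
}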
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_EIG_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_EIG_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class EigCpuKernel : public CpuKernel {
|
||||
public:
|
||||
EigCpuKernel() = default;
|
||||
~EigCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename C>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,114 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "eye.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kEye = "Eye";
|
||||
|
||||
#define EYE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = EyePartCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Eye kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t EyeCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
|
||||
auto data_type = ctx.Output(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
EYE_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
EYE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
EYE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_COMPLEX64, std::complex<std::float_t>, ctx)
|
||||
EYE_COMPUTE_CASE(DT_COMPLEX128, std::complex<std::double_t>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Eye kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t EyeCpuKernel::EyePartCompute(CpuKernelContext &ctx) {
|
||||
int64_t num_rows_value1 = 0;
|
||||
int64_t num_columns_value = -1;
|
||||
int64_t dim_value = 1;
|
||||
int32_t out_size_size = 0;
|
||||
AttrValue *num_rows = ctx.GetAttr("num_rows");
|
||||
KERNEL_CHECK_NULLPTR(num_rows, KERNEL_STATUS_PARAM_INVALID, "get num_rows failed.");
|
||||
num_rows_value1 = num_rows->GetInt();
|
||||
int64_t min_value = num_rows_value1;
|
||||
int64_t max_value = -1;
|
||||
int64_t num_col = num_rows_value1;
|
||||
AttrValue *num_columns = ctx.GetAttr("num_columns");
|
||||
if (num_columns) {
|
||||
num_columns_value = num_columns->GetInt();
|
||||
min_value = num_columns_value < num_rows_value1 ? num_columns_value : num_rows_value1;
|
||||
max_value = num_columns_value > num_rows_value1 ? num_columns_value : num_rows_value1;
|
||||
num_col = num_columns_value;
|
||||
}
|
||||
if (max_value == -1) {
|
||||
max_value = num_rows_value1;
|
||||
}
|
||||
AttrValue *batch_shape = ctx.GetAttr("batch_shape");
|
||||
if (batch_shape) {
|
||||
std::vector<int64_t> output_size = ctx.GetAttr("batch_shape")->GetListInt();
|
||||
out_size_size = output_size.size();
|
||||
int64_t batch_shape_value = 1;
|
||||
for (int32_t t = 0; t < out_size_size; t++) {
|
||||
batch_shape_value = output_size[t];
|
||||
dim_value = dim_value * batch_shape_value;
|
||||
}
|
||||
}
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
Tensor *y = ctx.Output(0);
|
||||
auto y_addr = y->GetData();
|
||||
memset(y_addr, 0, data_size);
|
||||
T num = static_cast<T>(1);
|
||||
int64_t block_size = min_value * max_value;
|
||||
for (int64_t dim = 0; dim < dim_value; dim++) {
|
||||
for (int64_t i = 0; i < min_value; i++) {
|
||||
*(output_y + (dim * block_size) + (num_col + 1) * i) = num;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kEye, EyeCpuKernel);
|
||||
} // namespace aicpu
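
EyePartCompute above reduces to a simple fill pattern: zero the whole output, then for each batch block of num_rows * num_columns elements write a one every (num_columns + 1) positions, min(num_rows, num_columns) times. A standalone sketch of that pattern over a plain buffer (illustrative helper, not the AICPU Tensor API):

#include <cstdint>
#include <vector>

std::vector<float> EyeSketch(int64_t batch, int64_t rows, int64_t cols) {
  std::vector<float> out(batch * rows * cols, 0.0f);   // matches the memset-to-zero step
  const int64_t diag_len = rows < cols ? rows : cols;  // min_value in the kernel
  const int64_t block = rows * cols;                   // block_size in the kernel
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t i = 0; i < diag_len; ++i) {
      out[b * block + (cols + 1) * i] = 1.0f;          // walk the main diagonal of each block
    }
  }
  return out;
}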
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_EYE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_EYE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class EyeCpuKernel : public CpuKernel {
|
||||
public:
|
||||
EyeCpuKernel() = default;
|
||||
~EyeCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t EyePartCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,294 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_avg_pool.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalAvgPool = "FractionalAvgPool";
|
||||
const uint32_t k_InputNum = 1;
|
||||
const uint32_t k_OutputNum = 3;
|
||||
const int64_t kParallelDataNum = 1024 * 1024;
|
||||
constexpr uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalAvgPoolCpuKernel::FractionalAvgPoolParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalAvgPool Check input and output number failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
|
||||
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto input_shape = input->GetTensorShape();
|
||||
int32_t input_dims = input_shape->GetDims();
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool: expected input to have non-empty spatial "
|
||||
"dimensions, "
|
||||
"but input has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"tensor_in must be 4-dimensional.");
|
||||
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
|
||||
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
|
||||
kFractionalAvgPool);
|
||||
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"pooling_ratio field must specify 4 dimensions.");
|
||||
std::vector<float> pooling_ratio_data = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool is not yet supported on the batch nor channel "
|
||||
"dimension.The first and last elements of pooling ratio must be 1.0.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int seed) {
|
||||
// generate a random offset u in [0, max_u)
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
std::vector<int64_t> diff(output_length, 0);
|
||||
double alpha = static_cast<double>(input_length) / output_length;
|
||||
int k = input_length / output_length;
|
||||
double u_max1 = (k + 2) / alpha - 1;
|
||||
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
|
||||
double max_u = std::min(u_max1, u_max2);
|
||||
std::default_random_engine random(seed);
|
||||
std::uniform_real_distribution<double> dis2(0.0, 1.0);
|
||||
const double u = dis2(random) * max_u;
|
||||
cum_seq[0] = 1;
|
||||
cum_seq[output_length] = input_length + 1;
|
||||
for (int i = 1; i < output_length; ++i) {
|
||||
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
|
||||
}
|
||||
for (int i = 0; i < output_length; ++i) {
|
||||
diff[i] = cum_seq[i + 1] - cum_seq[i];
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int seed) {
|
||||
int k = input_length / output_length;
|
||||
int num_random_spot = input_length % output_length;
|
||||
std::vector<int64_t> diff(output_length, k);
|
||||
for (int i = 0; i < num_random_spot; ++i) {
|
||||
diff[i] += 1;
|
||||
}
|
||||
std::srand(seed);
|
||||
random_shuffle(diff.begin(), diff.end());
|
||||
return diff;
|
||||
}
|
||||
|
||||
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int seed) {
|
||||
std::vector<int64_t> diff;
|
||||
if (input_length % output_length == 0) {
|
||||
diff = std::vector<int64_t>(output_length, input_length / output_length);
|
||||
} else if (pseudo_random) {
|
||||
diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
|
||||
} else {
|
||||
diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
|
||||
}
|
||||
int k = input_length / output_length;
|
||||
for (int i = 0; i < output_length; i++) {
|
||||
if (diff[i] < k || diff[i] > k + 1) {
|
||||
KERNEL_LOG_ERROR("FractionalAvgPool kernel GeneratePoolingSequence diff[%d] is error");
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
for (size_t i = 1; i < cum_seq.size(); ++i) {
|
||||
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
|
||||
}
|
||||
return cum_seq;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalAvgPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
Tensor *row_pooling_sequence = ctx.Output(1);
|
||||
Tensor *col_pooling_sequence = ctx.Output(2);
|
||||
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
|
||||
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
|
||||
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
|
||||
AttrValue *seed_ = ctx.GetAttr("seed");
|
||||
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
|
||||
AttrValue *seed2_ = ctx.GetAttr("seed2");
|
||||
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
|
||||
auto input_shape = input->GetTensorShape();
|
||||
std::vector<int> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int> output_size(tensor_in_and_out_dims);
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = input_shape->GetDimSize(i);
|
||||
}
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
|
||||
KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool kernel outputsize[%d] cannot be 0");
|
||||
}
|
||||
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
if (deterministic) {
|
||||
// If both seeds are not set when deterministic is true, force set seeds.
|
||||
if ((seed == 0) && (seed2 == 0)) {
|
||||
seed = generator();
|
||||
seed2 = generator();
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Both seed and seed2 should be 0 if deterministic is false.");
|
||||
}
|
||||
if (seed == 0 && seed2 != 0) {
|
||||
seed = seed2;
|
||||
}
|
||||
// Generate pooling sequence.
|
||||
std::vector<int64_t> height_cum_seq;
|
||||
std::vector<int64_t> width_cum_seq;
|
||||
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
|
||||
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
|
||||
for (uint32_t i = 0; i < height_cum_seq.size(); ++i) {
|
||||
*(output_height_seq_tensor + i) = height_cum_seq[i];
|
||||
}
|
||||
for (uint32_t i = 0; i < width_cum_seq.size(); ++i) {
|
||||
*(output_width_seq_tensor + i) = width_cum_seq[i];
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
const int64_t depth_max = input_size[3] - 1;
|
||||
uint64_t data_num = input->NumElements();
|
||||
/**
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
|
||||
if (data_num < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (uint32_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with 0.
|
||||
T sum = static_cast<T>(0);
|
||||
T avg = static_cast<T>(0);
|
||||
int count = 0;
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
sum += input_data[in_offset];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
avg = sum / static_cast<T>(count);
|
||||
*(output_data + out_offset) = avg;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_cum_len = height_cum_seq.size() - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_cum_len) {
|
||||
max_core_num = height_cum_len;
|
||||
}
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalavgpool_index = [&](size_t start, size_t end) {
|
||||
for (uint32_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with 0.
|
||||
T sum = static_cast<T>(0);
|
||||
T avg = static_cast<T>(0);
|
||||
int count = 0;
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
sum += input_data[in_offset];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
avg = sum / static_cast<T>(count);
|
||||
*(output_data + out_offset) = avg;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
|
||||
sharder_fractionalavgpool_index),
|
||||
"FractionalAvgPool Index Compute failed");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalAvgPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalAvgPoolParamCheck(ctx), "Check FractionalAvgPool params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalAvgPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalAvgPool, FractionalAvgPoolCpuKernel);
|
||||
} // namespace aicpu
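
GeneratePoolingSequence above returns cumulative boundaries (output_length + 1 entries): output cell i covers input rows [cum_seq[i], cum_seq[i+1]), extended by one row when the overlapping attribute is set, and the kernel averages over that window. A 1-D sketch of how those boundaries are consumed, using a fixed diff vector instead of the random generators (standalone C++, illustrative names):

#include <cstdint>
#include <vector>

// Turn per-cell widths into cumulative boundaries, as the tail of GeneratePoolingSequence does.
std::vector<int64_t> CumSeqFromDiff(const std::vector<int64_t> &diff) {
  std::vector<int64_t> cum_seq(diff.size() + 1, 0);
  for (size_t i = 1; i < cum_seq.size(); ++i) cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
  return cum_seq;
}

// Average each pooling cell of a 1-D row, mirroring the height/width loops above.
std::vector<double> PoolCells1D(const std::vector<double> &in, const std::vector<int64_t> &cum_seq,
                                bool overlapping) {
  std::vector<double> out(cum_seq.size() - 1, 0.0);
  const int64_t in_max = static_cast<int64_t>(in.size()) - 1;
  for (size_t i = 0; i + 1 < cum_seq.size(); ++i) {
    const int64_t start = cum_seq[i];
    int64_t end = overlapping ? cum_seq[i + 1] : cum_seq[i + 1] - 1;
    end = end < in_max ? end : in_max;
    double sum = 0.0;
    int64_t count = 0;
    for (int64_t r = start; r <= end; ++r) {
      sum += in[r];
      ++count;
    }
    out[i] = count > 0 ? sum / static_cast<double>(count) : 0.0;
  }
  return out;
}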
|
|
@@ -0,0 +1,20 @@
|
|||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalAvgPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalAvgPoolCpuKernel() = default;
|
||||
~FractionalAvgPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalAvgPoolParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
|
@@ -0,0 +1,208 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_avg_pool_grad.h"
|
||||
|
||||
#include <iostream>
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalAvgPoolGrad = "FractionalAvgPoolGrad";
|
||||
const uint32_t k_InputNum = 4;
|
||||
const uint32_t k_OutputNum = 1;
|
||||
const int64_t kParallelDataNum = 32 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalAvgPoolGrad check input and output number failed.");
|
||||
Tensor *orig_input_tensor_shape = ctx.Input(0);
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto orig_input_shape = orig_input_tensor_shape->GetTensorShape();
|
||||
int32_t orig_input_dims = orig_input_shape->GetDims();
|
||||
int32_t orig_input_shape_nums = orig_input_tensor_shape->NumElements();
|
||||
if (out_backprop->GetDataType() != output->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the output [%s] need be the same as the out_backprop "
|
||||
"[%s]",
|
||||
DTypeStr(output->GetDataType()).c_str(), DTypeStr(out_backprop->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((orig_input_dims == 1 && orig_input_shape_nums == 4), KERNEL_STATUS_PARAM_INVALID,
|
||||
"original input tensor shape must be 1-dimensional and 4 elements.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>> EigenDoubleMatrixMap;
|
||||
const Tensor *orig_input_tensor_shape = ctx.Input(0);
|
||||
const Tensor *out_backprop = ctx.Input(1);
|
||||
const Tensor *row_pooling_sequence = ctx.Input(2);
|
||||
const Tensor *col_pooling_sequence = ctx.Input(3);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
int32_t row_seq_nums = row_pooling_sequence->NumElements();
|
||||
int32_t col_seq_nums = col_pooling_sequence->NumElements();
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
const int64_t out_batch = out_backprop_shape->GetDimSize(0);
|
||||
const int64_t out_rows = out_backprop_shape->GetDimSize(1);
|
||||
const int64_t out_cols = out_backprop_shape->GetDimSize(2);
|
||||
const int64_t out_depth = out_backprop_shape->GetDimSize(3);
|
||||
KERNEL_CHECK_FALSE((row_seq_nums > out_rows), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Given out_backprop shape [%ld,%ld,%ld,%ld], row_seq_tensor must"
|
||||
" have at least [%ld] elements, but got[%ld].",
|
||||
out_batch, out_rows, out_cols, out_depth, out_rows + 1, row_seq_nums);
|
||||
KERNEL_CHECK_FALSE((col_seq_nums > out_cols), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Given out_backprop shape [%ld,%ld,%ld,%ld], col_seq_tensor must"
|
||||
" have at least [%ld] elements, but got[%ld].",
|
||||
out_batch, out_rows, out_cols, out_depth, out_cols + 1, col_seq_nums);
|
||||
auto row_seq_data = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto col_seq_data = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
auto orig_input_tensor_shape_data = static_cast<int64_t *>(orig_input_tensor_shape->GetData());
|
||||
const int64_t in_batch = *(orig_input_tensor_shape_data);
|
||||
const int64_t in_rows = *(orig_input_tensor_shape_data + 1);
|
||||
const int64_t in_cols = *(orig_input_tensor_shape_data + 2);
|
||||
const int64_t in_depth = *(orig_input_tensor_shape_data + 3);
|
||||
int32_t input_nums = orig_input_tensor_shape->NumElements();
|
||||
std::vector<int64_t> out_put_dims;
|
||||
for (int i = 0; i < input_nums; i++) {
|
||||
KERNEL_CHECK_FALSE((*(orig_input_tensor_shape_data + i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Each dimension of input must be > 0.");
|
||||
out_put_dims.push_back(orig_input_tensor_shape_data[i]);
|
||||
}
|
||||
int64_t output_nums = in_batch * in_rows * in_cols * in_depth;
|
||||
// Create intermediate in_backprop.
|
||||
std::vector<double> in_backprop_tensor_temp(output_nums);
|
||||
for (int64_t i = 0; i < output_nums; i++) {
|
||||
in_backprop_tensor_temp[i] = 0;
|
||||
*(output_data + i) = 0;
|
||||
}
|
||||
EigenDoubleMatrixMap in_backprop_tensor_temp_mat(in_backprop_tensor_temp.data(), in_depth,
|
||||
in_cols * in_rows * in_batch);
|
||||
ConstEigenMatrixMap out_backprop_mat(reinterpret_cast<T *>(out_backprop->GetData()), out_depth,
|
||||
out_cols * out_rows * out_batch);
|
||||
// Loop through each element of out_backprop and evenly distribute the
|
||||
// element to the corresponding pooling cell.
|
||||
const int64_t in_max_row_index = in_rows - 1;
|
||||
const int64_t in_max_col_index = in_cols - 1;
|
||||
if (output_nums < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < out_batch; ++b) {
|
||||
for (int64_t r = 0; r < out_rows; ++r) {
|
||||
const int64_t in_row_start = *(row_seq_data + r);
|
||||
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
|
||||
in_row_end = std::min(in_row_end, in_max_row_index);
|
||||
for (int64_t c = 0; c < out_cols; ++c) {
|
||||
const int64_t in_col_start = *(col_seq_data + c);
|
||||
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
|
||||
in_col_end = std::min(in_col_end, in_max_col_index);
|
||||
const int64_t num_elements_in_pooling_cell =
|
||||
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
|
||||
const int64_t out_index = (b * out_rows + r) * out_cols + c;
|
||||
// Now we can evenly distribute out_backprop(b, h, w, *) to
|
||||
// in_backprop(b, hs:he, ws:we, *).
|
||||
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
|
||||
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
|
||||
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < out_depth; ++d) {
|
||||
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
|
||||
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
|
||||
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t row_len = out_rows;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > row_len) {
|
||||
max_core_num = row_len;
|
||||
}
|
||||
for (int64_t b = 0; b < out_batch; ++b) {
|
||||
auto sharder_fractionalavgpoolgrad_index = [&](size_t start, size_t end) {
|
||||
for (size_t r = start; r < end; ++r) {
|
||||
const int64_t in_row_start = *(row_seq_data + r);
|
||||
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
|
||||
in_row_end = std::min(in_row_end, in_max_row_index);
|
||||
for (int64_t c = 0; c < out_cols; ++c) {
|
||||
const int64_t in_col_start = *(col_seq_data + c);
|
||||
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
|
||||
in_col_end = std::min(in_col_end, in_max_col_index);
|
||||
const int64_t num_elements_in_pooling_cell =
|
||||
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
|
||||
const int64_t out_index = (b * out_rows + r) * out_cols + c;
|
||||
// Now we can evenly distribute out_backprop(b, h, w, *) to
|
||||
// in_backprop(b, hs:he, ws:we, *).
|
||||
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
|
||||
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
|
||||
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < out_depth; ++d) {
|
||||
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
|
||||
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
|
||||
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, row_len, row_len / max_core_num, sharder_fractionalavgpoolgrad_index),
|
||||
"FractionalAvgPoolGrad Index Compute failed.");
|
||||
}
|
||||
}
|
||||
// Depending on the type, cast double to type T.
|
||||
for (int64_t i = 0; i < output_nums; ++i) {
|
||||
*(output_data + i) = static_cast<T>(in_backprop_tensor_temp[i]);
|
||||
}
|
||||
output->GetTensorShape()->SetDimSizes(out_put_dims);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalAvgPoolGradParamCheck(ctx), "Check FractionalAvgPoolGrad params failed.");
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
auto data_type = out_backprop->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalAvgPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalAvgPoolGrad, FractionalAvgPoolGradCpuKernel);
|
||||
} // namespace aicpu
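
The grad kernel above inverts the averaging step: each out_backprop element is split evenly over the input positions of its pooling cell, accumulated into a double-precision temporary, and only cast back to T at the end so repeated additions do not lose precision. A 1-D, single-batch, single-channel sketch of that redistribution (standalone C++, illustrative names):

#include <cstdint>
#include <vector>

std::vector<float> AvgPoolGrad1D(const std::vector<float> &out_grad,
                                 const std::vector<int64_t> &cum_seq,  // cell boundaries
                                 int64_t in_len, bool overlapping) {
  std::vector<double> in_grad(in_len, 0.0);  // plays the role of in_backprop_tensor_temp
  const int64_t in_max = in_len - 1;
  for (size_t i = 0; i + 1 < cum_seq.size() && i < out_grad.size(); ++i) {
    const int64_t start = cum_seq[i];
    int64_t end = overlapping ? cum_seq[i + 1] : cum_seq[i + 1] - 1;
    end = end < in_max ? end : in_max;
    const int64_t cell = end - start + 1;
    if (cell <= 0) continue;
    for (int64_t r = start; r <= end; ++r) {
      in_grad[r] += static_cast<double>(out_grad[i]) / static_cast<double>(cell);
    }
  }
  return std::vector<float>(in_grad.begin(), in_grad.end());  // final cast back to the output type
}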
|
|
@@ -0,0 +1,20 @@
|
|||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalAvgPoolGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalAvgPoolGradCpuKernel() = default;
|
||||
~FractionalAvgPoolGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
|
@@ -0,0 +1,285 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_max_pool.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalMaxPool = "FractionalMaxPool";
|
||||
const uint32_t k_InputNum = 1;
|
||||
const uint32_t k_OutputNum = 3;
|
||||
const int64_t kParallelDataNum = 1024 * 1024;
|
||||
const uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolCpuKernel::FractionalMaxPoolParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalMaxPool Check input and output number failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
|
||||
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto input_shape = input->GetTensorShape();
|
||||
int32_t input_dims = input_shape->GetDims();
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalMaxPool: expected input to have non-empty spatial "
|
||||
"dimensions, "
|
||||
"but input has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"tensor_in must be 4-dimensional.");
|
||||
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
|
||||
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
|
||||
kFractionalMaxPool);
|
||||
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The size of pooling_ratio must be 4, but got [%d].", pooling_ratio_size);
|
||||
std::vector<float> pooling_ratio_data = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalMaxPool is not yet supported on the batch nor channel "
|
||||
"dimension.The first and last elements of pooling ratio must be 1.0.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int64_t seed) {
|
||||
// generate a random offset u in [0, max_u)
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
std::vector<int64_t> diff(output_length, 0);
|
||||
double alpha = static_cast<double>(input_length) / output_length;
|
||||
int k = input_length / output_length;
|
||||
double u_max1 = (k + 2) / alpha - 1;
|
||||
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
|
||||
double max_u = std::min(u_max1, u_max2);
|
||||
std::default_random_engine random(seed);
|
||||
std::uniform_real_distribution<double> dis2(0.0, 1.0);
|
||||
const double u = dis2(random) * max_u;
|
||||
cum_seq[0] = 1;
|
||||
cum_seq[output_length] = input_length + 1;
|
||||
for (int i = 1; i < output_length; ++i) {
|
||||
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
|
||||
}
|
||||
for (int i = 0; i < output_length; ++i) {
|
||||
diff[i] = cum_seq[i + 1] - cum_seq[i];
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int64_t seed) {
|
||||
int k = input_length / output_length;
|
||||
int num_random_spot = input_length % output_length;
|
||||
std::vector<int64_t> diff(output_length, k);
|
||||
for (int i = 0; i < num_random_spot; ++i) {
|
||||
diff[i] += 1;
|
||||
}
|
||||
std::srand(seed);
|
||||
random_shuffle(diff.begin(), diff.end());
|
||||
return diff;
|
||||
}
|
||||
|
||||
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int64_t seed) {
|
||||
std::vector<int64_t> diff;
|
||||
if (input_length % output_length == 0) {
|
||||
diff = std::vector<int64_t>(output_length, input_length / output_length);
|
||||
} else if (pseudo_random) {
|
||||
diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
|
||||
} else {
|
||||
diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
|
||||
}
|
||||
int k = input_length / output_length;
|
||||
for (int i = 0; i < output_length; i++) {
|
||||
if (diff[i] < k || diff[i] > k + 1) {
|
||||
KERNEL_LOG_ERROR("FractionalMaxPool kernel GeneratePoolingSequence diff[%d] is error");
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
for (size_t i = 1; i < cum_seq.size(); ++i) {
|
||||
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
|
||||
}
|
||||
return cum_seq;
|
||||
}
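// Usage sketch (illustrative, mirroring how DoCompute consumes this helper below): the returned
// cumulative sequence starts at 0 and ends at input_length, e.g.
//   std::vector<int64_t> rows = GeneratePoolingSequence(10, 4, /*pseudo_random=*/false, /*seed=*/1);
//   // rows might be {0, 3, 5, 8, 10}; with overlapping=false, row region i spans [rows[i], rows[i + 1]).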
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
Tensor *row_pooling_sequence = ctx.Output(1);
|
||||
Tensor *col_pooling_sequence = ctx.Output(2);
|
||||
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
|
||||
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
|
||||
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
|
||||
AttrValue *seed_ = ctx.GetAttr("seed");
|
||||
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
|
||||
AttrValue *seed2_ = ctx.GetAttr("seed2");
|
||||
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
|
||||
auto input_shape = input->GetTensorShape();
|
||||
std::vector<int> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int> output_size(tensor_in_and_out_dims);
|
||||
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = input_shape->GetDimSize(i);
|
||||
}
|
||||
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
|
||||
KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
"FractionalMaxPool kernel output_size[%d] must be greater than 0.", i);
|
||||
}
|
||||
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
if (deterministic) {
|
||||
// If both seeds are not set when deterministic is true, force set seeds.
|
||||
if ((seed == 0) && (seed2 == 0)) {
|
||||
seed = generator();
|
||||
seed2 = generator();
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Both seed and seed2 should be 0 if deterministic is false.");
|
||||
}
|
||||
if (seed == 0 && seed2 != 0) {
|
||||
seed = seed2;
|
||||
}
|
||||
// Generate pooling sequence.
|
||||
std::vector<int64_t> height_cum_seq;
|
||||
std::vector<int64_t> width_cum_seq;
|
||||
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
|
||||
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
|
||||
for (size_t i = 0; i < height_cum_seq.size(); ++i) {
|
||||
*(output_height_seq_tensor + i) = height_cum_seq[i];
|
||||
}
|
||||
for (size_t i = 0; i < width_cum_seq.size(); ++i) {
|
||||
*(output_width_seq_tensor + i) = width_cum_seq[i];
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
const int64_t depth_max = input_size[3] - 1;
|
||||
uint64_t data_num = input->NumElements();
|
||||
/**
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
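/**
 * Offset sketch (illustrative numbers, not from the kernel): with output_size = {N, 4, 4, 3},
 * the flat NHWC offset used below is ((b * 4 + hs) * 4 + ws) * 3 + c, so b = 1, hs = 2, ws = 0,
 * c = 1 lands at element 73 of the output buffer; in_offset is formed the same way from the
 * input height/width indices.
 */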
|
||||
if (data_num < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (size_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with MIN<T>.
|
||||
T max = std::numeric_limits<T>::lowest();
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
max = max > input_data[in_offset] ? max : input_data[in_offset];
|
||||
}
|
||||
}
|
||||
*(output_data + out_offset) = max;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_cum_len = height_cum_seq.size() - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_cum_len) {
|
||||
max_core_num = height_cum_len;
|
||||
}
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalmaxpool_index = [&](size_t start, size_t end) {
|
||||
for (size_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with MIN<T>.
|
||||
T max = std::numeric_limits<T>::lowest();
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
max = max > input_data[in_offset] ? max : input_data[in_offset];
|
||||
}
|
||||
}
|
||||
*(output_data + out_offset) = max;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
|
||||
sharder_fractionalmaxpool_index),
|
||||
"FractionalMaxPool Index Compute failed");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalMaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalMaxPoolParamCheck(ctx), "FractionalMaxPool check params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalMaxPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPool, FractionalMaxPoolCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolCpuKernel() = default;
|
||||
~FractionalMaxPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalMaxPoolParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
@@ -0,0 +1,242 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "fractional_max_pool_grad.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalMaxPoolGrad = "FractionalMaxPoolGrad";
|
||||
const uint32_t k_InputNum = 5;
|
||||
const uint32_t k_OutputNum = 1;
|
||||
static const int kInvalidMaxPoolingIndex = -1;
|
||||
const int64_t kParallelDataNum = 32 * 1024;
|
||||
const uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalMaxPoolGrad check input and output number failed.");
|
||||
Tensor *orig_input = ctx.Input(0);
|
||||
Tensor *orig_output = ctx.Input(1);
|
||||
Tensor *out_backprop = ctx.Input(2);
|
||||
auto orig_input_shape = orig_input->GetTensorShape();
|
||||
int32_t orig_input_dims = orig_input_shape->GetDims();
|
||||
auto orig_output_shape = orig_output->GetTensorShape();
|
||||
int32_t orig_output_dims = orig_output_shape->GetDims();
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
int32_t out_backprop_dims = out_backprop_shape->GetDims();
|
||||
if (orig_input->GetDataType() != orig_output->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the orig_output [%s] need be the same as the "
|
||||
"orig_input [%s].",
|
||||
DTypeStr(orig_output->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (orig_input->GetDataType() != out_backprop->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the out_backprop [%s] need be the same as the "
|
||||
"orig_input [%s].",
|
||||
DTypeStr(out_backprop->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((orig_input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"orig_input should be a tensor of rank 4.");
|
||||
KERNEL_CHECK_FALSE((orig_output_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"orig_output should be a tensor of rank 4.");
|
||||
KERNEL_CHECK_FALSE((out_backprop_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"out_backprop should be a tensor of rank 4.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<int64_t, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;
|
||||
const Tensor *tensor_in = ctx.Input(0);
|
||||
const Tensor *tensor_out = ctx.Input(1);
|
||||
const Tensor *out_backprop = ctx.Input(2);
|
||||
const Tensor *height_seq_tensor = ctx.Input(3);
|
||||
const Tensor *width_seq_tensor = ctx.Input(4);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
auto tensor_in_shape = tensor_in->GetTensorShape();
|
||||
auto tensor_out_shape = tensor_out->GetTensorShape();
|
||||
std::vector<int64_t> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int64_t> output_size(tensor_in_and_out_dims);
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = tensor_in_shape->GetDimSize(i);
|
||||
}
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = tensor_out_shape->GetDimSize(i);
|
||||
}
|
||||
int64_t tensor_in_num = tensor_in->NumElements();
|
||||
int64_t tensor_out_num = tensor_out->NumElements();
|
||||
std::vector<T> tensor_out_dup(tensor_out_num);
|
||||
std::vector<int64_t> tensor_out_arg_max(tensor_out_num);
|
||||
for (int i = 0; i < tensor_out_num; i++) {
|
||||
tensor_out_dup[i] = std::numeric_limits<T>::lowest();
|
||||
tensor_out_arg_max[i] = -1;
|
||||
}
|
||||
// Find arg_max for each tensor_out
|
||||
ConstEigenMatrixMap tensor_in_mat(reinterpret_cast<T *>(tensor_in->GetData()), input_size[3],
|
||||
input_size[2] * input_size[1] * input_size[0]);
|
||||
EigenMatrixMap tensor_out_dup_mat(tensor_out_dup.data(), output_size[3],
|
||||
output_size[2] * output_size[1] * output_size[0]);
|
||||
EigenIndexMatrixMap tensor_out_arg_max_mat(tensor_out_arg_max.data(), output_size[3],
|
||||
output_size[2] * output_size[1] * output_size[0]);
|
||||
auto height_seq_tensor_shape = height_seq_tensor->GetTensorShape();
|
||||
auto width_seq_tensor_shape = width_seq_tensor->GetTensorShape();
|
||||
auto height_seq_tensor_data = static_cast<int64_t *>(height_seq_tensor->GetData());
|
||||
auto width_seq_tensor_data = static_cast<int64_t *>(width_seq_tensor->GetData());
|
||||
/**
|
||||
* Now walk through the process of fractional max pooling again.
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
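/**
 * Note on the argmax bookkeeping below (a sketch of the existing logic, not new behaviour): for
 * each output cell the code records input_offset = in_index * input_size[3] + d, i.e. the flat
 * NHWC index of the winning input element, so the scatter-add at the end of this function can
 * route each out_backprop value straight back to that element.
 */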
|
||||
if (tensor_in_num < kParallelDataNum) {
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (int64_t hs = 0; hs < height_seq_tensor_shape->GetDimSize(0) - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = *(height_seq_tensor_data + hs);
|
||||
int64_t height_end = overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
|
||||
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
|
||||
// width start and end.
|
||||
const int64_t width_start = *(width_seq_tensor_data + ws);
|
||||
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < input_size[3]; ++d) {
|
||||
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
|
||||
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
|
||||
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
|
||||
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
|
||||
output_ref = input_ref;
|
||||
int input_offset = in_index * input_size[3] + d;
|
||||
out_arg_max_ref = input_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_seq_len = height_seq_tensor_shape->GetDimSize(0) - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_seq_len) {
|
||||
max_core_num = height_seq_len;
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalmaxpoolgrad_index = [&](size_t start, size_t end) {
|
||||
for (size_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = *(height_seq_tensor_data + hs);
|
||||
int64_t height_end =
|
||||
overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
|
||||
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
|
||||
// width start and end.
|
||||
const int64_t width_start = *(width_seq_tensor_data + ws);
|
||||
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < input_size[3]; ++d) {
|
||||
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
|
||||
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
|
||||
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
|
||||
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
|
||||
output_ref = input_ref;
|
||||
int input_offset = in_index * input_size[3] + d;
|
||||
out_arg_max_ref = input_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_seq_len, height_seq_len / max_core_num,
|
||||
sharder_fractionalmaxpoolgrad_index),
|
||||
"FractionalMaxPoolGrad Index Compute failed.");
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < tensor_in_num; i++) {
|
||||
*(output_data + i) = 0;
|
||||
}
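// The loop below scatters the incoming gradient: each out_backprop element is added to the input
// position recorded in tensor_out_arg_max during the forward re-computation above.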
|
||||
auto out_backprop_data = static_cast<T *>(out_backprop->GetData());
|
||||
int num_total_outputs = out_backprop->NumElements();
|
||||
int num_total_inputs = output->NumElements();
|
||||
for (int index = 0; index < num_total_outputs; ++index) {
|
||||
int input_backprop_index = tensor_out_arg_max[index];
|
||||
KERNEL_CHECK_FALSE((input_backprop_index >= 0 && input_backprop_index < num_total_inputs),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid input backprop index:[%d], The maximum number of output is: "
|
||||
"[%d].",
|
||||
input_backprop_index, num_total_inputs);
|
||||
*(output_data + input_backprop_index) += *(out_backprop_data + index);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalMaxPoolGradParamCheck(ctx), "Check FractionalMaxPoolGrad params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalMaxPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPoolGrad, FractionalMaxPoolGradCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolGradCpuKernel() = default;
|
||||
~FractionalMaxPoolGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
@@ -0,0 +1,198 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_max_pool_grad_with_fixed_ksize.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include "Eigen/Dense"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kFractionalMaxPoolGradWithFixedKsize = "FractionalMaxPoolGradWithFixedKsize";
|
||||
constexpr int64_t kParallelDataNums = 128 * 1024;
|
||||
|
||||
#define FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DTYPE, TYPE, OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, \
|
||||
INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = FractionalMaxPoolGradWithFixedKsizeCompute<TYPE>( \
|
||||
OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("FractionalMaxPoolGradWithFixedKsize kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
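// Expansion sketch (assuming DT_FLOAT as an example): the macro above unfolds to a switch case that
// calls FractionalMaxPoolGradWithFixedKsizeCompute<float>(out_backprop, argmax, data_nums, n_size,
// c_size, input_h, input_w, output_h, output_w, ctx), logs and returns on failure, and otherwise
// breaks out of the switch.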
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"FractionalMaxPoolGradWithFixedKsize check input and "
|
||||
"output number failed.");
|
||||
|
||||
Tensor *origin_input = ctx.Input(0);
|
||||
int64_t data_nums = origin_input->NumElements();
|
||||
auto origin_input_shape = origin_input->GetTensorShape();
|
||||
int32_t origin_input_dim = origin_input_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(origin_input_dim == 4, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The dim of input[origin_input] must be 4, but got [%d].", origin_input_dim);
|
||||
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
int32_t out_backprop_dim = out_backprop_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(out_backprop_dim == 4, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The dim of input[out_backprop] must be 4, but got [%d].", out_backprop_dim);
|
||||
Tensor *argmax = ctx.Input(2);
|
||||
auto argmax_shape = argmax->GetTensorShape();
|
||||
int32_t argmax_dim = argmax_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(argmax_dim == 4, KERNEL_STATUS_PARAM_INVALID, "The dim of input[argmax] must be 4, but got [%d].",
|
||||
argmax_dim);
|
||||
std::vector<int64_t> out_backprop_dim_sizes = out_backprop_shape->GetDimSizes();
|
||||
std::vector<int64_t> argmax_dim_sizes = argmax_shape->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(out_backprop_dim_sizes == argmax_dim_sizes, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The shape of input[out_backprop] and input[argmax] must be equal.");
|
||||
int64_t n_size = out_backprop_dim_sizes[0];
|
||||
int64_t c_size = out_backprop_dim_sizes[1];
|
||||
int64_t input_h = out_backprop_dim_sizes[2];
|
||||
int64_t input_w = out_backprop_dim_sizes[3];
|
||||
|
||||
std::vector<int64_t> origin_input_dim_sizes = origin_input_shape->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(origin_input_dim_sizes[0] == n_size, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The first dim of input[origin_input] and "
|
||||
"input[out_backprop] must be equal,"
|
||||
"but got origin_input=[%d] and out_backprop=[%d].",
|
||||
origin_input_dim_sizes[0], n_size);
|
||||
KERNEL_CHECK_FALSE(origin_input_dim_sizes[1] == c_size, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The second dim of input[origin_input] and "
|
||||
"input[out_backprop] must be equal,"
|
||||
"but got origin_input=[%d] and out_backprop=[%d].",
|
||||
origin_input_dim_sizes[1], c_size);
|
||||
int64_t output_h = origin_input_dim_sizes[2];
|
||||
int64_t output_w = origin_input_dim_sizes[3];
|
||||
|
||||
auto data_type = out_backprop->GetDataType();
|
||||
switch (data_type) {
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, out_backprop, argmax, data_nums, n_size,
|
||||
c_size, input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT, float, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_DOUBLE, double, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT32, int32_t, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT64, int64_t, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR(
|
||||
"FractionalMaxPoolGradWithFixedKsize kernel input[out_backprop] type "
|
||||
"[%s] not support.",
|
||||
DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::FractionalMaxPoolGradWithFixedKsizeCompute(
|
||||
Tensor *out_backprop, Tensor *argmax, const int64_t data_nums, const int n_size, const int c_size, const int input_h,
|
||||
const int input_w, const int output_h, const int output_w, CpuKernelContext &ctx) {
|
||||
T *out_backprop_addr = reinterpret_cast<T *>(out_backprop->GetData());
|
||||
int64_t *argmax_addr = reinterpret_cast<int64_t *>(argmax->GetData());
|
||||
|
||||
Tensor *y = ctx.Output(0);
|
||||
T *y_addr = reinterpret_cast<T *>(y->GetData());
|
||||
|
||||
if (data_nums < kParallelDataNums || n_size == 1) {
|
||||
for (int n = 0; n < n_size; n++) {
|
||||
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
|
||||
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
|
||||
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
|
||||
|
||||
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
|
||||
input_h, input_w, output_h, output_w);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > (uint32_t)n_size) {
|
||||
max_core_num = n_size;
|
||||
}
|
||||
auto shared_computeN = [&](size_t start, size_t end) {
|
||||
for (size_t n = start; n < end; n++) {
|
||||
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
|
||||
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
|
||||
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
|
||||
|
||||
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
|
||||
input_h, input_w, output_h, output_w);
|
||||
}
|
||||
};
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, n_size, n_size / max_core_num, shared_computeN);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor shared_computeN failed.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::ComputeSingleBatch(T *out_backprop_single_batch_addr,
|
||||
int64_t *argmax_single_batch_addr,
|
||||
T *y_single_batch_addr, const int c_size,
|
||||
const int input_h, const int input_w,
|
||||
const int output_h, const int output_w) {
|
||||
for (int plane = 0; plane < c_size; plane++) {
|
||||
T *out_backprop_plane_addr = out_backprop_single_batch_addr + plane * input_h * input_w;
|
||||
int64_t *argmax_plane_addr = argmax_single_batch_addr + plane * input_h * input_w;
|
||||
T *y_plane_addr = y_single_batch_addr + plane * output_h * output_w;
|
||||
|
||||
for (int i = 0; i < output_h; i++) {
|
||||
for (int j = 0; j < output_w; j++) {
|
||||
y_plane_addr[i * output_w + j] = static_cast<T>(0);
|
||||
}
|
||||
}
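// In the loop below, argmax_plane_addr[h * input_w + w] is treated as a flat index into the
// output_h * output_w plane of y, so each out_backprop value is scatter-added onto the position
// of its pooling maximum.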
|
||||
|
||||
for (int h = 0; h < input_h; h++) {
|
||||
for (int w = 0; w < input_w; w++) {
|
||||
int input_index = h * input_w + w;
|
||||
KERNEL_CHECK_FALSE((input_index >= 0 && input_index < input_h * input_w), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input_index[%d] out of the length of argmax.", input_index);
|
||||
int output_index = argmax_plane_addr[input_index];
|
||||
KERNEL_CHECK_FALSE((output_index >= 0 && output_index < output_h * output_w), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The output_index[%d] out of the length of y.", output_index);
|
||||
|
||||
y_plane_addr[output_index] += out_backprop_plane_addr[input_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPoolGradWithFixedKsize, FractionalMaxPoolGradWithFixedKsize);
|
||||
} // namespace aicpu
@@ -0,0 +1,42 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
|
||||
|
||||
#include <vector>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolGradWithFixedKsize : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolGradWithFixedKsize() = default;
|
||||
~FractionalMaxPoolGradWithFixedKsize() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsizeCompute(Tensor *out_backprop, Tensor *argmax, const int64_t data_nums,
|
||||
const int n_size, const int c_size, const int input_h,
|
||||
const int input_w, const int output_h, const int output_w,
|
||||
CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSingleBatch(T *out_backprop_single_batch_addr, int64_t *argmax_single_batch_addr,
|
||||
T *y_single_batch_addr, const int c_size, const int input_h, const int input_w,
|
||||
const int output_h, const int output_w);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,160 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "gcd.h"
|
||||
|
||||
#include <set>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kGcdOutputNum = 1;
|
||||
const uint32_t kGcdInputNum = 2;
|
||||
const char *kGcd = "Gcd";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int32_t kInput_32_32 = 3;
|
||||
const int32_t kInput_32_64 = 2;
|
||||
const int32_t kInput_64_32 = 1;
|
||||
const int32_t kInput_64_64 = 0;
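// The kInput_* values encode (x1 is int32) << 1 | (x2 is int32); e.g. an int32 x1 with an int64 x2
// yields 0b10 == kInput_32_64, which is why the mixed cases below promote the output to int64.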
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
// Simple recursive Gcd.
|
||||
template <class T>
|
||||
T elewise_gcd(T a, T b) {
|
||||
if (b == 0) {
|
||||
return a;
|
||||
}
|
||||
return elewise_gcd(b, a % b);
|
||||
}
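// Worked example (illustrative): elewise_gcd(18, 12) recurses as (18, 12) -> (12, 6) -> (6, 0) and
// returns 6; the callers below take std::abs of both operands first, so negative inputs reduce to
// the same case.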
|
||||
|
||||
uint32_t GcdIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
|
||||
Tensor *x1 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *x2 = ctx.Input(kSecondInputIndex);
|
||||
Tensor *y = ctx.Output(kFirstOutputIndex);
|
||||
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
|
||||
auto x1_type = x1->GetDataType();
|
||||
auto x2_type = x2->GetDataType();
|
||||
auto y_type = y->GetDataType();
|
||||
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
|
||||
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
|
||||
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
|
||||
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
|
||||
int32_t _dual_types = x1_is_i32 | x2_is_i32;
|
||||
switch (_dual_types) {
|
||||
case kInput_64_64:
|
||||
case kInput_64_32:
|
||||
case kInput_32_64:
|
||||
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||
dual_types = _dual_types;
|
||||
break;
|
||||
case kInput_32_32:
|
||||
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||
dual_types = _dual_types;
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T1, class T2, class T3>
|
||||
uint32_t GcdElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
|
||||
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
|
||||
auto gcd_shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
|
||||
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
|
||||
y_ptr[i] = elewise_gcd(x1_ele_abs, x2_ele_abs);
|
||||
}
|
||||
};
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("[Gcd] max_core_num is 0, please check the cpu num.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, gcd_shard);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("[Gcd] Gcd Compute failed.");
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
gcd_shard(0, data_num);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T1, class T2, class T3>
|
||||
uint32_t GcdCompute(CpuKernelContext &ctx) {
|
||||
Tensor *x1 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *x2 = ctx.Input(kSecondInputIndex);
|
||||
Tensor *y = ctx.Output(kFirstOutputIndex);
|
||||
const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
|
||||
const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
|
||||
T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
Bcast bcast(x1_shape, x2_shape);
|
||||
if (bcast.IsValid()) {
|
||||
return GcdElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("[Gcd] broadcast failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
uint32_t GcdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kGcdInputNum, kGcdOutputNum), "[Gcd] check input and output number failed.");
|
||||
int32_t dual_types = static_cast<int32_t>(-1);
|
||||
KERNEL_HANDLE_ERROR(GcdIOTypeCheck(ctx, dual_types), "[Gcd] check data type failed.");
|
||||
switch (dual_types) {
|
||||
case kInput_64_64:
|
||||
return GcdCompute<int64_t, int64_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_64_32:
|
||||
return GcdCompute<int64_t, int32_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_32_64:
|
||||
return GcdCompute<int32_t, int64_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_32_32:
|
||||
return GcdCompute<int32_t, int32_t, int32_t>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kGcd, GcdCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,32 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_GCD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_GCD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class GcdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~GcdCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,268 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "geqrf.h"
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kGeqrf = "Geqrf";
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t GeqrfCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType input0_data_type = ctx.Input(0)->GetDataType();
|
||||
uint32_t ret = KERNEL_STATUS_PARAM_INVALID;
|
||||
switch (input0_data_type) {
|
||||
case DT_FLOAT16:
|
||||
ret = DoCompute<Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
ret = DoCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
ret = DoCompute<double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
ret = DoComputeC<float>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
ret = DoComputeC<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(input0_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Larfg(int n, int vm, int vn, T **A, T *tau) {
|
||||
T zero = static_cast<T>(0);
|
||||
if (n <= 1) {
|
||||
*tau = zero;
|
||||
return;
|
||||
}
|
||||
T xnorm = zero;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
xnorm = xnorm + A[i][vn] * A[i][vn];
|
||||
}
|
||||
xnorm = sqrt(xnorm);
|
||||
if (xnorm == zero) {
|
||||
*tau = zero;
|
||||
return;
|
||||
} else {
|
||||
T beta = sqrt(A[vm][vn] * A[vm][vn] + xnorm * xnorm);
|
||||
if (A[vm][vn] > zero) {
|
||||
beta = -beta;
|
||||
}
|
||||
*tau = (beta - (A[vm][vn])) / beta;
|
||||
auto scal = (A[vm][vn]) - beta;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
A[i][vn] /= scal;
|
||||
}
|
||||
A[vm][vn] = beta;
|
||||
}
|
||||
}
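// The reflector above follows the usual Householder construction (a sketch of the math, not extra
// code): beta = ±sqrt(a11^2 + ||x||^2) with the sign chosen opposite to a11, tau = (beta - a11) / beta,
// and the trailing entries of the column are scaled by 1 / (a11 - beta), where a11 = A[vm][vn] on entry.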
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Larf(int m, int n, T **A, T *tau, int cm, int cn) {
|
||||
if (m <= 0 || n <= 0) {
|
||||
return;
|
||||
}
|
||||
T *work = new T[n]();
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
work[j] += A[cm + i][cn - 1] * A[cm + i][cn + j];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i + cm][j + cn] -= (*tau) * A[cm + i][cn - 1] * work[j];
|
||||
}
|
||||
}
|
||||
delete[] work;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Geqrf(int m, int n, T **A, T *tau) {
|
||||
if (m < 0 || n < 0) {
|
||||
return;
|
||||
}
|
||||
int k = std::min(m, n);
|
||||
T one = static_cast<T>(1);
|
||||
for (int i = 0; i < k; i++) {
|
||||
Larfg<T>(m - i, i, i, A, tau + i);
|
||||
T aii = A[i][i];
|
||||
A[i][i] = one;
|
||||
Larf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
|
||||
A[i][i] = aii;
|
||||
}
|
||||
}
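// Illustrative walk-through (assumed shapes, not from the kernel): for a 3 x 2 matrix the loop runs
// k = 2 times; iteration i builds the Householder reflector for column i via Larfg, temporarily sets
// A[i][i] = 1 so the reflector's implicit leading 1 is in place, applies it to the trailing
// n - i - 1 columns with Larf, and then restores the diagonal entry.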
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CLarfg(int n, int vm, int vn, complex<T> **A, complex<T> *tau) {
|
||||
complex<T> one = complex<T>(1, 0);
|
||||
complex<T> zero = complex<T>(0, 0);
|
||||
if (n <= 0) {
|
||||
*tau = zero;
|
||||
return;
|
||||
}
|
||||
T xnorm = 0;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
xnorm = xnorm + norm(A[i][vn]);
|
||||
}
|
||||
xnorm = sqrt(xnorm);
|
||||
T alphr = A[vm][vn].real();
|
||||
T alphi = A[vm][vn].imag();
|
||||
if (xnorm == 0 && alphi == 0) {
|
||||
*tau = zero;
|
||||
} else {
|
||||
T beta;
|
||||
beta = sqrt(alphr * alphr + alphi * alphi + xnorm * xnorm);
|
||||
if (A[vm][vn].real() > 0) {
|
||||
beta = -beta;
|
||||
}
|
||||
*tau = complex<T>((beta - alphr) / beta, -alphi / beta);
|
||||
A[vm][vn] = one / (A[vm][vn] - beta);
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
A[i][vn] *= A[vm][vn];
|
||||
}
|
||||
A[vm][vn] = beta;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CLarf(int m, int n, complex<T> **A, complex<T> *tau, int cm, int cn) {
|
||||
if (m <= 0 || n <= 0) {
|
||||
return;
|
||||
}
|
||||
complex<T> zero = complex<T>(0, 0);
|
||||
complex<T> *work = new complex<T>[n];
|
||||
complex<T> temp = zero;
|
||||
for (int j = 0; j < n; j++) {
|
||||
for (int i = 0; i < m; i++) {
|
||||
temp = temp + conj(A[i + cm][j + cn]) * A[cm + i][cn - 1];
|
||||
}
|
||||
work[j] = temp;
|
||||
temp = zero;
|
||||
}
|
||||
for (int j = 0; j < n; j++) {
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i + cm][j + cn] = A[i + cm][j + cn] - conj(*tau) * A[cm + i][cn - 1] * conj(work[j]);
|
||||
}
|
||||
}
|
||||
delete[] work;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CGeqrf(int m, int n, complex<T> **A, complex<T> *tau) {
|
||||
if (m < 0 || n < 0) {
|
||||
return;
|
||||
}
|
||||
int k = std::min(m, n);
|
||||
complex<T> one = complex<T>(1, 0);
|
||||
complex<T> aii;
|
||||
for (int i = 0; i < k; i++) {
|
||||
CLarfg<T>(m - i, i, i, A, (tau + i));
|
||||
aii = A[i][i];
|
||||
A[i][i] = one;
|
||||
CLarf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
|
||||
A[i][i] = aii;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t GeqrfCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input0_tensor_shape = input0_tensor->GetTensorShape();
|
||||
int32_t dim = input0_tensor_shape->GetDims();
|
||||
if (dim != kOutputNum) {
|
||||
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input0_dims[0];
|
||||
const int32_t n = input0_dims[1];
|
||||
auto input_m = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_r = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto output_tau = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
|
||||
T **A = new T *[m];
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i] = new T[n];
|
||||
}
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i][j] = *(input_m + i * n + j);
|
||||
}
|
||||
}
|
||||
Geqrf<T>(m, n, A, output_tau);
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
*(output_r + i * n + j) = A[i][j];
|
||||
}
|
||||
}
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t GeqrfCpuKernel::DoComputeC(CpuKernelContext &ctx) {
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input0_tensor_shape = input0_tensor->GetTensorShape();
|
||||
int32_t dim = input0_tensor_shape->GetDims();
|
||||
if (dim != kOutputNum) {
|
||||
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input0_dims[0];
|
||||
const int32_t n = input0_dims[1];
|
||||
auto input_m = reinterpret_cast<complex<T> *>(ctx.Input(0)->GetData());
|
||||
auto output_r = reinterpret_cast<complex<T> *>(ctx.Output(0)->GetData());
|
||||
auto output_tau = reinterpret_cast<complex<T> *>(ctx.Output(1)->GetData());
|
||||
|
||||
complex<T> **A = new complex<T> *[m];
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i] = new complex<T>[n];
|
||||
}
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i][j] = *(input_m + i * n + j);
|
||||
}
|
||||
}
|
||||
CGeqrf<T>(m, n, A, output_tau);
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
*(output_r + i * n + j) = A[i][j];
|
||||
}
|
||||
}
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kGeqrf, GeqrfCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,55 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_GEQRF_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_GEQRF_H_
|
||||
|
||||
#include <complex>
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class GeqrfCpuKernel : public CpuKernel {
|
||||
public:
|
||||
GeqrfCpuKernel() = default;
|
||||
~GeqrfCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void Larfg(int n, int vm, int vn, T **A, T *tau);
|
||||
|
||||
template <typename T>
|
||||
void Larf(int m, int n, T **A, T *tau, int cm, int cn);
|
||||
|
||||
template <typename T>
|
||||
void Geqrf(int m, int n, T **A, T *tau);
|
||||
|
||||
template <typename T>
|
||||
void CLarfg(int n, int vm, int vn, std::complex<T> **A, std::complex<T> *tau);
|
||||
|
||||
template <typename T>
|
||||
void CLarf(int m, int n, std::complex<T> **A, std::complex<T> *tau, int cm, int cn);
|
||||
|
||||
template <typename T>
|
||||
void CGeqrf(int m, int n, std::complex<T> **A, std::complex<T> *tau);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoComputeC(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_GEQRF_H_
@@ -0,0 +1,89 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "hard_sigmoid.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kHardSigmoid = "HardSigmoid";
|
||||
const int64_t kParallelDataNums = 16 * 1024;
|
||||
const float alpha = 0.16666666;
|
||||
const float beta = 0.5;
|
||||
|
||||
#define HARD_SIGMOID_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = HardSigmoidCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("HardSigmoid kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t HardSigmoidCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoid);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("HardSigmoid kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t HardSigmoidCpuKernel::HardSigmoidCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
const T zero = static_cast<T>(0);
|
||||
const T three = static_cast<T>(3);
|
||||
const T six = static_cast<T>(6);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
|
||||
auto shard_hard_sigmoid = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid),
|
||||
"HardSigmoid Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
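// Spot check of the formula above (illustrative values): hard_sigmoid(x) = min(max(x + 3, 0), 6) / 6,
// which is clamp(x / 6 + 0.5, 0, 1); e.g. x = 1.5 gives 4.5 / 6 = 0.75, x = -4 gives 0, x = 9 gives 1.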
|
||||
REGISTER_CPU_KERNEL(kHardSigmoid, HardSigmoidCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,35 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
|
||||
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class HardSigmoidCpuKernel : public CpuKernel {
|
||||
public:
|
||||
HardSigmoidCpuKernel() = default;
|
||||
~HardSigmoidCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t HardSigmoidCompute(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,95 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "hard_sigmoid_grad.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *const kHardSigmoidGrad = "HardSigmoidGrad";
|
||||
const int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define HARD_SIGMOID_GRAD_COMPUTE_CASE(DTYPE1, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE1): { \
|
||||
uint32_t result = HardSigmoidGradCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t HardSigmoidGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoidGrad);
|
||||
DataType grads_type = ctx.Input(0)->GetDataType();
|
||||
DataType x_type = ctx.Input(1)->GetDataType();
|
||||
if (grads_type != x_type) {
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel input[0] data type [%s] must be the same as input[1] data type [%s].",
|
||||
DTypeStr(grads_type).c_str(), DTypeStr(x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
switch (grads_type) {
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, Eigen::half, ctx)
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT, float, float, ctx)
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_DOUBLE, double, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel inputs data type [%s] not support.", DTypeStr(grads_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t HardSigmoidGradCpuKernel::HardSigmoidGradCompute(const CpuKernelContext &ctx) {
|
||||
auto grads = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto input_x = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T2));
|
||||
const T2 zero = static_cast<T2>(0);
|
||||
const T2 three = static_cast<T2>(3);
|
||||
const T2 neg_three = static_cast<T2>(-3);
|
||||
const T2 one_sixth = static_cast<T2>(1.0f / 6.0f);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
*(y + i) =
|
||||
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
|
||||
auto shard_hard_sigmoid_grad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
*(y + i) =
|
||||
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid_grad),
|
||||
"HardSigmoidGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kHardSigmoidGrad, HardSigmoidGradCpuKernel);
|
||||
} // namespace aicpu
|
|
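The gradient rule used above is the piecewise-constant derivative of hard sigmoid: the incoming gradient is scaled by 1/6 where -3 < x < 3 and zeroed elsewhere (boundaries excluded). A standalone sketch of that rule in plain C++ (illustrative names, not part of the patch):

#include <cassert>
#include <cmath>
#include <vector>

// Element-wise HardSigmoidGrad: grads * 1/6 inside (-3, 3), 0 elsewhere,
// mirroring the ternary expression in HardSigmoidGradCompute above.
std::vector<float> HardSigmoidGradReference(const std::vector<float> &grads,
                                            const std::vector<float> &x) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = (x[i] > -3.0f && x[i] < 3.0f) ? grads[i] * (1.0f / 6.0f) : 0.0f;
  }
  return y;
}

int main() {
  const std::vector<float> grads = {6.0f, 6.0f, 6.0f};
  const std::vector<float> x = {-4.0f, 0.0f, 3.0f};
  const std::vector<float> y = HardSigmoidGradReference(grads, x);
  // Expected: 0 (outside the interval), ~1 (inside), 0 (boundary excluded).
  assert(y[0] == 0.0f && std::fabs(y[1] - 1.0f) < 1e-6f && y[2] == 0.0f);
  return 0;
}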
@@ -0,0 +1,35 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class HardSigmoidGradCpuKernel : public CpuKernel {
 public:
  HardSigmoidGradCpuKernel() = default;
  ~HardSigmoidGradCpuKernel() override = default;

  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T1, typename T2>
  uint32_t HardSigmoidGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,237 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "heaviside.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHeaviside = "Heaviside";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define HEAVISIDE_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                           \
    uint32_t result = HeavisideCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("Heaviside kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
template <typename T>
T heaviside(T a, T b) {
  return a == static_cast<T>(0) ? b : static_cast<T>(a > static_cast<T>(0));
}

uint32_t HeavisideCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Heaviside check input and output number failed.");
  KERNEL_HANDLE_ERROR(HeavisideParamCheck(ctx), "Heaviside check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    HEAVISIDE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    default:
      KERNEL_LOG_ERROR("Heaviside kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t HeavisideCpuKernel::HeavisideParamCheck(CpuKernelContext &ctx) {
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with "
                     "input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "HeavisideCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();

  BcastShapeType type;
  if (in0_elements_nums == in1_elements_nums) {
    type = BcastShapeType::SAME_SHAPE;
  } else {
    type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  }

  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_heaviside = [&](int64_t start, int64_t end) {
      switch (type) {
        case BcastShapeType::SAME_SHAPE:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
          }
          break;
        case BcastShapeType::X_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*in0, *(in1 + i));
          }
          break;
        case BcastShapeType::Y_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*(in0 + i), *in1);
          }
          break;
        default:
          KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
          break;
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
                        "Heaviside Compute failed.");
  } else {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
        }
        break;
      case BcastShapeType::X_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*in0, *(in1 + i));
        }
        break;
      case BcastShapeType::Y_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*(in0 + i), *in1);
        }
        break;
      default:
        KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
        break;
    }
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_heaviside = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        *(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
                        "Heaviside Compute failed.");
  } else {
    for (int64_t i = 0; i < data_num; ++i) {
      *(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::HeavisideCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (isNeedBcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kHeaviside, HeavisideCpuKernel);
} // namespace aicpu
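The scalar rule in heaviside<T>(a, b) above is the Heaviside step with a user-supplied value at zero: 0 for a < 0, b for a == 0, and 1 for a > 0; the surrounding NoBcastCompute/BcastCompute paths only decide how the two inputs are broadcast and sharded. A standalone sketch of the scalar rule in plain C++ (illustrative only, not part of the patch):

#include <cassert>

// Heaviside step with user-supplied value at zero, matching
// heaviside<T>(a, b) in heaviside.cc above:
//   a == 0 -> b, otherwise -> (a > 0 ? 1 : 0).
template <typename T>
T HeavisideReference(T a, T b) {
  return a == static_cast<T>(0) ? b : static_cast<T>(a > static_cast<T>(0));
}

int main() {
  assert(HeavisideReference(-2.0, 0.5) == 0.0);
  assert(HeavisideReference(0.0, 0.5) == 0.5);
  assert(HeavisideReference(3.0, 0.5) == 1.0);
  return 0;
}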
@@ -0,0 +1,43 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_
#define AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HeavisideCpuKernel : public CpuKernel {
 public:
  HeavisideCpuKernel() = default;
  ~HeavisideCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t HeavisideParamCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  uint32_t HeavisideCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -1,103 +0,0 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "is_inf.h"

#include "Eigen/Dense"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"

namespace {
const char *const kIsInf = "IsInf";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
constexpr int64_t kParallelDataNumsFloat16 = 128 * 1024;
constexpr int64_t kParallelDataNumsFloat = 128 * 1024;
constexpr int64_t kParallelDataNumsDouble = 300 * 1024;

#define ISINF_COMPUTE_CASE(DTYPE, TYPE, CTX)                \
  case (DTYPE): {                                           \
    uint32_t result = IsInfCompute<TYPE>(CTX);              \
    if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
      KERNEL_LOG_ERROR("IsInf kernel compute failed.");     \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
uint32_t IsInfCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kIsInf);
  KERNEL_HANDLE_ERROR(IsInfCheck(ctx), "[%s] check params failed.", kIsInf);
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    ISINF_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    ISINF_COMPUTE_CASE(DT_FLOAT, float, ctx)
    ISINF_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("IsInf kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}

uint32_t IsInfCpuKernel::IsInfCheck(const CpuKernelContext &ctx) const {
  KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
  KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t IsInfCpuKernel::IsInfCompute(const CpuKernelContext &ctx) {
  auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output = reinterpret_cast<bool *>(ctx.Output(0)->GetData());

  auto data_type = ctx.Input(0)->GetDataType();
  int64_t data_num = ctx.Output(0)->NumElements();
  int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));

  if ((data_type == DT_FLOAT16 && data_size <= kParallelDataNumsFloat16) ||
      (data_type == DT_FLOAT && data_size <= kParallelDataNumsFloat) ||
      (data_type == DT_DOUBLE && data_size <= kParallelDataNumsDouble)) {
    for (int64_t index = 0; index < data_num; index++) {
      *(output + index) = Eigen::numext::isinf(*(input + index));
    }
  } else {
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto shard_isinf = [&](size_t start, size_t end) {
      for (size_t index = start; index < end; index++) {
        *(output + index) = Eigen::numext::isinf(*(input + index));
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_isinf),
                        "IsInf Compute failed.");
  }

  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}

REGISTER_CPU_KERNEL(kIsInf, IsInfCpuKernel);
} // namespace aicpu
@@ -88,7 +88,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
    mindspore::kScatterNdOpName,
    mindspore::kScatterNdUpdateOpName,
    mindspore::kTensorScatterUpdateOpName,
    mindspore::kIsInfOpName,
    mindspore::kIsNanOpName,
    mindspore::kMatrixDeterminantOpName,
    mindspore::kMatrixLogarithmOpName,
@@ -145,7 +144,44 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
    mindspore::kMulOpName,
    mindspore::kConjOpName,
    mindspore::kZerosLikeOpName,
    mindspore::kMatrixBandPartOpName};
    mindspore::kMatrixBandPartOpName,
    mindspore::kDenseToCSRSparseMatrixOpName,
    mindspore::kDenseToSparseSetOperation,
    mindspore::kDiagOpName,
    mindspore::kDiagonalOpName,
    mindspore::kDiagPartOpName,
    mindspore::kEigOpName,
    mindspore::kEyeOpName,
    mindspore::kMaximumOpName,
    mindspore::kMinimumOpName,
    mindspore::kFractionalAvgPoolOpName,
    mindspore::kFractionalAvgPoolGradOpName,
    mindspore::kFractionalMaxPoolOpName,
    mindspore::kFractionalMaxPoolGradOpName,
    mindspore::kFractionalMaxPoolGradWithFixedKsizeOpName,
    mindspore::kGatherNdOpName,
    mindspore::kGcdOpName,
    mindspore::kGeqrfOpName,
    mindspore::kHardSigmoidOpName,
    mindspore::kHardSigmoidGradOpName,
    mindspore::kHeavisideOpName,
    mindspore::kHypotOpName,
    mindspore::kIdentityNOpName,
    mindspore::kIndexFillOpName,
    mindspore::kKLDivOpName,
    mindspore::kKlDivLossGradOpName,
    mindspore::kLcmOpName,
    mindspore::kLessEqualOpName,
    mindspore::kLogicalXorOpName,
    mindspore::kLogitOpName,
    mindspore::kLogitGradOpName,
    mindspore::kLogNormalReverseOpName,
    mindspore::kLowerBoundOpName,
    mindspore::kLstsqOpName,
    mindspore::kLuUnpackOpName,
    mindspore::kLuUnpackGradOpName,
    mindspore::kMatMulOpName,
    mindspore::kMatrixExpOpName};

  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
@@ -238,7 +238,42 @@ from .smooth_l1_loss import _smooth_l1_loss_aicpu
from .cumulative_logsumexp import _cumulative_logsumexp_aicpu
from .nuclear_norm import _nuclear_norm_aicpu
from .sparse_segment_sqrt_n import _sparse_segment_sqrt_n_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .scale_and_translate import _scale_and_translate_aicpu
from .quant_dtype_cast import _quant_dtype_cast_aicpu
from .fse_decode import _fse_decode_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .dense_to_csr_sparse_matrix import _dense_to_csr_sparse_matrix_aicpu
from .dense_to_sparse_set_operation import _dense_to_sparse_set_operation_aicpu
from .diag import _diag_aicpu
from .diagonal import _diagonal_aicpu
from .diag_part import _diag_part_aicpu
from .eig import _eig_aicpu
from .eye import _eye_aicpu
from .fmax import _fmax_aicpu
from .fmin import _fmin_aicpu
from .fractional_avg_pool import _fractional_avg_pool_aicpu
from .fractional_avg_pool_grad import _fractional_avg_pool_grad_aicpu
from .fractional_max_pool import _fractional_max_pool_aicpu
from .fractional_max_pool_grad import _fractional_max_pool_grad_aicpu
from .fractional_max_pool_grad_with_fixed_ksize import _fractional_max_pool_grad_with_fixed_ksize_aicpu
from .gcd import _gcd_aicpu
from .geqrf import _geqrf_aicpu
from .hard_sigmoid import _hard_sigmoid_aicpu
from .hard_sigmoid_grad import _hard_sigmoid_grad_aicpu
from .heaviside import _heaviside_aicpu
from .hypot import _hypot_aicpu
from .identity_n import _identity_n_aicpu
from .index_fill import _index_fill_aicpu
from .kldivloss import _kldiv_loss_aicpu
from .kldivlossgrad import _kldiv_loss_grad_aicpu
from .lcm import _lcm_aicpu
from .less_equal import _less_equal_aicpu
from .logical_xor import _logical_xor_aicpu
from .logit import _logit_aicpu
from .logit_grad import _logit_grad_aicpu
from .log_normal_reverse import _log_normal_reverse_aicpu
from .lower_bound import _lower_bound_aicpu
from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu