forked from mindspore-Ecosystem/mindspore
aicpu migration 35 ops, 0105 branch
This commit is contained in:
parent 551bcec327
commit 66cfa84dce
@@ -98,3 +98,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
@@ -282,3 +282,44 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16
@@ -1,26 +0,0 @@
mindspore.ops.PSROIPooling
==========================

.. py:class:: mindspore.ops.PSROIPooling(spatial_scale, group_size, output_dim)

    Applies Position Sensitive ROI-Pooling to the input Tensor.

    Args:
        - **spatial_scale** (float) - The scale factor that maps box coordinates to the input coordinates. For example, if your boxes are defined on a 224x224 image and your input is a 112x112 feature map (produced by scaling the original image by 0.5), this value should be set to 0.5.
        - **group_size** (int) - The size (in pixels) of the output after pooling, in the format (height, width).
        - **output_dim** (int) - The dimension of the output after pooling.

    Inputs:
        - **features** (Tensor) - The input feature Tensor, whose shape must be :math:`(N, C, H, W)`. The dimensions must satisfy :math:`(C == output\_dim * group\_size * group\_size)`. The data type is float16 or float32.
        - **rois** (Tensor) - A Tensor of shape :math:`(batch, 5, rois_n)` with data type float16 or float32. The first dimension, batch, is the batch size. The size of the second dimension must be 5. The third dimension, rois_n, is the number of rois. Each rois value has the format (index, x1, y1, x2, y2), where the first element is the index of the rois. The box coordinates are given as (x1, y1, x2, y2), and the regions selected by these boxes are extracted afterwards. The region coordinates must satisfy 0 <= x1 < x2 and 0 <= y1 < y2.

    Outputs:
        - **out** (Tensor) - The pooled output, with shape :math:`(rois.shape[0] * rois.shape[2], output\_dim, group\_size, group\_size)`.

    Raises:
        - **TypeError** - `spatial_scale` is not a float.
        - **TypeError** - `group_size` or `output_dim` is not an int.
        - **TypeError** - `features` or `rois` is not a Tensor.
        - **TypeError** - The data type of `rois` is not float16 or float32.
        - **ValueError** - The shape of `features` does not satisfy :math:`(C == output\_dim * group\_size * group\_size)`.
        - **ValueError** - `spatial_scale` is negative.
@@ -157,6 +157,7 @@ constexpr auto kCastOpName = "Cast";
constexpr auto kCentralizationOpName = "Centralization";
constexpr auto kCeLUOpName = "CeLU";
constexpr auto kCeluV2OpName = "CeluV2";
constexpr auto kCheckNumericsOpName = "CheckNumerics";
constexpr auto kClearZeroOpName = "ClearZero";
constexpr auto kClipBoxesOpName = "kClipBoxes";
constexpr auto kClipBoxesDOpName = "kClipBoxesD";
@@ -277,6 +278,7 @@ constexpr auto kFillV2DOpName = "FillV2D";
constexpr auto kFSEDecodeOpName = "FSEDecode";
constexpr auto kFive2FourOpName = "Five2Four";
constexpr auto kFlattenGradOpName = "FlattenGrad";
constexpr auto kFloorDivOpName = "FloorDiv";
constexpr auto kFour2FiveOpName = "Four2Five";
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
@@ -386,6 +388,7 @@ constexpr auto kLinSpaceDOpName = "LinSpaceD";
constexpr auto kListDiffOpName = "ListDiff";
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
constexpr auto kLogOpName = "Log";
constexpr auto kLog1pOpName = "Log1p";
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
@@ -409,6 +412,8 @@ constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMatrixTriangularSolveOpName = "MatrixTriangularSolve";
constexpr auto kMaximumGradGradOpName = "MaximumGradGrad";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@@ -422,15 +427,22 @@ constexpr auto kMaxPoolExt2OpName = "MaxPoolExt2";
constexpr auto kMaxPoolWithArgmaxOpName = "MaxPoolWithArgmax";
constexpr auto kMaxUnpool2DOpName = "MaxUnpool2D";
constexpr auto kMaxUnpool2DGradOpName = "MaxUnpool2DGrad";
constexpr auto kMaxUnpool3DOpName = "MaxUnpool3D";
constexpr auto kMaxUnpool3DGradOpName = "MaxUnpool3DGrad";
constexpr auto kMeanGradOpName = "MeanGrad";
constexpr auto kMedianOpName = "Median";
constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradGradOpName = "MinimumGradGrad";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMulNoNanOpName = "MulNoNan";
constexpr auto kMultilabelMarginLossGradOpName = "MultilabelMarginLossGrad";
constexpr auto kMultiMarginLossGradOpName = "MultiMarginLossGrad";
constexpr auto kMultiMarginLossOpName = "MultiMarginLoss";
constexpr auto kMultinomialOpName = "Multinomial";
constexpr auto kMuxReceiveOpName = "MuxReceive";
constexpr auto kMuxSendOpName = "MuxSend";
@@ -438,17 +450,21 @@ constexpr auto kNanToNumOpName = "NanToNum";
constexpr auto kNegOpName = "Neg";
constexpr auto kIm2ColOpName = "Im2Col";
constexpr auto kNewIm2ColOpName = "NewIm2Col";
constexpr auto kNextAfterOpName = "NextAfter";
constexpr auto kIm2colOpName = "Im2col";
constexpr auto kNMSWithMaskOpName = "NMSWithMask";
constexpr auto kNonDeterministicInts = "NonDeterministicInts";
constexpr auto kNonDeterministicIntsOpName = "NonDeterministicInts";
constexpr auto kNonMaxSuppressionV3OpName = "NonMaxSuppressionV3";
constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNthElementOpName = "NthElement";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kOrgqrOpName = "Orgqr";
constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
@@ -457,8 +473,11 @@ constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";
constexpr auto kSyncResizeBilinearV2GradOpName = "SyncResizeBilinearV2Grad";
constexpr auto kParameterizedTruncatedNormalOpName = "ParameterizedTruncatedNormal";
constexpr auto kPartialOpName = "partial";
constexpr auto kPdistGradOpName = "PdistGrad";
constexpr auto kPoissonOpName = "Poisson";
constexpr auto kPolarOpName = "Polar";
constexpr auto kPoolingOpName = "Pooling";
constexpr auto kPSROIPoolingOpName = "PSROIPooling";
constexpr auto kPSROIPoolingV2OpName = "PSROIPoolingV2";
@@ -481,13 +500,18 @@ constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRaggedRangeOpName = "RaggedRange";
constexpr auto kRaggedTensorToSparseOpName = "RaggedTensorToSparse";
constexpr auto kRaggedTensorToTensorOpName = "RaggedTensorToTensor";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomPoissonOpName = "RandomPoisson";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
constexpr auto kRangeDOpName = "RangeD";
constexpr auto kQuantDTypeCastOpName = "QuantDTypeCast";
constexpr auto kRealDivOpName = "RealDiv";
constexpr auto kReciprocalOpName = "Reciprocal";
constexpr auto kReciprocalGradOpName = "ReciprocalGrad";
constexpr auto kRecvOpName = "StreamRecv";
constexpr auto kReduceAllOpName = "ReduceAll";
constexpr auto kReduceAllDOpName = "ReduceAllD";
@@ -536,6 +560,7 @@ constexpr auto kResizeNearestNeighborV2DOpName = "ResizeNearestNeighborV2D";
constexpr auto kReverseV2OpName = "ReverseV2";
constexpr auto kReverseV2DOpName = "ReverseV2D";
constexpr auto kReturnOpName = "Return";
constexpr auto kRGBToHSVOpName = "RGBToHSV";
constexpr auto kROIAlignGradName = "ROIAlignGrad";
constexpr auto kRpcRecvOpName = "RpcRecv";
constexpr auto kRpcSendOpName = "RpcSend";
@@ -543,6 +568,9 @@ constexpr auto kRpnProposalsOpName = "RpnProposals";
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
constexpr auto kRsqrtOpName = "Rsqrt";
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
constexpr auto kScaleAndTranslateGradOpName = "ScaleAndTranslateGrad";
constexpr auto kScatterAddOpName = "ScatterAdd";
constexpr auto kScatterNdOpName = "ScatterNd";
constexpr auto kScatterNdDOpName = "ScatterNdD";
@@ -554,13 +582,19 @@ constexpr auto kSegmentMinOpName = "SegmentMin";
constexpr auto kSegmentProdOpName = "SegmentProd";
constexpr auto kSegmentSumOpName = "SegmentSum";
constexpr auto kSelectOpName = "Select";
constexpr auto kSelfAdjointEigOpName = "SelfAdjointEig";
constexpr auto kSeLUOpName = "SeLU";
constexpr auto kSeluOpName = "Selu";
constexpr auto kSendOpName = "StreamSend";
constexpr auto kSetSizeOpName = "SetSize";
constexpr auto kSGDName = "SGD";
constexpr auto kSigmoidOpName = "Sigmoid";
constexpr auto kSigmoidCrossEntropyWithLogitsV2OpName = "SigmoidCrossEntropyWithLogitsV2";
constexpr auto kSignOpName = "Sign";
constexpr auto kSimpleMeanGradOpName = "SimpleMeanGrad";
constexpr auto kSinOpName = "Sin";
constexpr auto kSincOpName = "Sinc";
constexpr auto kSinhOpName = "Sinh";
constexpr auto kSliceGradOpName = "SliceGrad";
constexpr auto kSliceOpName = "Slice";
constexpr auto kSliceDV2OpName = "SliceDV2";
@@ -0,0 +1,134 @@
|
|||
/**
|
||||
* Copyright 2021 Jilin University
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "check_numerics.h"
|
||||
|
||||
#include <securec.h>
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kCheckNumericsInputNum{1};
|
||||
const std::uint32_t kCheckNumericsOutputNum{1};
|
||||
const char *const kCheckNumerics{"CheckNumerics"};
|
||||
const std::int64_t kCheckNumericsParallelNum{64 * 1024};
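// Threshold for dispatching to ParallelFor: inputs with at most this many elements are
// processed serially (see ParallelForCheckNumerics below).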
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline bool ScalarCheckNumerics(const T x) {
|
||||
return !std::isfinite(x);
|
||||
}
|
||||
template <>
|
||||
inline bool ScalarCheckNumerics(const Eigen::half x) {
|
||||
return !Eigen::half_impl::isfinite(x);
|
||||
}
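// Specialization for Eigen::half: the finiteness check goes through Eigen's
// half_impl::isfinite instead of std::isfinite.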
|
||||
inline std::uint32_t ParallelForCheckNumerics(const CpuKernelContext &ctx, std::int64_t total,
|
||||
std::int64_t per_unit_size,
|
||||
const std::function<void(std::int64_t, std::int64_t)> &work) {
|
||||
if (total > kCheckNumericsParallelNum)
|
||||
return aicpu::CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, work);
|
||||
else
|
||||
work(0, total);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeCheckNumericsKernel(const CpuKernelContext &ctx) {
|
||||
T *input0{static_cast<T *>(ctx.Input(0)->GetData())};
|
||||
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
|
||||
std::int64_t total{ctx.Input(0)->NumElements()};
|
||||
std::uint32_t core_num{aicpu::CpuKernelUtils::GetCPUNum(ctx)};
|
||||
std::int64_t per_unit_size{total / std::min(std::max(1L, core_num - 2L), total)};
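// per_unit_size splits the work into roughly min(max(1, core_num - 2), total) chunks,
// so there are never more chunks than elements.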
|
||||
bool flag = false;
|
||||
std::uint32_t ret = ParallelForCheckNumerics(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
|
||||
flag = flag || std::any_of(input0 + begin, input0 + end, ScalarCheckNumerics<T>);
|
||||
if (!flag) {
|
||||
auto ret = memcpy_s(output + begin, static_cast<size_t>((end - begin) * sizeof(T)), input0 + begin,
|
||||
static_cast<size_t>((end - begin) * sizeof(T)));
|
||||
if (ret != EOK) {
|
||||
KERNEL_LOG_ERROR("memcpy_s error");
|
||||
}
|
||||
}
|
||||
});
|
||||
return flag ? KERNEL_STATUS_PARAM_INVALID : ret;
|
||||
}
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
|
||||
std::uint32_t result{ComputeCheckNumericsKernel<T>(ctx)};
|
||||
if (result != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CheckNumerics compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t ExtraCheckCheckNumerics(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
inline std::uint32_t CheckCheckNumerics(CpuKernelContext &ctx) {
|
||||
return NormalCheck(ctx, kCheckNumericsInputNum, kCheckNumericsOutputNum) ? KERNEL_STATUS_PARAM_INVALID
|
||||
: ExtraCheckCheckNumerics(ctx);
|
||||
}
|
||||
|
||||
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeCheckNumerics<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeCheckNumerics<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeCheckNumerics<std::double_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t CheckNumericsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::CheckCheckNumerics(ctx) ? KERNEL_STATUS_PARAM_INVALID : detail::ComputeCheckNumerics(ctx);
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kCheckNumerics, CheckNumericsCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,29 @@
|
|||
/**
|
||||
* Copyright 2021 Jilin University
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
|
||||
#define AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class CheckNumericsCpuKernel final : public CpuKernel {
|
||||
public:
|
||||
std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,296 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "floordiv.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *const kFloorDiv = "FloorDiv";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 4 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 32 * 1024;
|
||||
|
||||
#define FLOORDIV_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = FloorDivCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("FloorDiv kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FloorDivCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kFloorDiv);
|
||||
KERNEL_HANDLE_ERROR(FloorDivParamCheck(ctx), "[%s] check params failed.", kFloorDiv);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FloorDiv kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FloorDivCpuKernel::FloorDivParamCheck(const CpuKernelContext &ctx) const {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T DivCal(const T &x_i, const T &y_i) {
|
||||
return static_cast<T>(Eigen::numext::floor(x_i / y_i));
|
||||
}
|
||||
|
||||
template <>
|
||||
int8_t DivCal(const int8_t &x_i, const int8_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int8_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int8_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int16_t DivCal(const int16_t &x_i, const int16_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int16_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int16_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int32_t DivCal(const int32_t &x_i, const int32_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int32_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int32_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int64_t DivCal(const int64_t &x_i, const int64_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int64_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int64_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
// special compute is used in the following situations.
|
||||
// 1. the shapes of input1 and input2 are the same
|
||||
// 2. input1 is a 1D tensor with only one element or input1 is scalar
|
||||
// 3. input2 is a 1D tensor with only one element or input2 is scalar
|
||||
// 4. the shapes of input1 and input2 are different
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
|
||||
const T *input2, T *output) {
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*(input1 + i), *(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*input1, *(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*input2 == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*(input1 + i), *input2);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::NoBcastCompute(const CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
uint32_t status = KERNEL_STATUS_OK;
|
||||
auto sharder_floor_div = [&](int64_t start, int64_t end) {
|
||||
uint32_t status_sharder = SpecialCompute<T>(type, start, end, in0, in1, out);
|
||||
if (status_sharder != KERNEL_STATUS_OK) {
|
||||
status = status_sharder;
|
||||
}
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
|
||||
"FloorDiv Compute failed.");
|
||||
return status;
|
||||
}
|
||||
|
||||
return SpecialCompute<T>(type, 0, data_num, in0, in1, out);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
uint32_t status = KERNEL_STATUS_OK;
|
||||
auto sharder_floor_div = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
status = KERNEL_STATUS_INNER_ERROR;
|
||||
break;
|
||||
}
|
||||
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
|
||||
}
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
|
||||
"FloorDiv Compute failed.");
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num >= kParallelDataNum) {
|
||||
return BcastParallelCompute<T>(ctx, bcast);
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::FloorDivCompute(const CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
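// Note: despite its name, isNeedBcast == true means the fast element-wise path applies
// (identical shapes or a single-element operand) and no index broadcasting is required.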
|
||||
if (isNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kFloorDiv, FloorDivCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FloorDivCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FloorDivCpuKernel() = default;
|
||||
~FloorDivCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t FloorDivParamCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCompute(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,162 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "log1p.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kLog1p = "Log1p";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
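// Inputs whose byte size is at most kParallelDataNums are processed serially; larger inputs use ParallelFor.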
|
||||
|
||||
#define LOG1P_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = Log1pCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define LOG1P_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = Log1pComputeComplex<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t Log1pCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog1p);
|
||||
KERNEL_HANDLE_ERROR(Log1pCheck(ctx), "[%s] check params failed.", kLog1p);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
LOG1P_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
LOG1P_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
LOG1P_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
LOG1P_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
LOG1P_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Log1p kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t Log1pCpuKernel::Log1pCheck(const CpuKernelContext &ctx) const {
|
||||
auto input_0 = ctx.Input(0);
|
||||
auto output_0 = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCpuKernel::Log1pCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_log1p = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
|
||||
"Log1p Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCpuKernel::Log1pComputeComplex(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
typedef Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
|
||||
ArrayxXd array_x(1, data_num);
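// array_x only buffers the complex inputs so that their real part can be read for the domain check below.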
|
||||
if (data_size <= kParallelDataNums) {
|
||||
if (data_type == DT_COMPLEX64) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_log1p = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (data_type == DT_COMPLEX64) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
} else {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
|
||||
"Log1p Compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLog1p, Log1pCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LOG1P_H
|
||||
#define AICPU_KERNELS_NORMALIZED_LOG1P_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class Log1pCpuKernel : public CpuKernel {
|
||||
public:
|
||||
Log1pCpuKernel() = default;
|
||||
~Log1pCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t Log1pCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pComputeComplex(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,180 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "matrix_triangular_solve.h"
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include "Eigen/Core"
|
||||
#include "complex"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
|
||||
using namespace Eigen;
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kMatrixTriangularSolve = "MatrixTriangularSolve";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MatrixTriangularSolveCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MatrixTriangularSolveCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"MatrixTriangularSolve check input and output number failed.");
|
||||
|
||||
KERNEL_HANDLE_ERROR(MatrixTriangularSolveCheck(ctx), "MatrixTriangularSolve check params failed.");
|
||||
// check the data type of the inputs
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCheck(CpuKernelContext &ctx) {
|
||||
Tensor *in_matrix = ctx.Input(0);
|
||||
Tensor *in_rhs = ctx.Input(1);
|
||||
// check same data type constraint
|
||||
auto in_type0 = in_matrix->GetDataType();
|
||||
auto in_type1 = in_rhs->GetDataType();
|
||||
KERNEL_CHECK_FALSE((in_type0 == in_type1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input1 [%s] need be same with "
|
||||
"input0 [%s].",
|
||||
DTypeStr(in_type1).c_str(), DTypeStr(in_type0).c_str())
|
||||
// check the number of matrix
|
||||
auto in_shape0 = in_matrix->GetTensorShape();
|
||||
auto in_shape1 = in_rhs->GetTensorShape();
|
||||
|
||||
std::vector<int64_t> dims0 = in_shape0->GetDimSizes();
|
||||
std::vector<int64_t> dims1 = in_shape1->GetDimSizes();
|
||||
|
||||
// Check the shape of two inputs
|
||||
if (dims0[0] != dims1[0]) {
|
||||
KERNEL_LOG_ERROR("The shapes of two inputs are not matched");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
// check square
|
||||
int m = dims0.size();
|
||||
if (dims0[m - 2] != dims0[m - 1] || dims0[m - 1] == 0) {
|
||||
KERNEL_LOG_ERROR("The input0 must be one or more squares.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCompute(CpuKernelContext &ctx) {
|
||||
Tensor *matrix_tensor = ctx.Input(0);
|
||||
Tensor *rhs_tensor = ctx.Input(1);
|
||||
Tensor *y_tensor = ctx.Output(0);
|
||||
|
||||
auto input_matrix = reinterpret_cast<T *>(matrix_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_matrix, KERNEL_STATUS_PARAM_INVALID, "Get input data0 failed.")
|
||||
auto input_rhs = reinterpret_cast<T *>(rhs_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_rhs, KERNEL_STATUS_PARAM_INVALID, "Get input data1 failed.")
|
||||
auto output_y = reinterpret_cast<T *>(y_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
|
||||
AttrValue *lower_attr = ctx.GetAttr("lower");
|
||||
KERNEL_CHECK_NULLPTR(lower_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [lower] failed.");
|
||||
AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
|
||||
KERNEL_CHECK_NULLPTR(adjoint_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [adjoint] failed.");
|
||||
bool lower_data = lower_attr->GetBool();
|
||||
bool adjoint_data = adjoint_attr->GetBool();
|
||||
|
||||
auto matrix_shape = matrix_tensor->GetTensorShape();
|
||||
auto rhs_shape = rhs_tensor->GetTensorShape();
|
||||
auto y_shape = y_tensor->GetTensorShape();
|
||||
|
||||
// Get the number of elements
|
||||
auto input1_num = matrix_tensor->NumElements();
|
||||
|
||||
// slice
|
||||
std::vector<int64_t> matrix_dims = matrix_shape->GetDimSizes();
|
||||
auto last_matrix_dims = *(matrix_dims.end() - 1);
|
||||
size_t matrix_size = last_matrix_dims * last_matrix_dims; // size of a matrix
|
||||
size_t matrix_num = input1_num / matrix_size; // number of matrix
|
||||
|
||||
std::vector<int64_t> rhs_dims = rhs_shape->GetDimSizes();
|
||||
auto last_rhs_dims = *(rhs_dims.end() - 1);
|
||||
size_t rhs_size = last_matrix_dims * last_rhs_dims;
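// Each batch element pairs an n x n coefficient matrix (matrix_size elements) with an n x k
// right-hand side (rhs_size elements), where n = last_matrix_dims and k = last_rhs_dims.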
|
||||
|
||||
auto data_size = matrix_num * matrix_size;
|
||||
|
||||
auto shard_matrix_triangular_solve = [&](size_t start, size_t end) {
|
||||
for (size_t k = start; k < end; ++k) {
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_input(
|
||||
input_matrix + k * matrix_size, last_matrix_dims, last_matrix_dims);
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_rhs(
|
||||
input_rhs + k * rhs_size, last_matrix_dims, last_rhs_dims);
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_output(
|
||||
output_y + k * rhs_size, last_matrix_dims, last_rhs_dims);
|
||||
if (lower_data) {
|
||||
auto triangle = eigen_input.template triangularView<Eigen::Lower>();
|
||||
if (adjoint_data) {
|
||||
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
|
||||
} else {
|
||||
eigen_output.noalias() = triangle.solve(eigen_rhs);
|
||||
}
|
||||
} else {
|
||||
auto triangle = eigen_input.template triangularView<Eigen::Upper>();
|
||||
if (adjoint_data) {
|
||||
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
|
||||
} else {
|
||||
eigen_output.noalias() = triangle.solve(eigen_rhs);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
if (data_size < kParallelDataNums) {
|
||||
shard_matrix_triangular_solve(0, matrix_num);
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_matrix_triangular_solve),
|
||||
"MatrixTriangularSolve Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kMatrixTriangularSolve, MatrixTriangularSolveCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "Eigen/Core"
|
||||
|
||||
namespace aicpu {
|
||||
class MatrixTriangularSolveCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatrixTriangularSolveCpuKernel() = default;
|
||||
~MatrixTriangularSolveCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
bool lower;
|
||||
bool adjoint;
|
||||
|
||||
template <typename T>
|
||||
static uint32_t MatrixTriangularSolveCompute(CpuKernelContext &ctx);
|
||||
|
||||
static uint32_t MatrixTriangularSolveCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "maximum_grad_grad.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kMaximumGradGradInputNum = 4;
|
||||
constexpr uint32_t kMaximumGradGradOutputNum = 3;
|
||||
const char *kMaximumGradGrad = "MaximumGradGrad";
|
||||
|
||||
#define MAXIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MaximumGradGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MaximumGradGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MaximumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaximumGradGradInputNum, kMaximumGradGradOutputNum),
|
||||
"MaximumGradGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MaximumGradGradParamCheck(ctx), "MaximumGradGrad check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MaximumGradGradCpuKernel::MaximumGradGradParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null inputs and outputs have already been verified in NormalCheck.
|
||||
Tensor *x1 = ctx.Input(0);
|
||||
Tensor *x2 = ctx.Input(1);
|
||||
Tensor *grad_y1 = ctx.Input(2);
|
||||
Tensor *grad_y2 = ctx.Input(3);
|
||||
// type check
|
||||
DataType grad_y1_type = grad_y1->GetDataType();
|
||||
DataType grad_y2_type = grad_y2->GetDataType();
|
||||
DataType x1_type = x1->GetDataType();
|
||||
DataType x2_type = x2->GetDataType();
|
||||
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
|
||||
"x2 [%s] need to be same.",
|
||||
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
|
||||
DTypeStr(x2_type).c_str())
|
||||
// shape check
|
||||
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
|
||||
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
|
||||
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCpuKernel::MaximumGradGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
|
||||
*out0 = static_cast<T>(0);
|
||||
*out1 = static_cast<T>(0);
|
||||
int64_t data_num = ctx.Output(2)->NumElements();
|
||||
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in0 + bcast.GetBroadcastXIndex(i)) >= *(in1 + bcast.GetBroadcastYIndex(i))) {
|
||||
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMaximumGradGrad, MaximumGradGradCpuKernel);
|
||||
} // namespace aicpu
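// Reading of BcastCompute above (illustrative, values assumed): MaximumGradGrad routes the
// incoming second-order gradients element-wise. Where x1 >= x2 the forward Maximum picked x1,
// so out2 takes grad_y1; otherwise it takes grad_y2. out0 and out1 are written as zeros
// (only their first element is set in this implementation).
//   x1 = [1, 5], x2 = [3, 2], grad_y1 = [10, 20], grad_y2 = [30, 40]
//   -> out2 = [30, 20]   (x1[0] < x2[0] picks grad_y2[0]; x1[1] >= x2[1] picks grad_y1[1])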
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MaximumGradGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MaximumGradGradCpuKernel() = default;
|
||||
~MaximumGradGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MaximumGradGradParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,286 @@
|
|||
/**
|
||||
* Copyright 2021 Harbin Institute of Technology
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "maxpool.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
namespace {
|
||||
const char *MAXPOOL = "MaxPool";
|
||||
constexpr uint32_t kMaxPoolInputNum = 1;
|
||||
constexpr uint32_t kMaxPoolOutputNum = 1;
|
||||
constexpr int64_t kParallelNum = 64 * 1024;
|
||||
struct PoolParams {
|
||||
int depth;
|
||||
|
||||
int tensor_cols;
|
||||
int tensor_rows;
|
||||
int tensor_batch;
|
||||
|
||||
int ksize_rows;
|
||||
int ksize_cols;
|
||||
int ksize_depth;
|
||||
|
||||
int strides_rows;
|
||||
int strides_cols;
|
||||
int strides_depth;
|
||||
|
||||
int64_t out_height;
|
||||
int64_t out_width;
|
||||
int out_depth;
|
||||
|
||||
int64_t pad_top;
|
||||
int64_t pad_bottom;
|
||||
int64_t pad_left;
|
||||
int64_t pad_right;
|
||||
};
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
uint32_t GetOutputSize(int input_size, int kernel_size, int stride, const std::string &padding, int64_t *output_size,
|
||||
int64_t *padding_before, int64_t *padding_after) {
|
||||
KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPool] Stride must be positive.");
|
||||
std::string same("SAME"), valid("VALID");
|
||||
if (valid == padding) {
|
||||
*output_size = (input_size - kernel_size + stride) / stride;
|
||||
*padding_before = 0;
|
||||
*padding_after = 0;
|
||||
} else if (same == padding) {
|
||||
*output_size = (input_size + stride - 1) / stride;
|
||||
const int64_t padding_need =
|
||||
std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
|
||||
*padding_before = padding_need / 2;
|
||||
*padding_after = padding_need - *padding_before;
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Padding is invalid.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (*output_size < 0) {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Computed output size is negative.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
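// Worked example of the formulas above (illustrative values): input_size = 10, kernel_size = 3, stride = 2.
//   VALID: output = (10 - 3 + 2) / 2 = 4, no padding.
//   SAME:  output = (10 + 2 - 1) / 2 = 5, padding_need = max(0, (5 - 1) * 2 + 3 - 10) = 1,
//          padding_before = 0, padding_after = 1.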
|
||||
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams ¶ms) {
|
||||
Format format = data_format.GetFormat();
|
||||
KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] Format is not NHWC or NCHW.");
|
||||
std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
|
||||
if (tensor_in_shapes.size() != 4) {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Input tensor must have 2 spacial dimensions.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
|
||||
std::string padding = ctx.GetAttr("padding")->GetString();
|
||||
std::string data_format_str = "";
|
||||
if (ctx.GetAttr("data_format") == nullptr) {
|
||||
KERNEL_LOG_INFO("[MaxPool] Attr data_format is empty, using default value NHWC.");
|
||||
format = FORMAT_NHWC;
|
||||
} else {
|
||||
std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
|
||||
data_format_str = ctx.GetAttr("data_format")->GetString();
|
||||
|
||||
KERNEL_CHECK_FALSE(format_str_to_enum_map.find(data_format_str) != format_str_to_enum_map.end(),
KERNEL_STATUS_PARAM_INVALID, "[MaxPool] data_format string is invalid.");
|
||||
format = format_str_to_enum_map[data_format_str];
|
||||
}
|
||||
switch (format) {
|
||||
case FORMAT_NHWC:
|
||||
params.depth = tensor_in_shapes[kFormatNHWCIndexC];
|
||||
params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
|
||||
params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
|
||||
params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
|
||||
params.ksize_rows = ksize[kFormatNHWCIndexH];
|
||||
params.ksize_cols = ksize[kFormatNHWCIndexW];
|
||||
params.ksize_depth = ksize[kFormatNHWCIndexC];
|
||||
params.strides_rows = strides[kFormatNHWCIndexH];
|
||||
params.strides_cols = strides[kFormatNHWCIndexW];
|
||||
params.strides_depth = strides[kFormatNHWCIndexC];
|
||||
break;
|
||||
case FORMAT_NCHW:
|
||||
params.depth = tensor_in_shapes[kFormatNCHWIndexC];
|
||||
params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
|
||||
params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
|
||||
params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
|
||||
params.ksize_rows = ksize[kFormatNCHWIndexH];
|
||||
params.ksize_cols = ksize[kFormatNCHWIndexW];
|
||||
params.ksize_depth = ksize[kFormatNCHWIndexC];
|
||||
params.strides_rows = strides[kFormatNCHWIndexH];
|
||||
params.strides_cols = strides[kFormatNCHWIndexW];
|
||||
params.strides_depth = strides[kFormatNCHWIndexC];
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[MaxPool] Format is not NHWC or NCHW, current is [%s].", FormatToSerialString(format).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto ret1 = GetOutputSize(params.tensor_rows, params.ksize_rows, params.strides_rows, padding, ¶ms.out_height,
|
||||
¶ms.pad_top, ¶ms.pad_bottom),
|
||||
ret2 = GetOutputSize(params.tensor_cols, params.ksize_cols, params.strides_cols, padding, ¶ms.out_width,
|
||||
¶ms.pad_left, ¶ms.pad_right);
|
||||
KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] An error occurred while calculating output size.");
|
||||
params.out_depth = params.depth;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T>
|
||||
uint32_t SpacialMaxPool(CpuKernelContext &ctx, const PoolParams ¶ms) {
|
||||
Tensor *input = ctx.Input(kFirstInputIndex);
|
||||
Tensor *output = ctx.Output(kFirstOutputIndex);
|
||||
|
||||
const T *raw_input_data = static_cast<T *>(input->GetData());
|
||||
T *raw_output_data = static_cast<T *>(output->GetData());
|
||||
auto shard_NCHW = [¶ms, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
|
||||
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
|
||||
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
|
||||
const int64_t batch_size = limit;
|
||||
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
|
||||
const int64_t Y_W = params.out_width, Y_H = params.out_height;
|
||||
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
|
||||
const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
|
||||
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
|
||||
stride_w = static_cast<int64_t>(params.strides_cols);
|
||||
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
|
||||
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
|
||||
kernel_w = static_cast<int64_t>(params.ksize_cols);
|
||||
const T *x_ptr = raw_input_data + start * X_stride;
|
||||
T *y_ptr = raw_output_data + start * Y_stride;
|
||||
for (int64_t i = start; i < batch_size; ++i) {
|
||||
ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
|
||||
EigenArrayMap y_arr(y_ptr, Y_W, Y_H);
|
||||
for (int64_t h = 0; h < Y_H; ++h) {
|
||||
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
|
||||
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
|
||||
for (int64_t w = 0; w < Y_W; ++w) {
|
||||
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
|
||||
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
|
||||
const int64_t y = h * Y_W + w;
|
||||
y_arr(y) = x_arr.block(l, t, r - l, b - t).maxCoeff();
|
||||
}
|
||||
}
|
||||
x_ptr += X_stride;
|
||||
y_ptr += Y_stride;
|
||||
}
|
||||
};
|
||||
auto shard_NHWC = [¶ms, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
|
||||
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
|
||||
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
|
||||
const int64_t batch_size = limit;
|
||||
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
|
||||
const int64_t Y_W = params.out_width, Y_H = params.out_height;
|
||||
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
|
||||
const int64_t C = static_cast<int64_t>(params.depth);
|
||||
const int64_t X_stride = X_HxW * C, Y_stride = Y_HxW * C;
|
||||
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
|
||||
stride_w = static_cast<int64_t>(params.strides_cols);
|
||||
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
|
||||
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
|
||||
kernel_w = static_cast<int64_t>(params.ksize_cols);
|
||||
const T *x_ptr = raw_input_data + start * X_stride;
|
||||
T *y_ptr = raw_output_data + start * Y_stride;
|
||||
for (int64_t i = start; i < batch_size; ++i) {
|
||||
ConstEigenArrayMap x_arr(x_ptr, C, X_HxW);
|
||||
EigenArrayMap y_arr(y_ptr, C, Y_HxW);
|
||||
for (int64_t h = 0; h < Y_H; ++h) {
|
||||
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
|
||||
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
|
||||
for (int64_t w = 0; w < Y_W; ++w) {
|
||||
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
|
||||
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
|
||||
const int64_t y = h * Y_W + w;
|
||||
y_arr.col(y).setConstant(Eigen::NumTraits<T>::lowest());
|
||||
for (int64_t xi = t; xi < b; ++xi) {
|
||||
for (int64_t yj = l; yj < r; ++yj) {
|
||||
y_arr.col(y) = y_arr.col(y).max(x_arr.col(xi * X_W + yj));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
x_ptr += X_stride;
|
||||
y_ptr += Y_stride;
|
||||
}
|
||||
};
|
||||
int64_t total_elements = params.tensor_batch * params.tensor_cols * params.tensor_rows * params.depth;
|
||||
if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
|
||||
int64_t total_images = params.tensor_batch * params.depth;
|
||||
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NCHW");
|
||||
if (total_elements <= kParallelNum) {
|
||||
shard_NCHW(0, total_images);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
max_core_num = std::min(total_images, static_cast<int64_t>(max_core_num));
|
||||
return CpuKernelUtils::ParallelFor(ctx, total_images, total_images / max_core_num, shard_NCHW);
|
||||
}
|
||||
} else {
|
||||
int64_t total_images_with_chann = params.tensor_batch;
|
||||
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NHWC");
|
||||
if (total_elements <= kParallelNum) {
|
||||
shard_NHWC(0, total_images_with_chann);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
max_core_num = std::min(total_images_with_chann, static_cast<int64_t>(max_core_num));
|
||||
return CpuKernelUtils::ParallelFor(ctx, total_images_with_chann, total_images_with_chann / max_core_num,
|
||||
shard_NHWC);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
uint32_t ComputeMaxPoolImpl(CpuKernelContext &ctx) {
|
||||
TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
|
||||
PoolParams params;
|
||||
KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] Pooling parameters construct failed.")
|
||||
return SpacialMaxPool<T>(ctx, params);
|
||||
}
|
||||
uint32_t MaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
const std::vector<std::string> required_attrs = {"ksize", "strides", "padding"};
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaxPoolInputNum, kMaxPoolOutputNum, required_attrs),
|
||||
"[MaxPool] Check input and output number failed.");
|
||||
DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeMaxPoolImpl<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeMaxPoolImpl<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeMaxPoolImpl<double>(ctx);
|
||||
case DT_INT8:
|
||||
return ComputeMaxPoolImpl<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return ComputeMaxPoolImpl<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return ComputeMaxPoolImpl<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return ComputeMaxPoolImpl<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return ComputeMaxPoolImpl<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return ComputeMaxPoolImpl<uint16_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[MaxPool] Data type [%s] is not supported.", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(MAXPOOL, MaxPoolCpuKernel);
|
||||
} // namespace aicpu
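// Illustrative sketch of what SpacialMaxPool produces (values assumed, not part of the kernel):
// a 1x4x4x1 NHWC input with ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1] and VALID padding
// yields a 1x2x2x1 output holding the maximum of each non-overlapping 2x2 window.
//   input  = [[ 1,  2,  3,  4],
//             [ 5,  6,  7,  8],
//             [ 9, 10, 11, 12],
//             [13, 14, 15, 16]]
//   output = [[ 6,  8],
//             [14, 16]]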
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2021 Harbin Institute of Technology
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MaxPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~MaxPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
#endif
|
|
@ -0,0 +1,129 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minimum_grad_grad.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kMinimumGradGradInputNum = 4;
|
||||
constexpr uint32_t kMinimumGradGradOutputNum = 3;
|
||||
const char *kMinimumGradGrad = "MinimumGradGrad";
|
||||
|
||||
#define MINIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MinimumGradGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MinimumGradGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MinimumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMinimumGradGradInputNum, kMinimumGradGradOutputNum),
|
||||
"MinimumGradGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MinimumGradGradParamCheck(ctx), "MinimumGradGrad check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MinimumGradGradCpuKernel::MinimumGradGradParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null inputs and outputs have already been verified in NormalCheck.
|
||||
Tensor *x1 = ctx.Input(0);
|
||||
Tensor *x2 = ctx.Input(1);
|
||||
Tensor *grad_y1 = ctx.Input(2);
|
||||
Tensor *grad_y2 = ctx.Input(3);
|
||||
|
||||
// type check
|
||||
DataType grad_y1_type = grad_y1->GetDataType();
|
||||
DataType grad_y2_type = grad_y2->GetDataType();
|
||||
DataType x1_type = x1->GetDataType();
|
||||
DataType x2_type = x2->GetDataType();
|
||||
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
|
||||
"x2 [%s] need to be same.",
|
||||
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
|
||||
DTypeStr(x2_type).c_str())
|
||||
// shape check
|
||||
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
|
||||
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
|
||||
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCpuKernel::MinimumGradGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
|
||||
*out0 = static_cast<T>(0);
|
||||
*out1 = static_cast<T>(0);
|
||||
int64_t data_num = ctx.Output(2)->NumElements();
|
||||
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in0 + bcast.GetBroadcastXIndex(i)) <= *(in1 + bcast.GetBroadcastYIndex(i))) {
|
||||
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMinimumGradGrad, MinimumGradGradCpuKernel);
|
||||
} // namespace aicpu
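// Reading of the kernel above (illustrative): MinimumGradGrad mirrors MaximumGradGrad with the
// comparison flipped. Where x1 <= x2 the forward Minimum picked x1, so out2 takes grad_y1,
// otherwise grad_y2; the x1/x2 indices are broadcast through Bcast exactly as in the loop above.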
|
|
@ -0,0 +1,44 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MinimumGradGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MinimumGradGradCpuKernel() = default;
|
||||
~MinimumGradGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MinimumGradGradParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,249 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "mul_no_nan.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kMulNoNan = "MulNoNan";
|
||||
// When the input data size exceeds kParallelDataNum, use the parallel path.
|
||||
const int64_t kParallelDataNum = 8 * 1024;
|
||||
const int64_t kParallelDataNumMid = 64 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 32 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 256 * 1024;
|
||||
|
||||
#define MULNONAN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MulNoNanCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MulNoNan kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MulNoNanCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MulNoNan check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MulNoNanParamCheck(ctx), "MulNoNan check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MULNONAN_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MulNoNan kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MulNoNanCpuKernel::MulNoNanParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null input_0, input_1 and output have already been verified in NormalCheck.
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"LessCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// SpecialCompute handles the following situations:
// 1. input1 and input2 have the same shape
// 2. input1 is a scalar or a 1-D tensor with a single element
// 3. input2 is a scalar or a 1-D tensor with a single element
// All other shape combinations are broadcast in BcastCompute instead.
|
||||
template <typename T>
|
||||
void MulNoNanCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
|
||||
const T *input2, T *output) {
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == (T)0) {
|
||||
*(output + i) = (T)0;
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) * *(input2 + i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == (T)0) {
|
||||
*(output + i) = (T)0;
|
||||
} else {
|
||||
*(output + i) = *input1 * *(input2 + i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
if (*input2 == (T)0) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = (T)0;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = *(input1 + i) * *input2;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
|
||||
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
|
||||
} else {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
|
||||
"MulNoNan Compute failed.");
|
||||
}
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
|
||||
*(out + i) = (T)0;
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
|
||||
} else {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
|
||||
"MulNoNan Compute failed.");
|
||||
}
|
||||
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
|
||||
*(out + i) = (T)0;
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::MulNoNanCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (noNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMulNoNan, MulNoNanCpuKernel);
|
||||
} // namespace aicpu
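// Illustrative sketch of the MulNoNan contract enforced above (not part of the kernel):
// whenever the second operand is exactly zero the product is forced to zero, even if the first
// operand is inf or NaN, so 0 * inf does not poison the output. Names below are assumptions.
//   #include <cmath>
//   #include <limits>
//   float mul_no_nan(float x, float y) { return y == 0.0f ? 0.0f : x * y; }
//   // mul_no_nan(std::numeric_limits<float>::infinity(), 0.0f) == 0.0f
//   // mul_no_nan(std::nanf(""), 0.0f) == 0.0f, mul_no_nan(2.0f, 3.0f) == 6.0f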
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MulNoNanCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MulNoNanCpuKernel() = default;
|
||||
~MulNoNanCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MulNoNanParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,196 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "multilabel_margin_loss_grad.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kMultilabelMarginLossGrad = "MultilabelMarginLossGrad";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
constexpr uint32_t kInputNum = 4;
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"MultilabelMarginLossGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MultilabelMarginLossGradCheck(ctx), "MultilabelMarginLossGrad check params failed.");
|
||||
auto data_type = ctx.Input(1)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return MultilabelMarginLossGradComputeFP16<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return MultilabelMarginLossGradCompute<float>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MultilabelMarginLossGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCheck(CpuKernelContext &ctx) {
|
||||
auto target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
int64_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
KERNEL_CHECK_FALSE(*(target + i) >= -1 && (*(target + i) < batch_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s target out of range.", ctx.GetOpType().c_str());
|
||||
}
|
||||
if (reduction == "none") {
|
||||
if (dims == 1) {
|
||||
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad should be a scalar "
|
||||
"when rank of x is 1.",
|
||||
ctx.GetOpType().c_str())
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(
|
||||
ctx.Input(0)->GetTensorShape()->GetDims() == 1 &&
|
||||
ctx.Input(0)->GetTensorShape()->GetDimSize(0) == ctx.Input(1)->GetTensorShape()->GetDimSize(0),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad's shape should be the same as "
|
||||
"{x_shape[0]} when the rank of x is 2 and reduction is none.",
|
||||
ctx.GetOpType().c_str())
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad should be a scalar "
|
||||
"when reduction is mean or sum.",
|
||||
ctx.GetOpType().c_str())
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCompute(CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
|
||||
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
size_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
size_t nframe = data_num / batch_size;
|
||||
auto g = static_cast<T>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
|
||||
std::vector<T> output_vector(data_num, 0);
|
||||
for (size_t t = 0; t < nframe; t++) {
|
||||
for (size_t m = 0; m < batch_size; m++) {
|
||||
int32_t target_idx = input_target[m];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
auto calc_target = input_x[target_idx];
|
||||
for (size_t n = 0; n < batch_size; n++) {
|
||||
if (input_istarget[n] == 0) {
|
||||
float z = 1 - calc_target + input_x[n];
|
||||
if (z > 0) {
|
||||
output_vector[t * batch_size + target_idx] -= g;
|
||||
output_vector[t * batch_size + n] += g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_x += batch_size;
|
||||
input_target += batch_size;
|
||||
input_istarget += batch_size;
|
||||
}
|
||||
auto y_grad = ctx.Input(0);
|
||||
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
|
||||
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
|
||||
if (reduction != "none" || y_grad_dims == 0) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
*(output_x_grad + i) = output_vector[i] * (*(y_grad_data));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < nframe; i++) {
|
||||
for (size_t j = 0; j < batch_size; j++) {
|
||||
*(output_x_grad + i * batch_size + j) = output_vector[i * batch_size + j] * (*(y_grad_data + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
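// Note on the FP16 overload below: it follows the same loops as the generic version above but
// accumulates output_vector and the scaling factor g in float, converting to Eigen::half only
// when writing output_x_grad, which limits rounding error from the repeated +=/-= updates.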
|
||||
|
||||
template <typename T>
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
|
||||
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
size_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
size_t nframe = data_num / batch_size;
|
||||
float g = static_cast<float>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
|
||||
std::vector<float> output_vector(data_num, 0);
|
||||
for (size_t t = 0; t < nframe; t++) {
|
||||
for (size_t m = 0; m < batch_size; m++) {
|
||||
int32_t target_idx = input_target[m];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
float calc_target = static_cast<float>(input_x[target_idx]);
|
||||
for (size_t n = 0; n < batch_size; n++) {
|
||||
if (input_istarget[n] == 0) {
|
||||
float z = 1 - calc_target + static_cast<float>(input_x[n]);
|
||||
if (z > 0) {
|
||||
output_vector[t * batch_size + target_idx] -= g;
|
||||
output_vector[t * batch_size + n] += g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_x += batch_size;
|
||||
input_target += batch_size;
|
||||
input_istarget += batch_size;
|
||||
}
|
||||
auto y_grad = ctx.Input(0);
|
||||
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
|
||||
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
|
||||
if (reduction != "none" || y_grad_dims == 0) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
*(output_x_grad + i) = static_cast<T>(output_vector[i] * static_cast<float>(*(y_grad_data)));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < nframe; i++) {
|
||||
for (size_t j = 0; j < batch_size; j++) {
|
||||
*(output_x_grad + i * batch_size + j) =
|
||||
static_cast<T>(output_vector[i * batch_size + j] * static_cast<float>(*(y_grad_data + i)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMultilabelMarginLossGrad, MultilabelMarginLossGradCpuKernel);
|
||||
} // namespace aicpu
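// Reading of the gradient computed above (illustrative): for every sample, every valid target
// class t (target >= 0) and every non-target class n with a violated margin
// 1 - x[t] + x[n] > 0, the kernel does grad[t] -= g and grad[n] += g, where
// g = 1 / total_elements when reduction == "mean" and g = 1 / class_count otherwise,
// and the result is finally scaled by the incoming y_grad (per sample when reduction is "none").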
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MultilabelMarginLossGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MultilabelMarginLossGradCpuKernel() = default;
|
||||
~MultilabelMarginLossGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
static uint32_t MultilabelMarginLossGradCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
static uint32_t MultilabelMarginLossGradCompute(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
static uint32_t MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,168 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "non_max_suppression_with_overlaps.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "cpu_attr_value.h"
|
||||
#include "cpu_tensor.h"
|
||||
#include "cpu_tensor_shape.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kNonMaxSuppressionWithOverlaps = "NonMaxSuppressionWithOverlaps";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 5;
|
||||
const uint32_t kFirstInputIndex = 0;
|
||||
const uint32_t kSecondInputIndex = 1;
|
||||
const uint32_t kThirdInputIndex = 2;
|
||||
const uint32_t kFourthInputIndex = 3;
const uint32_t kFifthInputIndex = 4;
|
||||
const uint32_t kFirstOutputIndex = 0;
|
||||
const uint32_t kOverlapsRank = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"NonMaxSuppressionWithOverlaps check input and output number failed.");
|
||||
overlaps_ = ctx.Input(kFirstInputIndex);
|
||||
scores_ = ctx.Input(kSecondInputIndex);
|
||||
Tensor *max_output_size_tensor = ctx.Input(kThirdInputIndex);
|
||||
max_output_size_ = *static_cast<int32_t *>(max_output_size_tensor->GetData());
|
||||
KERNEL_CHECK_FALSE((max_output_size_ >= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input max_output_size must be non-negative");
|
||||
overlap_threshold_tensor_ = ctx.Input(kFourthInputIndex);
|
||||
score_threshold_tensor_ = ctx.Input(kFifthInputIndex);
|
||||
output_indices_ = ctx.Output(kFirstOutputIndex);
|
||||
|
||||
std::shared_ptr<TensorShape> overlaps_shape = overlaps_->GetTensorShape();
|
||||
int32_t overlaps_rank = overlaps_shape->GetDims();
|
||||
if (overlaps_rank != kOverlapsRank || overlaps_shape->GetDimSize(0) != overlaps_shape->GetDimSize(1)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The input dim size of overlaps must be 2-D and must be square, "
|
||||
"while %d, %lld",
|
||||
overlaps_rank, overlaps_shape->GetDimSize(1));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
num_boxes_ = overlaps_shape->GetDimSize(0);
|
||||
|
||||
std::shared_ptr<TensorShape> scores_shape = scores_->GetTensorShape();
|
||||
int32_t scores_rank = scores_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((scores_rank == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input dim size of scores must be 1-D, while %d.", scores_rank);
|
||||
KERNEL_CHECK_FALSE((scores_shape->GetDimSize(0) == num_boxes_), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The len of scores must be equal to the number of boxes, "
|
||||
"while dims[%lld], num_boxes_[%d].",
|
||||
scores_shape->GetDimSize(0), num_boxes_);
|
||||
|
||||
overlaps_dtype_ = static_cast<DataType>(overlaps_->GetDataType());
|
||||
if (overlaps_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[0] overlaps must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
scores_dtype_ = static_cast<DataType>(scores_->GetDataType());
|
||||
if (scores_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[1] scores must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
overlap_threshold_dtype_ = static_cast<DataType>(overlap_threshold_tensor_->GetDataType());
|
||||
if (overlap_threshold_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[3] overlap_threshold must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
score_threshold_dtype_ = static_cast<DataType>(score_threshold_tensor_->GetDataType());
|
||||
if (score_threshold_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[4] score_threshold must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
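// Shape/type contract checked above (summary): overlaps is an [N, N] float matrix of pairwise
// overlap scores, scores is a length-N float vector, max_output_size is a non-negative int32
// scalar, and overlap_threshold / score_threshold are float scalars.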
|
||||
|
||||
template <typename T, typename T_threshold>
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::DoNonMaxSuppressionWithOverlapsOp() {
|
||||
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp start!!");
|
||||
Eigen::TensorMap<Eigen::Tensor<T, kOverlapsRank, Eigen::RowMajor>> overlaps_map(
|
||||
reinterpret_cast<T *>(overlaps_->GetData()), num_boxes_, num_boxes_);
|
||||
std::vector<T> scores_data(num_boxes_);
|
||||
std::copy_n(reinterpret_cast<T *>(scores_->GetData()), num_boxes_, scores_data.begin());
|
||||
auto overlap_threshold = static_cast<T>(*(static_cast<T_threshold *>(overlap_threshold_tensor_->GetData())));
|
||||
auto score_threshold = static_cast<T>(*(static_cast<T_threshold *>(score_threshold_tensor_->GetData())));
|
||||
std::unique_ptr<int32_t[]> indices_data(new int32_t[max_output_size_]);
|
||||
if (indices_data == nullptr) {
|
||||
KERNEL_LOG_ERROR("DoNonMaxSuppressionWithOverlapsOp: new indices_data failed");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
struct Candidate {
|
||||
int box_index;
|
||||
T score;
|
||||
int suppress_begin_index;
|
||||
};
|
||||
auto cmp = [](const Candidate &boxes_i, const Candidate &boxes_j) { return boxes_i.score < boxes_j.score; };
|
||||
std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)> candidate_priority_queue(cmp);
|
||||
for (uint32_t i = 0; i < scores_data.size(); ++i) {
|
||||
if (scores_data[i] > score_threshold) {
|
||||
candidate_priority_queue.emplace(Candidate({static_cast<int>(i), scores_data[i]}));
|
||||
}
|
||||
}
|
||||
T similarity = static_cast<T>(0.0);
|
||||
Candidate next_candidate = {.box_index = 0, .score = static_cast<T>(0.0), .suppress_begin_index = 0};
|
||||
int32_t cnt = 0;
|
||||
while (cnt < max_output_size_ && !candidate_priority_queue.empty()) {
|
||||
next_candidate = candidate_priority_queue.top();
|
||||
candidate_priority_queue.pop();
|
||||
bool should_suppress = false;
|
||||
for (int j = cnt - 1; j >= next_candidate.suppress_begin_index; --j) {
|
||||
similarity = overlaps_map(next_candidate.box_index, indices_data[j]);
|
||||
if (similarity >= overlap_threshold) {
|
||||
should_suppress = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
next_candidate.suppress_begin_index = cnt;
|
||||
if (!should_suppress) {
|
||||
indices_data[cnt] = next_candidate.box_index;
|
||||
cnt += 1;
|
||||
}
|
||||
}
|
||||
auto value = reinterpret_cast<int32_t *>(output_indices_->GetData());
|
||||
for (int j = 0; j < std::min(cnt, max_output_size_); j++) {
|
||||
*(value + j) = indices_data[j];
|
||||
}
|
||||
output_indices_->GetTensorShape()->SetDimSizes({std::min(cnt, max_output_size_)});
|
||||
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("NonMaxSuppressionWithOverlaps kernel in.");
|
||||
uint32_t res = GetInputAndCheck(ctx);
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return res;
|
||||
}
|
||||
res = DoNonMaxSuppressionWithOverlapsOp<float, float>();
|
||||
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "Compute failed.");
|
||||
KERNEL_LOG_INFO("Compute end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kNonMaxSuppressionWithOverlaps, NonMaxSuppressionWithOverlapsCpuKernel);
|
||||
} // namespace aicpu
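For reference, a minimal standalone sketch (not part of the kernel; the helper name NmsWithOverlaps and the plain-vector interface are assumptions) of the greedy selection performed by DoNonMaxSuppressionWithOverlapsOp: boxes are visited in descending score order, and a candidate is kept only if its overlap with every previously kept box stays below the threshold.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// overlaps is a row-major num_boxes x num_boxes matrix; returns the kept box indices.
std::vector<int> NmsWithOverlaps(const std::vector<float> &overlaps, const std::vector<float> &scores,
                                 int max_output_size, float overlap_threshold, float score_threshold) {
  const std::size_t num_boxes = scores.size();
  std::vector<int> order(num_boxes);
  std::iota(order.begin(), order.end(), 0);
  // Highest score first, mirroring the priority queue used by the kernel.
  std::sort(order.begin(), order.end(), [&](int a, int b) { return scores[a] > scores[b]; });
  std::vector<int> kept;
  for (int idx : order) {
    if (static_cast<int>(kept.size()) >= max_output_size) break;
    if (scores[idx] <= score_threshold) continue;
    bool suppressed = false;
    for (int k : kept) {
      if (overlaps[idx * num_boxes + k] >= overlap_threshold) {
        suppressed = true;
        break;
      }
    }
    if (!suppressed) kept.push_back(idx);
  }
  return kept;
}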
|
|
@ -0,0 +1,47 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "eigen_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NonMaxSuppressionWithOverlapsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~NonMaxSuppressionWithOverlapsCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
|
||||
template <typename T, typename T_threshold>
|
||||
uint32_t DoNonMaxSuppressionWithOverlapsOp();
|
||||
|
||||
const Tensor *overlaps_ = nullptr;
|
||||
Tensor *scores_ = nullptr;
|
||||
Tensor *overlap_threshold_tensor_ = nullptr;
|
||||
Tensor *score_threshold_tensor_ = nullptr;
|
||||
Tensor *output_indices_ = nullptr;
|
||||
int32_t num_boxes_ = 0;
|
||||
int32_t max_output_size_ = 0;
|
||||
DataType overlaps_dtype_ = DT_UINT32;
|
||||
DataType scores_dtype_ = DT_UINT32;
|
||||
DataType overlap_threshold_dtype_ = DT_UINT32;
|
||||
DataType score_threshold_dtype_ = DT_UINT32;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
|
@ -0,0 +1,138 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "nth_element.h"
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kNthElement = "NthElement";
|
||||
constexpr uint64_t kParallelDataNums = 32 * 1024;
|
||||
|
||||
#define NTHELEMENT_COMPUTE_CASE(DTYPE, TYPE, X, Y, N, LAST_DIM, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = NthElementCompute<TYPE>(X, Y, N, LAST_DIM, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("NthElement kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NthElement::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *input_n = ctx.Input(1);
|
||||
KERNEL_CHECK_FALSE((input_n->GetTensorShape()->GetDimSizes().empty()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input n must be a scalar.");
|
||||
DataType n_type = input_n->GetDataType();
|
||||
KERNEL_CHECK_FALSE((n_type == DT_INT32), KERNEL_STATUS_PARAM_INVALID, "The type of input n must be int32.");
|
||||
KERNEL_CHECK_NULLPTR(input_n->GetData(), KERNEL_STATUS_PARAM_INVALID, "NthElement Get input n failed.");
|
||||
int32_t *n_data = reinterpret_cast<int32_t *>(input_n->GetData());
|
||||
int32_t n = *n_data;
|
||||
KERNEL_CHECK_FALSE((n >= 0), KERNEL_STATUS_PARAM_INVALID, "Input n must be non-negative but is [%d].", n);
|
||||
|
||||
Tensor *x = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(x, KERNEL_STATUS_PARAM_INVALID, "NthElement Get input x failed.");
|
||||
auto x_shape = x->GetTensorShape();
|
||||
int32_t dims = x_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((dims >= 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 1 but is rank [%d]",
|
||||
dims);
|
||||
const int32_t last_dim = x_shape->GetDimSize(dims - 1);
|
||||
KERNEL_CHECK_FALSE((last_dim > n), KERNEL_STATUS_PARAM_INVALID, "Input x must have last dimension = [%d] > n = [%d]",
|
||||
last_dim, n);
|
||||
|
||||
AttrValue *reverse_attr = ctx.GetAttr("reverse");
|
||||
KERNEL_CHECK_NULLPTR(reverse_attr, KERNEL_STATUS_PARAM_INVALID, "NthElement get attr reverse failed.");
|
||||
bool reverse = reverse_attr->GetBool();
|
||||
if (reverse) {
|
||||
n = last_dim - n - 1;
|
||||
}
|
||||
|
||||
Tensor *y = ctx.Output(0);
|
||||
|
||||
auto x_type = x->GetDataType();
|
||||
switch (x_type) {
|
||||
NTHELEMENT_COMPUTE_CASE(DT_FLOAT, float, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_UINT8, uint8_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_UINT16, uint16_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT8, int8_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT16, int16_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT32, int32_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT64, int64_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_DOUBLE, double, x, y, n, last_dim, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t NthElement::NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim,
|
||||
CpuKernelContext &ctx) {
|
||||
T *x_addrs = reinterpret_cast<T *>(x->GetData());
|
||||
T *y_addrs = reinterpret_cast<T *>(y->GetData());
|
||||
|
||||
const uint64_t num_rows = y->NumElements();
|
||||
const uint64_t num = x->NumElements();
|
||||
|
||||
if (num <= kParallelDataNums) {
|
||||
std::vector<T> buf(last_dim);
|
||||
for (size_t i = 0; i < num_rows; i++) {
|
||||
const T *input_start = x_addrs + i * last_dim;
|
||||
const T *input_end = input_start + last_dim;
|
||||
std::copy(input_start, input_end, buf.begin());
|
||||
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
|
||||
y_addrs[i] = buf[n];
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > num_rows) {
|
||||
max_core_num = num_rows;
|
||||
}
|
||||
auto shard_nth_element = [&](size_t start, size_t end) {
|
||||
std::vector<T> buf(last_dim);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
const T *input_start = x_addrs + i * last_dim;
|
||||
const T *input_end = input_start + last_dim;
|
||||
std::copy(input_start, input_end, buf.begin());
|
||||
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
|
||||
y_addrs[i] = buf[n];
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, num_rows, num_rows / max_core_num, shard_nth_element),
|
||||
"NthElement Parallel Compute failed.");
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kNthElement, NthElement);
|
||||
} // namespace aicpu
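As a usage-level sketch (the helper name NthElementRows and the flattened-vector interface are assumptions, not the kernel API), this is the per-row selection that NthElementCompute performs: each row of length last_dim is copied into a scratch buffer and std::nth_element places the n-th smallest value at index n.

#include <algorithm>
#include <cstdint>
#include <vector>

// For each row of a flattened [num_rows, last_dim] buffer, write the n-th smallest
// element of that row into out[row].
void NthElementRows(const std::vector<float> &x, int64_t num_rows, int64_t last_dim, int64_t n,
                    std::vector<float> *out) {
  std::vector<float> buf(last_dim);
  out->resize(num_rows);
  for (int64_t row = 0; row < num_rows; ++row) {
    const float *begin = x.data() + row * last_dim;
    std::copy(begin, begin + last_dim, buf.begin());
    std::nth_element(buf.begin(), buf.begin() + n, buf.end());
    (*out)[row] = buf[n];
  }
}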
|
|
@ -0,0 +1,34 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NthElement : public CpuKernel {
|
||||
public:
|
||||
NthElement() = default;
|
||||
~NthElement() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,198 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file one_hot.cc
|
||||
* \brief
|
||||
*/
|
||||
#include "one_hot.h"
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kOneHot = "OneHot";
|
||||
const int64_t kParallelDataNumSameShape = 100 * 1024;
|
||||
#define ONE_HOT_INPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
switch (ODTYPE) { \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX64, std::complex<float>, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX128, std::complex<double>, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_DOUBLE, double, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT, float_t, CTX); \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT16, Eigen::half, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT8, int8_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT16, int16_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT32, int32_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT64, int64_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT8, uint8_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT16, uint16_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT32, uint32_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT64, uint64_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_BOOL, bool, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_STRING, std::string, CTX) \
|
||||
default: \
|
||||
KERNEL_LOG_ERROR("OneHot kernel output data type [%s] not support.", DTypeStr(output_data_type).c_str()); \
|
||||
return KERNEL_STATUS_PARAM_INVALID; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, OTYPE, CTX) \
|
||||
case (ODTYPE): { \
|
||||
uint32_t result = OneHotCompute<OTYPE, TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("OneHot kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t OneHotCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "OneHot check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(OneHotParamCheck(ctx), "OneHot check params failed.");
|
||||
auto input_data_type = ctx.Input(0)->GetDataType();
|
||||
auto output_data_type = ctx.Output(0)->GetDataType();
|
||||
switch (input_data_type) {
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_UINT8, uint8_t, output_data_type, ctx);
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT32, int32_t, output_data_type, ctx);
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT64, int64_t, output_data_type, ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("OneHot kernel input data type [%s] not support.", DTypeStr(input_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TI>
|
||||
uint32_t OneHotCpuKernel::OneHotCompute(CpuKernelContext &ctx) {
|
||||
// Input tensor
|
||||
Tensor *indices = ctx.Input(0);
|
||||
// Output tensor
|
||||
Tensor *output = ctx.Output(0);
|
||||
// Input tensor data
|
||||
auto indices_data = reinterpret_cast<TI *>(indices->GetData());
|
||||
// Output tensor data
|
||||
auto output_data = reinterpret_cast<T *>(output->GetData());
|
||||
// depth value
|
||||
auto depth = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
|
||||
// on_value data
|
||||
auto on_value = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
// off_value data
|
||||
auto off_value = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
// Input tensor shape
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
// axis value
|
||||
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
|
||||
if (axis == -1) {
|
||||
axis = indices_shape->GetDims();
|
||||
}
|
||||
// Output tensor shape
|
||||
auto output_shape = output->GetTensorShape();
|
||||
// Lambda that initializes the whole output tensor with off_value
|
||||
auto init_output_func = [&](int64_t start, int64_t end) -> void {
|
||||
for (int i = start; i < end; ++i) {
|
||||
*(output_data + i) = *(off_value);
|
||||
}
|
||||
};
|
||||
// Compute the product of dimension sizes before axis
|
||||
int64_t prefix_dim_size = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
prefix_dim_size *= indices_shape->GetDimSize(i);
|
||||
}
|
||||
// Compute the product of dimension sizes after axis
|
||||
int64_t suffix_dim_size = indices_shape->NumElements() / prefix_dim_size;
|
||||
// Total number of elements in the input tensor
|
||||
int64_t data_num = indices_shape->NumElements();
|
||||
// depth_value is the concrete value of depth
|
||||
int32_t depth_value = *(depth);
|
||||
// View the output tensor shape as {prefix_dim_size, depth, suffix_dim_size}.
|
||||
// Compute the position of the hot value as offset = suffix_dim_size == 1 ? (i * depth_value + depth_v)
|
||||
// : (d0 * depth_value * suffix_dim_size + depth_v * suffix_dim_size + d1), then set that position of the output to on_value.
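// Worked example (hypothetical shapes, for illustration only): indices = [[1, 3], [0, 2]] with
// shape {2, 2}, depth_value = 4 and axis = -1 give prefix_dim_size = 4 and suffix_dim_size = 1,
// so the output shape is {2, 2, 4}. For i = 1 the indices value is 3, hence
// offset = i * depth_value + depth_v = 1 * 4 + 3 = 7, and flat output position 7 (element [0][1][3])
// is set to on_value while every other position keeps off_value.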
|
||||
const auto get_output_func = [&](int64_t start, int64_t end) -> void {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
int64_t d0 = i / suffix_dim_size;
|
||||
int64_t d1 = i - (d0 * suffix_dim_size);
|
||||
int64_t depth_v = SubtleMustCopy<int64_t>(*(indices_data + d0 * suffix_dim_size + d1));
|
||||
if (depth_v < static_cast<int64_t>(depth_value) && depth_v >= 0) {
|
||||
int64_t offset = suffix_dim_size == 1 ? i * depth_value + depth_v
|
||||
: d0 * depth_value * suffix_dim_size + depth_v * suffix_dim_size + d1;
|
||||
*(output_data + offset) = *(on_value);
|
||||
}
|
||||
}
|
||||
};
|
||||
// Use the CpuKernelUtils::GetCPUNum interface to get the number of AI CPU cores
|
||||
uint32_t max_core_num = std::max(1U, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
// Multi-thread execution status
|
||||
bool run_state = true;
|
||||
// For data sizes smaller than 100K run on a single core; otherwise use the actual number of AI CPU cores
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
max_core_num = (max_core_num > data_num) ? data_num : max_core_num;
|
||||
max_core_num = max_core_num == 0 ? 1 : max_core_num;
|
||||
uint32_t ret1 = CpuKernelUtils::ParallelFor(ctx, output_shape->NumElements(),
|
||||
(output_shape->NumElements() / max_core_num), init_output_func);
|
||||
uint32_t ret2 = CpuKernelUtils::ParallelFor(ctx, data_num, (data_num / max_core_num), get_output_func);
|
||||
run_state = (ret1 == KERNEL_STATUS_OK) && (ret2 == KERNEL_STATUS_OK);
|
||||
} else {
|
||||
// Input data is smaller than 100K, run on a single core
|
||||
init_output_func(0, output_shape->NumElements());
|
||||
get_output_func(0, data_num);
|
||||
}
|
||||
return run_state ? KERNEL_STATUS_OK : KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
// Parameter validation
|
||||
uint32_t OneHotCpuKernel::OneHotParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *indices = ctx.Input(0);
|
||||
Tensor *depth = ctx.Input(1);
|
||||
Tensor *on_value = ctx.Input(2);
|
||||
Tensor *off_value = ctx.Input(3);
|
||||
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
|
||||
|
||||
DataType on_value_type = on_value->GetDataType();
|
||||
DataType off_value_type = off_value->GetDataType();
|
||||
KERNEL_CHECK_FALSE((on_value_type == off_value_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of on_value [%s] need be same with off_value [%s].",
|
||||
DTypeStr(on_value_type).c_str(), DTypeStr(off_value_type).c_str())
|
||||
auto depth_shape = depth->GetTensorShape();
|
||||
auto on_value_shape = on_value->GetTensorShape();
|
||||
auto off_value_shape = off_value->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((depth_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Depth must be a scalar, actual dim num is %d.", depth_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE((on_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"On_value must be a scalar, actual dim num is %d.", on_value_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE((off_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Off_value must be a scalar , actual dim num is %d.", off_value_shape->GetDims())
|
||||
int32_t output_dims = indices->GetTensorShape()->GetDims() + 1;
|
||||
KERNEL_CHECK_FALSE(((axis > -2 && axis < output_dims)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected axis value should between [-1, %d]. But received: %d.", output_dims - 1, axis)
|
||||
int32_t depth_value = *(reinterpret_cast<int32_t *>(ctx.Input(1)->GetData()));
|
||||
KERNEL_CHECK_FALSE((depth_value >= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Depth should be a non-negative. But received: %d.", depth_value)
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kOneHot, OneHotCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file one_hot.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
|
||||
|
||||
#include <type_traits>
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OneHotCpuKernel : public CpuKernel {
|
||||
public:
|
||||
OneHotCpuKernel() = default;
|
||||
~OneHotCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename TI>
|
||||
uint32_t OneHotCompute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t OneHotParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,228 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "orgqr.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
|
||||
using namespace Eigen;
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kOrgqr = "Orgqr";
|
||||
const double ZERO = 0.;
|
||||
const uint32_t kTWO = 2;
|
||||
constexpr int64_t kParallelDataNums = 18 * 1024;
|
||||
constexpr int64_t kParallelDataNumsMid = 32 * 1024;
|
||||
|
||||
#define ORGQR_COMPUTE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = OrgqrCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
#define ORGQR_COMPUTE_COMPLEX(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = OrgqrComputeComplex<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t OrgqrCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Orgqr check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(OrgqrCheck(ctx), "[%s] check params failed.", kOrgqr);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
ORGQR_COMPUTE(DT_FLOAT, float, ctx)
|
||||
ORGQR_COMPUTE(DT_DOUBLE, double, ctx)
|
||||
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX64, std::complex<float_t>, ctx)
|
||||
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX128, std::complex<double_t>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Orgqr kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t OrgqrCpuKernel::OrgqrCheck(CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
KERNEL_CHECK_FALSE((shape_size > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2.")
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be at least 1, but [%zu].", shape_size - kTWO,
|
||||
shape_x[shape_size - kTWO])
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be at least 1, but [%zu].", shape_size - 1,
|
||||
shape_x[shape_size - 1])
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] >= shape_x[shape_size - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be bigger than dimension [%zu], when input x has rank [%zu].",
|
||||
shape_size - kTWO, shape_size - 1, shape_size)
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_tau_size = shape_tau.size();
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] >= shape_tau[shape_tau_size - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input tau must be less than [%zu], but [%zu].", shape_tau_size - 1,
|
||||
shape_x[shape_size - 1], shape_tau[shape_tau_size - 1])
|
||||
if (shape_size > kTWO) {
|
||||
KERNEL_CHECK_FALSE((shape_x[0] == shape_tau[0]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension 0 of input tau must equal Dimension 0 of input x when input has batch")
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCpuKernel::OrgqrCompute(CpuKernelContext &ctx) {
|
||||
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
size_t m = shape_x[shape_size - kTWO];
|
||||
size_t n = shape_x[shape_size - 1];
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t p = *(shape_tau.end() - 1);
|
||||
size_t size_mn = m * n;
|
||||
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < matrix_num; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_qr = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
|
||||
"Orgqr Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCpuKernel::OrgqrComputeComplex(CpuKernelContext &ctx) {
|
||||
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
size_t m = shape_x[shape_size - kTWO];
|
||||
size_t n = shape_x[shape_size - 1];
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t p = *(shape_tau.end() - 1);
|
||||
size_t size_mn = m * n;
|
||||
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < matrix_num; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_qr = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
|
||||
"Orgqr Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kOrgqr, OrgqrCpuKernel);
|
||||
} // namespace aicpu
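A compact reference sketch (illustrative only; OrgqrReference and the float-only, single-matrix interface are assumptions) of the Householder accumulation used in OrgqrCompute: starting from the identity, each reflector v_k scaled by tau(k) is applied to the trailing columns, and the leading n columns form the orthogonal factor.

#include <Eigen/Dense>

// x holds the reflectors below the diagonal (m x n), tau holds the p scalar factors.
Eigen::MatrixXf OrgqrReference(const Eigen::MatrixXf &x, const Eigen::VectorXf &tau) {
  const Eigen::Index m = x.rows();
  const Eigen::Index n = x.cols();
  const Eigen::Index p = tau.size();
  Eigen::MatrixXf q = Eigen::MatrixXf::Identity(m, m);
  for (Eigen::Index k = 0; k < p; ++k) {
    Eigen::VectorXf v = x.block(k, k, m - k, 1);  // reflector, implicit leading 1
    v(0) = 1.0f;
    // q[:, k:] -= tau(k) * (q[:, k:] * v) * v^T
    q.rightCols(m - k) -= tau(k) * (q.rightCols(m - k) * v) * v.transpose();
  }
  return q.leftCols(n);
}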
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OrgqrCpuKernel : public CpuKernel {
|
||||
public:
|
||||
OrgqrCpuKernel() = default;
|
||||
~OrgqrCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t OrgqrCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrComputeComplex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
|
@ -0,0 +1,140 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "pack.h"
|
||||
#include <securec.h>
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "Eigen/Core"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum{1u};
|
||||
const uint32_t kInputNum{aicpu::kDynamicInput};
|
||||
const char *kPack = "Pack";
|
||||
// constexpr int64_t kParallelDataNums = 512 * 1024;
|
||||
|
||||
#define PACK_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = PackCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Pack kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t PackCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kPack);
|
||||
KERNEL_HANDLE_ERROR(PackCheck(ctx), "[%s] check params failed.", kPack);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
PACK_COMPUTE_CASE(DT_BOOL, bool, ctx)
|
||||
PACK_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
PACK_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
PACK_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
PACK_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Pack kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t PackCpuKernel::PackCheck(CpuKernelContext &ctx) {
|
||||
auto *input = ctx.Input(0);
|
||||
AttrValue *n_attr = ctx.GetAttr("N");
|
||||
AttrValue *axis_attr = ctx.GetAttr("axis");
|
||||
int64_t axis = axis_attr->GetInt();
|
||||
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1; // first_input.dims() + 1;
|
||||
if (axis < 0) axis += expanded_num_dims;
|
||||
|
||||
if (axis < 0 || axis >= expanded_num_dims) {
|
||||
KERNEL_LOG_ERROR("Pack axis error.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
int64_t input_num = n_attr->GetInt();
|
||||
auto x1_dims = input->GetTensorShape()->GetDims();
|
||||
for (int64_t i = 1; i < input_num; i++) {
|
||||
auto input_dims = ctx.Input(i)->GetTensorShape()->GetDims();
|
||||
if (x1_dims != input_dims) {
|
||||
KERNEL_LOG_ERROR("Pack input dims no equal.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PackCpuKernel::PackCompute(CpuKernelContext &ctx) {
|
||||
AttrValue *axis_attr = ctx.GetAttr("axis");
|
||||
int64_t axis = axis_attr->GetInt();
|
||||
|
||||
AttrValue *n_attr = ctx.GetAttr("N");
|
||||
int64_t input_num = n_attr->GetInt();
|
||||
|
||||
auto *input = ctx.Input(0);
|
||||
auto *output = ctx.Output(0);
|
||||
|
||||
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1;
|
||||
if (axis < 0) axis += expanded_num_dims;
|
||||
|
||||
std::vector<int64_t> temp_shape = input->GetTensorShape()->GetDimSizes();
|
||||
temp_shape.insert(temp_shape.begin() + axis, input_num);
|
||||
|
||||
auto *y = reinterpret_cast<T *>(output->GetData());
|
||||
int64_t x_NumElements = input->GetTensorShape()->NumElements();
|
||||
|
||||
if (axis == 0) {
|
||||
int64_t num = 0;
|
||||
for (int64_t j = 0; j < input_num; j++) {
|
||||
auto *input_x = reinterpret_cast<T *>(ctx.Input(j)->GetData());
|
||||
auto input_numelements = ctx.Input(j)->GetTensorShape()->NumElements();
|
||||
for (int64_t i = 0; i < input_numelements; i++) {
|
||||
*(y + num) = *(input_x + i);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int64_t num = 0;
|
||||
for (int64_t j = 0; j < x_NumElements; j++) {
|
||||
for (int64_t i = 0; i < input_num; i++) {
|
||||
auto *input_x = reinterpret_cast<T *>(ctx.Input(i)->GetData());
|
||||
*(y + num) = *(input_x + j);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kPack, PackCpuKernel);
|
||||
} // namespace aicpu
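A simplified sketch (plain vectors with equal element counts; the helper name PackFlat is an assumption) of the two copy orders in PackCompute: for axis == 0 the inputs are written one block after another, otherwise the element at each flat position is interleaved across the inputs.

#include <cstdint>
#include <vector>

std::vector<float> PackFlat(const std::vector<std::vector<float>> &inputs, bool axis_is_zero) {
  const int64_t input_num = static_cast<int64_t>(inputs.size());
  const int64_t per_input = static_cast<int64_t>(inputs[0].size());
  std::vector<float> out;
  out.reserve(input_num * per_input);
  if (axis_is_zero) {
    // Block copy: all of input 0, then all of input 1, ...
    for (const auto &in : inputs) {
      out.insert(out.end(), in.begin(), in.end());
    }
  } else {
    // Interleave: element j of every input before moving on to element j + 1.
    for (int64_t j = 0; j < per_input; ++j) {
      for (int64_t i = 0; i < input_num; ++i) {
        out.push_back(inputs[i][j]);
      }
    }
  }
  return out;
}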
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PACK_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PACK_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class PackCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~PackCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t PackCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t PackCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,379 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "parameterized_truncated_normal.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 5;
|
||||
const char *kParameterizedTruncatedNormal = "ParameterizedTruncatedNormal";
|
||||
using RNG_Engine = std::mt19937;
|
||||
static constexpr int kMaxIterations = 1000;
|
||||
|
||||
#define BATCH_SIZE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
batch_size = int64_t(GetBatchSizeCheckDims<TYPE>(CTX)); \
|
||||
break; \
|
||||
}
|
||||
|
||||
// override functions for half
|
||||
bool isinf(Eigen::half &data) { return Eigen::half_impl::isinf(data); }
|
||||
void swap(Eigen::half &data1, Eigen::half &data2) {
|
||||
Eigen::half tmp = data1;
|
||||
data1 = data2;
|
||||
data2 = tmp;
|
||||
}
|
||||
|
||||
Eigen::half exp(Eigen::half &data) { return Eigen::half_impl::exp(data); }
|
||||
Eigen::half log(Eigen::half &data) { return Eigen::half_impl::log(data); }
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T GetBatchSizeCheckDims(CpuKernelContext &ctx) {
|
||||
auto output_shape = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
|
||||
KERNEL_CHECK_FALSE((output_shape[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "The output dimension must be >= 0.")
|
||||
}
|
||||
return output_shape[0];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Generate(int64_t size, T mean, T stddev, T minval, T maxval, T **output_ptr, RNG_Engine &rng) {
|
||||
auto output = *output_ptr;
|
||||
std::normal_distribution<double> normal_dist(0, 1);
|
||||
std::uniform_real_distribution<double> uniform_dist(0, 1);
|
||||
// Vectorized intermediate calculations for uniform rejection sampling.
|
||||
const T stddev_inside_bound = T(1.3);
|
||||
|
||||
/**
|
||||
* If possible, make the one-sided bound the lower bound, or make both
|
||||
* bounds positive. Otherwise, the bounds are on either side of the
|
||||
* mean.
|
||||
*/
|
||||
if ((isinf(minval) && minval < T(0)) || maxval < mean) {
|
||||
// Reverse all calculations. norm_min and norm_max will be flipped.
|
||||
swap(minval, maxval);
|
||||
stddev = -stddev;
|
||||
}
|
||||
|
||||
auto tmp_num = (stddev == static_cast<T>(0)) ? static_cast<T>(1) : stddev;
|
||||
// Calculate normalized samples, then convert them.
|
||||
const T norm_min = (minval - mean) / tmp_num;
|
||||
const T norm_max = (maxval - mean) / tmp_num;
|
||||
int sample_num = 0;
|
||||
|
||||
// Determine the method to use.
|
||||
const T sqrt_factor = sqrt((norm_min * norm_min) + T(4));
|
||||
const T cutoff = T(2) * exp(T(0.5) + (norm_min * (norm_min - sqrt_factor)) / T(4)) / (norm_min + sqrt_factor);
|
||||
const T diff = norm_max - norm_min;
|
||||
|
||||
if (((norm_min < -stddev_inside_bound) && (norm_max >= T(0.))) ||
|
||||
((norm_max > stddev_inside_bound) && (norm_min <= T(0.)))) {
|
||||
/**
|
||||
* If the bounds are at least 3 standard deviations from the mean
|
||||
* on at least one side then we rejection sample by sampling
|
||||
* from the normal distribution and rejecting samples outside
|
||||
* the bounds.
|
||||
* Under this condition the acceptance rate per iteration should
|
||||
* always be ~ 50%. This sampler is more efficient (and more
|
||||
* numerically stable) when one or both bounds are far from the mean.
|
||||
*/
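// Illustrative numbers (not from the original source): under the branch condition above, with
// norm_min <= -stddev_inside_bound and norm_max >= 0 the acceptance probability per draw is at
// least P(-1.3 <= Z <= 0) ~= 0.40 for Z ~ N(0, 1), so on average an accepted sample is produced
// every two to three iterations and kMaxIterations is effectively never reached.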
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T normal_sample = T(normal_dist(rng));
|
||||
|
||||
if ((normal_sample >= norm_min) && (normal_sample <= norm_max)) {
|
||||
*output = normal_sample * stddev + mean;
|
||||
if (stddev <= static_cast<T>(0)) {
|
||||
*output = static_cast<T>(INFINITY);
|
||||
} else {
|
||||
output = output + 1;
|
||||
}
|
||||
sample_num++;
|
||||
break;
|
||||
} else {
|
||||
iter++;
|
||||
if (iter > kMaxIterations) {
|
||||
/**
|
||||
* This should never occur because this sampler should
|
||||
* (by the selection criteria above) be used if at least 3
|
||||
* standard deviations of one side of the distribution
|
||||
* is within the limits (so acceptance probability per
|
||||
* iterations >~ 1/2 per iteration).
|
||||
*/
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal randn rejection sampler "
|
||||
"exceeded maximum iterations");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (diff < cutoff) {
|
||||
// Sample from a uniform distribution on [norm_min, norm_max].
|
||||
const T plus_Factor = (norm_min < T(0)) ? T(0) : norm_min * norm_min;
|
||||
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T uniform_sample = T(uniform_dist(rng));
|
||||
|
||||
T z = uniform_sample * diff + norm_min;
|
||||
T g = (plus_Factor - z * z) / T(2.0);
|
||||
|
||||
bool accept = T(uniform_dist(rng)) <= exp(g);
|
||||
|
||||
if (accept || iter + 1 >= kMaxIterations) {
|
||||
if (!accept) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal uniform rejection sampler "
|
||||
"exceeded max iterations. Sample may contain outliers.");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
|
||||
*output = z * stddev + mean;
|
||||
if (stddev <= static_cast<T>(0)) {
|
||||
*output = static_cast<T>(INFINITY);
|
||||
} else {
|
||||
output = output + 1;
|
||||
}
|
||||
sample_num++;
|
||||
break;
|
||||
|
||||
} else {
|
||||
iter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/**
|
||||
* Sample from an exponential distribution with alpha maximizing
|
||||
* acceptance probability, offset by norm_min from the origin.
|
||||
* Accept only if less than norm_max.
|
||||
*/
|
||||
const T alpha = (norm_min + sqrt((norm_min * norm_min) + T(4))) / T(2);
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T uniform_sample = T(uniform_dist(rng));
|
||||
T z = -log(uniform_sample) / alpha + norm_min;
|
||||
const T x = norm_min < alpha ? alpha - z : norm_min - alpha;
|
||||
const T g = exp(-x * x / T(2.0));
|
||||
|
||||
const T u = T(uniform_dist(rng));
|
||||
|
||||
bool accept = (u <= g && z < norm_max);
|
||||
if (accept || iter + 1 >= kMaxIterations) {
|
||||
if (!accept) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal exponential distribution "
|
||||
"rejection sampler exceeds max iterations. "
|
||||
"Sample may contain outliers.");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
*output = z * stddev + mean;
|
||||
output = output + 1;
|
||||
sample_num++;
|
||||
break;
|
||||
} else {
|
||||
iter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T_shape, typename T_val>
|
||||
uint32_t BatchGenerate(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
auto output_shape = reinterpret_cast<T_shape *>(input_0->GetData());
|
||||
// check shape
|
||||
auto batch_size = output_shape[0];
|
||||
int sample_size = 1;
|
||||
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
|
||||
sample_size *= output_shape[i];
|
||||
}
|
||||
|
||||
Tensor *input_3 = ctx.Input(3);
|
||||
Tensor *input_4 = ctx.Input(4);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
Tensor *output = ctx.Output(0);
|
||||
|
||||
auto output_data = reinterpret_cast<T_val *>(output->GetData());
|
||||
auto means = reinterpret_cast<T_val *>(input_1->GetData());
|
||||
auto stdevs = reinterpret_cast<T_val *>(input_2->GetData());
|
||||
auto minvals = reinterpret_cast<T_val *>(input_3->GetData());
|
||||
auto maxvals = reinterpret_cast<T_val *>(input_4->GetData());
|
||||
|
||||
// setup seed
|
||||
int64_t final_seed = 0;
|
||||
auto attr_seed = ctx.GetAttr("seed");
|
||||
if (attr_seed != nullptr) {
|
||||
final_seed = attr_seed->GetInt();
|
||||
}
|
||||
if (final_seed == 0) {
|
||||
auto attr_seed2 = ctx.GetAttr("seed2");
|
||||
if (attr_seed2 != nullptr) {
|
||||
final_seed = attr_seed2->GetInt();
|
||||
}
|
||||
}
|
||||
|
||||
// setup random engine
|
||||
std::random_device r;
|
||||
RNG_Engine rng;
|
||||
final_seed = final_seed ? final_seed : r();
|
||||
rng.seed(final_seed);
|
||||
|
||||
vector<T_val *> params = {means, stdevs, minvals, maxvals};
|
||||
|
||||
vector<int> params_idx;
|
||||
if (input_1->NumElements() > 1) {
|
||||
params_idx.push_back(0);
|
||||
}
|
||||
if (input_2->NumElements() > 1) {
|
||||
params_idx.push_back(1);
|
||||
}
|
||||
if (input_3->NumElements() > 1) {
|
||||
params_idx.push_back(2);
|
||||
}
|
||||
if (input_4->NumElements() > 1) {
|
||||
params_idx.push_back(3);
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
auto maxval = *params[3];
|
||||
auto minval = *params[2];
|
||||
KERNEL_CHECK_FALSE((maxval > minval), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Max value must be greater than min value in each batch")
|
||||
Generate<T_val>(int64_t(sample_size), *params[0], *params[1], minval, maxval, &output_data, rng);
|
||||
for (auto i : params_idx) {
|
||||
params[i] = params[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t ParameterizedTruncatedNormalCpuKernel::ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx) {
|
||||
DataType val_datatype = ctx.Input(1)->GetDataType();
|
||||
DataType shape_datatype = ctx.Input(0)->GetDataType();
|
||||
|
||||
for (uint32_t i = 0; i < kInputNum; i++) {
|
||||
Tensor *input = ctx.Input(i);
|
||||
|
||||
// check input datatype
|
||||
DataType input_datatype = input->GetDataType();
|
||||
switch (i) {
|
||||
case 0:
|
||||
KERNEL_CHECK_FALSE((input_datatype == DT_INT32 || input_datatype == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[0] data type must DT_INT32 or DT_INT64,"
|
||||
"but got data type[%s].",
|
||||
DTypeStr(input_datatype).c_str());
|
||||
break;
|
||||
case 1:
|
||||
KERNEL_CHECK_FALSE((input_datatype == DT_FLOAT16 || input_datatype == DT_FLOAT || input_datatype == DT_DOUBLE),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[1] data type must DT_FLOAT16 or DT_FLOAT or DT_DOUBLE,"
|
||||
"but got data type[%s].",
|
||||
DTypeStr(input_datatype).c_str());
|
||||
break;
|
||||
default:
|
||||
KERNEL_CHECK_FALSE((input_datatype == val_datatype), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input[%u] [%s] need be same with input[1] [%s].", i,
|
||||
DTypeStr(input_datatype).c_str(), DTypeStr(val_datatype).c_str())
|
||||
}
|
||||
|
||||
// check input dimension
|
||||
auto input_dims = input->GetTensorShape()->GetDims();
|
||||
|
||||
int64_t batch_size = 0;
|
||||
switch (shape_datatype) {
|
||||
BATCH_SIZE_CASE(DT_INT32, int32_t, ctx)
|
||||
BATCH_SIZE_CASE(DT_INT64, int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("input0 data type [%u] not support.", shape_datatype);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((batch_size >= 0), KERNEL_STATUS_PARAM_INVALID, "The batch size must be >= 0.")
|
||||
|
||||
switch (i) {
|
||||
case 0:
|
||||
KERNEL_CHECK_FALSE((input_dims == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[0] should be rank 1, but got rank [%d].", input_dims);
|
||||
break;
|
||||
|
||||
default:
|
||||
KERNEL_CHECK_FALSE((input_dims <= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[%u] should be at most rank 1, but got rank [%d].", i, input_dims);
|
||||
if (input_dims == 1) {
|
||||
auto num_of_elems = input->NumElements();
|
||||
|
||||
KERNEL_CHECK_FALSE((num_of_elems == 1 || num_of_elems == batch_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[%u] length should be 1 or equal to the "
|
||||
"batch size, got %d.",
|
||||
i, num_of_elems);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
void ParameterizedTruncatedNormalCpuKernel::SetMap() {
|
||||
calls_[DT_INT32][DT_FLOAT16] = BatchGenerate<int32_t, Eigen::half>;
|
||||
calls_[DT_INT32][DT_FLOAT] = BatchGenerate<int32_t, float>;
|
||||
calls_[DT_INT32][DT_DOUBLE] = BatchGenerate<int32_t, double>;
|
||||
calls_[DT_INT64][DT_FLOAT16] = BatchGenerate<int64_t, Eigen::half>;
|
||||
calls_[DT_INT64][DT_FLOAT] = BatchGenerate<int64_t, float>;
|
||||
calls_[DT_INT64][DT_DOUBLE] = BatchGenerate<int64_t, double>;
|
||||
}
|
||||
|
||||
uint32_t ParameterizedTruncatedNormalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"ParameterizedTruncatedNormal check input and output number failed.");
|
||||
|
||||
KERNEL_HANDLE_ERROR(ParameterizedTruncatedNormalCheck(ctx), "ParameterizedTruncatedNormal check params failed.");
|
||||
|
||||
DataType val_datatype = ctx.Input(1)->GetDataType();
|
||||
DataType shape_datatype = ctx.Input(0)->GetDataType();
|
||||
|
||||
SetMap();
|
||||
calls_[shape_datatype][val_datatype](ctx);
|
||||
calls_.clear();
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kParameterizedTruncatedNormal, ParameterizedTruncatedNormalCpuKernel);
|
||||
} // namespace aicpu
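A minimal standalone sketch (simplified to the plain rejection path only; the function name TruncatedNormal and the double-only interface are assumptions) of the sampling strategy above: draw from N(mean, stddev) and keep samples inside [minval, maxval], capping the attempts per sample the way kMaxIterations does.

#include <cstddef>
#include <random>
#include <vector>

std::vector<double> TruncatedNormal(std::size_t size, double mean, double stddev, double minval, double maxval,
                                    std::mt19937 &rng, int max_iterations = 1000) {
  std::normal_distribution<double> normal(0.0, 1.0);
  std::vector<double> out;
  out.reserve(size);
  while (out.size() < size) {
    bool accepted = false;
    for (int iter = 0; iter < max_iterations; ++iter) {
      const double sample = normal(rng) * stddev + mean;
      if (sample >= minval && sample <= maxval) {
        out.push_back(sample);
        accepted = true;
        break;
      }
    }
    if (!accepted) break;  // give up instead of spinning forever on hopeless bounds
  }
  return out;
}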
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ParameterizedTruncatedNormalCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ParameterizedTruncatedNormalCpuKernel() = default;
|
||||
~ParameterizedTruncatedNormalCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
static uint32_t ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx);
|
||||
|
||||
// use map for 2 template parameter functions
|
||||
void SetMap();
|
||||
std::map<int, std::map<int, std::function<void(CpuKernelContext &)>>> calls_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
|
@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "pdist_grad.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const char *kPdistGrad = "PdistGrad";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
constexpr int64_t kParallelDataNumsMid = 7 * 1024;
|
||||
|
||||
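// SWITCH_PARALLEL(SHARD, end_num, divisor): runs SHARD(0, end_num) serially for
// small workloads; otherwise splits [0, end_num) across up to (CPU count - 2)
// cores (capped at 4 below the kParallelDataNums threshold) via ParallelFor.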
#define SWITCH_PARALLEL(SHARD, end_num, divisor) \
|
||||
if (end_num >= (kParallelDataNumsMid / divisor)) { \
|
||||
uint32_t min_core_num = 1; \
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); \
|
||||
if (end_num < (kParallelDataNums / divisor)) { \
|
||||
max_core_num = std::min(max_core_num, 4L); \
|
||||
} \
|
||||
if (max_core_num > end_num) { \
|
||||
max_core_num = end_num; \
|
||||
} \
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, end_num / max_core_num, SHARD), \
|
||||
"PdistGrad #SHARD Compute failed."); \
|
||||
} else { \
|
||||
SHARD(0, end_num); \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
struct Grad {
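// Helper math on T (float or Eigen::half) plus the per-norm backward rules:
// o_grad covers p = 1, t_grad covers p = 2, i_grad covers p = infinity and
// p_grad covers every other p.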
|
||||
static inline T abs(T x) { return static_cast<T>(std::abs(static_cast<float>(x))); }
|
||||
|
||||
static inline T pow(T x, float p) { return static_cast<T>(std::pow(static_cast<float>(x), p)); }
|
||||
|
||||
static inline T sign(T x) { return x > T{0.0f} ? T{1.0f} : T{-1.0f}; }
|
||||
|
||||
struct o_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) { return diff > T{0.0f} ? grad : -grad; }
|
||||
};
|
||||
|
||||
struct t_grad {
|
||||
static inline float backward(float diff, float grad, float dist, float p) {
|
||||
return dist == 0.0f ? 0.0f : grad * diff / dist;
|
||||
}
|
||||
|
||||
static inline Eigen::half backward(Eigen::half diff, Eigen::half grad, Eigen::half dist, float p) {
|
||||
return dist == Eigen::half{0.0f} ? Eigen::half{0.0f}
|
||||
: sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
|
||||
}
|
||||
};
|
||||
|
||||
struct p_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) {
|
||||
return dist == T{0.0f} ? T{0.0f} : sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
|
||||
}
|
||||
};
|
||||
|
||||
struct i_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) {
|
||||
return (diff == dist || -diff == dist) ? sign(diff) * grad : T{0.0f};
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S>
|
||||
static uint32_t ParallelForPdistGrad(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
int64_t n = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
int64_t m = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
|
||||
auto shard_pdistgrad = [&](int64_t start, int64_t end) {
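// Each shard owns a range of feature columns; for a fixed column, (i, j) walk
// every row pair and `index` tracks the matching entry of the flattened
// pairwise-distance vectors grad/dist.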
|
||||
int64_t index;
|
||||
for (int64_t col = start; col < end; col++) {
|
||||
index = 0;
|
||||
for (int64_t i = col; i < data_num; i += m) {
|
||||
for (int64_t j = i + m; j < data_num; j += m) {
|
||||
T diff = x[i] - x[j];
|
||||
if (diff == T{0.0f}) {
|
||||
index++;
|
||||
continue;
|
||||
}
|
||||
T result = S::backward(diff, grad[index], dist[index], p);
|
||||
*(y + i) += result;
|
||||
*(y + j) -= result;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard_pdistgrad, m, n);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static inline uint32_t PdistGradComputeKernel(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
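// Zero-fills the output, then dispatches on p: p = 0 leaves the gradient at
// zero, p = 1, p = 2 and p = inf use the specialised rules above, and any
// other p falls back to the general p-norm formula.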
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
T zero = T{0};
|
||||
auto shard_fill = [&](int64_t start, int64_t end) { std::fill(y + start, y + end, zero); };
|
||||
SWITCH_PARALLEL(shard_fill, data_num, 1);
|
||||
if (p == 0.0) {
|
||||
return KERNEL_STATUS_OK;
|
||||
} else if (p == 1.0) {
|
||||
return ParallelForPdistGrad<o_grad>(grad, x, dist, y, p, ctx);
|
||||
} else if (p == 2.0) {
|
||||
return ParallelForPdistGrad<t_grad>(grad, x, dist, y, p, ctx);
|
||||
} else if (std::isinf(p)) {
|
||||
return ParallelForPdistGrad<i_grad>(grad, x, dist, y, p, ctx);
|
||||
} else {
|
||||
return ParallelForPdistGrad<p_grad>(grad, x, dist, y, p, ctx);
|
||||
}
|
||||
}
|
||||
}; // Grad
|
||||
|
||||
uint32_t PdistGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PdistGrad check input and output number failed.");
|
||||
DataType input_type = ctx.Input(1)->GetDataType();
|
||||
DataType output_type = ctx.Output(0)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input_type == output_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input data type[%s] is not equal to output data type[%s].", DTypeStr(input_type).c_str(),
|
||||
DTypeStr(output_type).c_str());
|
||||
uint64_t input_size = ctx.Input(1)->GetDataSize();
|
||||
uint64_t output_size = ctx.Output(0)->GetDataSize();
|
||||
KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input data size[%llu] is not equal to output data size[%llu].", input_size, output_size);
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return PdistGradCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return PdistGradCompute<float>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("PdistGrad kernel data type [%s] not support.", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PdistGradCpuKernel::PdistGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *grad_tensor = ctx.Input(0);
|
||||
Tensor *x_tensor = ctx.Input(1);
|
||||
Tensor *pdist_tensor = ctx.Input(2);
|
||||
Tensor *y_tensor = ctx.Output(0);
|
||||
|
||||
T *grad = reinterpret_cast<T *>(grad_tensor->GetData());
|
||||
T *x = reinterpret_cast<T *>(x_tensor->GetData());
|
||||
T *pdist = reinterpret_cast<T *>(pdist_tensor->GetData());
|
||||
T *y = reinterpret_cast<T *>(y_tensor->GetData());
|
||||
|
||||
float p = 2.0;
|
||||
AttrValue *p_attr = ctx.GetAttr("p");
|
||||
if (p_attr != nullptr) {
|
||||
p = p_attr->GetFloat();
|
||||
}
|
||||
KERNEL_CHECK_FALSE((p >= 0), KERNEL_STATUS_PARAM_INVALID, "Attr[p] data cannot be less than 0.");
|
||||
|
||||
uint32_t ret = Grad<T>::PdistGradComputeKernel(grad, x, pdist, y, p, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kPdistGrad, PdistGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class PdistGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
PdistGradCpuKernel() = default;
|
||||
~PdistGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t PdistGradCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "polar.h"
|
||||
|
||||
#include "complex"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kPolar = "Polar";
|
||||
const int64_t kParallelDataNumMid = 35 * 1024;
|
||||
const int64_t kParallelDataNum = 7 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t PolarCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
DataType abs_type = ctx.Input(0)->GetDataType();
|
||||
DataType angle_type = ctx.Input(1)->GetDataType();
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Polar check input and output number failed.");
|
||||
if (abs_type == DT_FLOAT && angle_type == DT_FLOAT) {
|
||||
return PolarCompute<float>(ctx);
|
||||
} else if (abs_type == DT_DOUBLE && angle_type == DT_DOUBLE) {
|
||||
return PolarCompute<double>(ctx);
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("Polar kernel data type [%s],[%s] not support.", DTypeStr(abs_type).c_str(),
|
||||
DTypeStr(angle_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PolarCpuKernel::PolarCompute(CpuKernelContext &ctx) {
|
||||
auto abs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto angle = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
|
||||
auto input_shape = ctx.Input(0)->GetTensorShape();
|
||||
int64_t elements = input_shape->NumElements();
|
||||
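// Polar-to-Cartesian conversion: output[i] = abs[i] * (cos(angle[i]) + i*sin(angle[i])).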
auto sharder_polar = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
output[i].real(abs[i] * cos(angle[i]));
|
||||
output[i].imag(abs[i] * sin(angle[i]));
|
||||
}
|
||||
};
|
||||
if (elements > kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (elements <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > elements) {
|
||||
max_core_num = elements;
|
||||
}
|
||||
if (max_core_num > 0) {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, elements, elements / max_core_num, sharder_polar),
|
||||
"Polar Compute failed.");
|
||||
}
|
||||
} else {
|
||||
sharder_polar(0, elements);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kPolar, PolarCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file polar.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_POLAR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_POLAR_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class PolarCpuKernel : public CpuKernel {
|
||||
public:
|
||||
PolarCpuKernel() = default;
|
||||
~PolarCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static uint32_t PolarCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,203 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_range.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <type_traits>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kRaggedRange = "RaggedRange";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define RAGGEDRANGE_COMPUTE_CASE(DTYPE, TYPE, TSPLITS, NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, \
|
||||
BROADCAST_LIMITS, BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = \
|
||||
RaggedRangeCompute<TYPE, TSPLITS>(NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, BROADCAST_LIMITS, \
|
||||
BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("RaggedRange kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedRange::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "RaggedRange check params failed.");
|
||||
Tensor *starts = ctx.Input(0);
|
||||
auto starts_shape = starts->GetTensorShape();
|
||||
int32_t starts_dim = starts_shape->GetDims();
|
||||
|
||||
Tensor *limits = ctx.Input(1);
|
||||
auto limits_shape = limits->GetTensorShape();
|
||||
int32_t limits_dim = limits_shape->GetDims();
|
||||
|
||||
Tensor *deltas = ctx.Input(2);
|
||||
auto deltas_shape = deltas->GetTensorShape();
|
||||
int32_t deltas_dim = deltas_shape->GetDims();
|
||||
|
||||
KERNEL_CHECK_FALSE((starts_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "starts must be a scalar or vector.");
|
||||
KERNEL_CHECK_FALSE((limits_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "limits must be a scalar or vector.");
|
||||
KERNEL_CHECK_FALSE((deltas_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "deltas must be a scalar or vector.");
|
||||
|
||||
bool broadcast_starts = starts_dim == 0;
|
||||
bool broadcast_limits = limits_dim == 0;
|
||||
bool broadcast_deltas = deltas_dim == 0;
|
||||
|
||||
vector<int> in_sizes;
|
||||
if (!broadcast_starts) in_sizes.push_back(starts_shape->GetDimSize(0));
|
||||
if (!broadcast_limits) in_sizes.push_back(limits_shape->GetDimSize(0));
|
||||
if (!broadcast_deltas) in_sizes.push_back(deltas_shape->GetDimSize(0));
|
||||
for (uint32_t i = 1; i < in_sizes.size(); ++i) {
|
||||
KERNEL_CHECK_FALSE((in_sizes[i] == in_sizes[i - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"starts, limits, and deltas must have the same shape.");
|
||||
}
|
||||
|
||||
uint32_t nrows = in_sizes.empty() ? 1 : in_sizes[0];
|
||||
|
||||
AttrValue *attr = ctx.GetAttr("Tsplits");
|
||||
KERNEL_CHECK_NULLPTR(attr, KERNEL_STATUS_PARAM_INVALID, "Get attr[Tsplits] failed.");
|
||||
DataType Tsplits = attr->GetDataType();
|
||||
KERNEL_CHECK_FALSE((Tsplits == DT_INT32 || Tsplits == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The attr Tsplits must be int32 or int64.");
|
||||
|
||||
Tensor *rt_nested_splits = ctx.Output(0);
|
||||
Tensor *rt_dense_values = ctx.Output(1);
|
||||
|
||||
auto starts_type = starts->GetDataType();
|
||||
auto limits_type = limits->GetDataType();
|
||||
auto deltas_type = deltas->GetDataType();
|
||||
KERNEL_CHECK_FALSE((starts_type == limits_type && limits_type == deltas_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"starts, limits and deltas must have the same type.");
|
||||
|
||||
if (Tsplits == DT_INT32) {
|
||||
switch (starts_type) {
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(starts_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
switch (starts_type) {
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(starts_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
uint32_t RaggedRange::RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
|
||||
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
|
||||
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx) {
|
||||
T *starts_addr = reinterpret_cast<T *>(starts->GetData());
|
||||
T *limits_addr = reinterpret_cast<T *>(limits->GetData());
|
||||
T *deltas_addr = reinterpret_cast<T *>(deltas->GetData());
|
||||
|
||||
TSPLITS *rt_nested_splits_addr = reinterpret_cast<TSPLITS *>(rt_nested_splits->GetData());
|
||||
rt_nested_splits_addr[0] = 0;
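// rt_nested_splits is a running prefix sum: entry row + 1 holds the number of
// values generated by rows 0..row, so each row's slice of rt_dense_values is
// [splits[row], splits[row + 1]).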
|
||||
for (uint32_t row = 0; row < nrows; ++row) {
|
||||
T start = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T limit = broadcast_limits ? limits_addr[0] : limits_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
KERNEL_CHECK_FALSE((delta != 0), KERNEL_STATUS_PARAM_INVALID, "Requires delta != 0.");
|
||||
rt_nested_splits_addr[row + 1] = rt_nested_splits_addr[row] + RangeSize<T, TSPLITS>(start, limit, delta);
|
||||
}
|
||||
|
||||
T *rt_dense_values_addr = reinterpret_cast<T *>(rt_dense_values->GetData());
|
||||
if (nrows <= kParallelDataNums) {
|
||||
int value_index = 0;
|
||||
for (uint32_t row = 0; row < nrows; ++row) {
|
||||
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
|
||||
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
for (TSPLITS i = 0; i < row_size; ++i) {
|
||||
rt_dense_values_addr[value_index++] = value;
|
||||
value += delta;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > nrows) {
|
||||
max_core_num = nrows;
|
||||
}
|
||||
auto shared_rtvalues = [&](size_t start, size_t end) {
|
||||
for (size_t row = start; row < end; row++) {
|
||||
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
|
||||
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
TSPLITS y_offset = rt_nested_splits_addr[row];
|
||||
for (TSPLITS i = 0; i < row_size; ++i) {
|
||||
rt_dense_values_addr[y_offset++] = value;
|
||||
value += delta;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, nrows, nrows / max_core_num, shared_rtvalues);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
TSPLITS RaggedRange::RangeSize(T start, T limit, T delta) {
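// Number of elements in the arithmetic sequence from start towards limit with
// step delta, e.g. RangeSize(0, 7, 2) == 4 ({0, 2, 4, 6}) and
// RangeSize(5, 5, 1) == 0; the range is empty when delta points away from limit.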
|
||||
if (((delta > 0) && (limit < start)) || ((delta < 0) && (limit > start))) {
|
||||
return 0;
|
||||
}
|
||||
return (std::is_integral<T>::value ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
|
||||
: std::ceil(std::abs((limit - start) / delta)));
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRaggedRange, RaggedRange);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class RaggedRange : public CpuKernel {
|
||||
public:
|
||||
RaggedRange() = default;
|
||||
~RaggedRange() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename TSPLITS>
|
||||
uint32_t RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
|
||||
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
|
||||
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
TSPLITS RangeSize(T start, T limit, T delta);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,336 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_tensor_to_sparse.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kInputNum{aicpu::kDynamicInput};
|
||||
const std::uint32_t kOutputNum{3u};
|
||||
const char *kRaggedTensorToSparse = "RaggedTensorToSparse";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedTensorToSparseCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
|
||||
n_ = ctx.GetInputsSize() - 1;
|
||||
KERNEL_CHECK_FALSE((n_ >= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input num must great equal 1,"
|
||||
"but got input num[%u]",
|
||||
n_);
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"RaggedTensorToSparse check input and output number failed.");
|
||||
Tensor *rt_dense_values_ptr = ctx.Input(n_);
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values failed.");
|
||||
auto rt_dense_values_shape_ptr = rt_dense_values_ptr->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_shape_ptr, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input rt_dense_values shape failed.");
|
||||
DataType rt_dense_values_data_type = rt_dense_values_ptr->GetDataType();
|
||||
KERNEL_CHECK_FALSE((rt_dense_values_data_type == DT_INT32 || rt_dense_values_data_type == DT_INT64 ||
|
||||
rt_dense_values_data_type == DT_BOOL || rt_dense_values_data_type == DT_INT8 ||
|
||||
rt_dense_values_data_type == DT_UINT8 || rt_dense_values_data_type == DT_INT16 ||
|
||||
rt_dense_values_data_type == DT_UINT16 || rt_dense_values_data_type == DT_DOUBLE ||
|
||||
rt_dense_values_data_type == DT_FLOAT || rt_dense_values_data_type == DT_FLOAT16),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input rt_dense_values data type must {DT_BOOL, DT_INT8, "
|
||||
"DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, "
|
||||
"DT_DOUBLE, DT_FLOAT, DT_FLOAT16},"
|
||||
"but got data type [%s].",
|
||||
DTypeStr(rt_dense_values_data_type).c_str());
|
||||
auto rt_dense_values_data_ptr = rt_dense_values_ptr->GetData();
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values data failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// Validate `rt_nested_splits`
|
||||
template <typename T1>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits,
|
||||
const Tensor *rt_dense_values_in) {
|
||||
for (uint32_t i = 0; i < rt_nested_splits.size(); ++i) {
|
||||
if (rt_nested_splits[i].size() == 0) {
|
||||
KERNEL_LOG_ERROR("ragged splits may not be empty.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (rt_nested_splits[i](0) != 0) {
|
||||
KERNEL_LOG_ERROR("First value of ragged splits must be 0.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (uint32_t j = 1; j < rt_nested_splits[i].size(); ++j) {
|
||||
if (rt_nested_splits[i](j) < rt_nested_splits[i](j - 1)) {
|
||||
KERNEL_LOG_ERROR("Ragged splits should be non decreasing.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
if (i > 0) {
|
||||
T1 last_split = rt_nested_splits[i - 1](rt_nested_splits[i - 1].size() - 1);
|
||||
if (rt_nested_splits[i].size() != last_split + 1) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Final value of ragged splits must match the length "
|
||||
"the corresponding ragged values.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (rt_dense_values_in->GetTensorShape()->GetDimSizes()[0] !=
|
||||
rt_nested_splits.back()(rt_nested_splits.back().size() - 1)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Final value of ragged splits must match the length "
|
||||
"the corresponding ragged values.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> RaggedTensorToSparseCpuKernel::MakeIndexSuffixes(const TensorShape &values_shape) {
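// Builds the cartesian product of the dense value dimensions (every dim after
// the first), e.g. a values shape of [N, 2, 3] yields the six suffixes
// {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2}.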
|
||||
std::vector<std::vector<int64_t>> suffixes{{}};
|
||||
for (int32_t dim = 1; dim < values_shape.GetDims(); ++dim) {
|
||||
std::vector<std::vector<int64_t>> new_suffixes;
|
||||
for (const auto &suffix : suffixes) {
|
||||
for (int64_t i = 0; i < values_shape.GetDimSize(dim); ++i) {
|
||||
new_suffixes.push_back(suffix);
|
||||
new_suffixes.back().push_back(i);
|
||||
}
|
||||
}
|
||||
suffixes.swap(new_suffixes);
|
||||
}
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
bool RaggedTensorToSparseCpuKernel::IsCompleted(const std::vector<int64_t> &pos, int dim,
|
||||
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits) {
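// A position is completed in dimension `dim` once its child cursor has reached
// the end of that row, i.e. pos[dim + 1] >= rt_nested_splits[dim](pos[dim] + 1).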
|
||||
int64_t current_child = pos[dim + 1];
|
||||
int64_t limit_child = rt_nested_splits[dim](pos[dim] + 1);
|
||||
return current_child >= limit_child;
|
||||
}
|
||||
|
||||
void RaggedTensorToSparseCpuKernel::input_list(CpuKernelContext &ctx, OpInputList *list) {
|
||||
uint32_t start = 0, stop = 0;
|
||||
if (ctx.Input(0)->NumElements() > 0) {
|
||||
stop = start + static_cast<uint32_t>(ctx.Input(0)->NumElements());
|
||||
*list = OpInputList(&ctx, start, stop);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
// Assemble each value in `sparse_indices` using three parts:
|
||||
// - `index_prefix` is the index in dimensions up through the last ragged
|
||||
// dimension.
|
||||
// - `index_middle` is the index in the last ragged dimension.
|
||||
// - `index_suffix` is the index in the dense value dimensions.
|
||||
OpInputList rt_nested_splits_in;
|
||||
input_list(ctx, &rt_nested_splits_in);
|
||||
const int64_t rt_nested_splits_len = n_;
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits;
|
||||
rt_nested_splits.reserve(n_);
|
||||
for (int i = 0; i < rt_nested_splits_len; ++i) {
|
||||
if (rt_nested_splits_in[i]->NumElements() > 0) {
|
||||
EigenTensor indicesET(rt_nested_splits_in[i], rt_nested_splits_in[i]->GetData());
|
||||
|
||||
rt_nested_splits.push_back(indicesET.flat<T1>());
|
||||
}
|
||||
}
|
||||
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
KERNEL_CHECK_FALSE((ValidateInputs<T1>(rt_nested_splits, rt_dense_values_in) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "ValidateInputs failed.");
|
||||
KERNEL_CHECK_FALSE((Update<T1>(ctx, rt_nested_splits) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Update failed.");
|
||||
OutPutSparseValues<T2>(ctx);
|
||||
OutPutSparseDenseShape<T1>(ctx, rt_nested_splits_in, rt_nested_splits);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::Update(CpuKernelContext &ctx,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
const int64_t rt_nested_splits_len = n_;
|
||||
|
||||
std::vector<int64_t> index_prefix(n_);
|
||||
std::vector<std::vector<int64_t>> index_suffixes = MakeIndexSuffixes(*rt_dense_values_in->GetTensorShape());
|
||||
|
||||
// Allocate the `sparse_indices` output tensor.
|
||||
const int64_t nvals = (rt_nested_splits.back()(rt_nested_splits.back().size() - 1) * index_suffixes.size());
|
||||
const int64_t indices_len = rt_nested_splits_len + rt_dense_values_in->GetTensorShape()->GetDims();
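// Each sparse index therefore has indices_len components: the n_ entries of
// index_prefix, the offset inside the last ragged row, and one entry per dense
// value dimension beyond the first (index_suffix).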
|
||||
Tensor *sparse_indices = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR((sparse_indices), KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices failed.");
|
||||
sparse_indices->SetDataType(DT_INT64);
|
||||
auto sparse_indices_ptr = reinterpret_cast<int64_t *>(sparse_indices->GetData());
|
||||
KERNEL_CHECK_NULLPTR(sparse_indices_ptr, KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices data failed.");
|
||||
KERNEL_CHECK_NULLPTR(sparse_indices, KERNEL_STATUS_PARAM_INVALID, "Create sparse_indices Flat failed.");
|
||||
|
||||
// pos[i] is the current position in rt_nested_splits[i]. final_pos is a
|
||||
// reference to make it easier to refer to pos[-1].
|
||||
std::vector<int64_t> pos(n_);
|
||||
int64_t &final_pos = pos[n_ - 1];
|
||||
// Each iteration through the loop, we increment pos[-1], and add indices
|
||||
// for all the values corresponding to
|
||||
// rt_nested_splits[-1][pos[-1]:pos[-1]+1].
|
||||
int next_index = 0;
|
||||
int64_t num = 0;
|
||||
int max_final_pos = rt_nested_splits.back().size() - 1;
|
||||
for (; final_pos < max_final_pos; ++final_pos) {
|
||||
// Update `pos` to skip over completed elements (i.e., elements where
|
||||
// we have already generated indices for all contained values).
|
||||
for (int dim = n_ - 2; dim >= 0; --dim) {
|
||||
while (IsCompleted<T1>(pos, dim, rt_nested_splits)) {
|
||||
pos[dim] += 1;
|
||||
}
|
||||
}
|
||||
// Update index_prefix.
|
||||
for (size_t dim = 0; dim < index_prefix.size(); ++dim) {
|
||||
int start = dim > 0 ? rt_nested_splits[dim - 1](pos[dim - 1]) : 0;
|
||||
index_prefix[dim] = pos[dim] - start;
|
||||
}
|
||||
// Get length of the final-ragged-dimension slice.
|
||||
const auto &final_splits = rt_nested_splits[n_ - 1];
|
||||
int64_t slice_len = final_splits(final_pos + 1) - final_splits(final_pos);
|
||||
// Add sparse_indices for this slice.
|
||||
for (int64_t i = 0; i < slice_len; ++i) {
|
||||
for (const auto &index_suffix : index_suffixes) {
|
||||
int dim = 0;
|
||||
for (int64_t index : index_prefix) { // index_prefix
|
||||
sparse_indices_ptr[num++] = index;
|
||||
dim++;
|
||||
}
|
||||
dim++;
|
||||
sparse_indices_ptr[num++] = i;
|
||||
for (int64_t index : index_suffix) { // index_suffix
|
||||
sparse_indices_ptr[num++] = index;
|
||||
dim++;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((dim == indices_len), KERNEL_STATUS_PARAM_INVALID,
|
||||
"dim should be equal to indices_len,but get %d.", dim);
|
||||
++next_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
KERNEL_CHECK_FALSE((next_index == nvals), KERNEL_STATUS_PARAM_INVALID,
|
||||
"next_index should be equal to nvals,but get %d.", next_index);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T2>
|
||||
void RaggedTensorToSparseCpuKernel::OutPutSparseValues(CpuKernelContext &ctx) {
|
||||
// Output the `sparse_values` Tensor.
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
Tensor *spares_values_out = ctx.Output(1);
|
||||
// The element copy is the same for any rank of rt_dense_values.
spares_values_out->SetDataType(rt_dense_values_in->GetDataType());
auto spares_values_out_ptr = reinterpret_cast<T2 *>(spares_values_out->GetData());
auto rt_dense_values_in_ptr = reinterpret_cast<T2 *>(rt_dense_values_in->GetData());
for (int64_t i = 0; i < rt_dense_values_in->NumElements(); i++) {
spares_values_out_ptr[i] = rt_dense_values_in_ptr[i];
}
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
void RaggedTensorToSparseCpuKernel::OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
|
||||
// Output the `sparse_dense_shape` Tensor.
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
Tensor *sparse_dense_shape_out = ctx.Output(2);
|
||||
int64_t *sparse_dense_shape = static_cast<int64_t *>(sparse_dense_shape_out->GetData());
|
||||
sparse_dense_shape[0] = rt_nested_splits_in[0]->GetTensorShape()->GetDimSizes()[0] - 1;
|
||||
for (int dim = 0; dim < n_; ++dim) {
|
||||
const auto &splits = rt_nested_splits[dim];
|
||||
T1 max_width = 0;
|
||||
for (int i = 1; i < splits.size(); ++i) {
|
||||
max_width = std::max(max_width, splits(i) - splits(i - 1));
|
||||
}
|
||||
sparse_dense_shape[dim + 1] = max_width;
|
||||
}
|
||||
for (int dim = 1; dim < rt_dense_values_in->GetTensorShape()->GetDims(); ++dim) {
|
||||
sparse_dense_shape[dim + n_] = rt_dense_values_in->GetTensorShape()->GetDimSizes()[dim];
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t RaggedTensorToSparseCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CheckAndInitParams failed.");
|
||||
DataType type1 = ctx.Input(n_)->GetDataType();
|
||||
DataType SplitType = ctx.Input(0)->GetDataType();
|
||||
switch (SplitType) {
|
||||
case DT_INT32:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int32_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int32_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int32_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int32_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int32_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int32_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int32_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int32_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int32_t, bool>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
};
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int64_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int64_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int64_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int64_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int64_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int64_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int64_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int64_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int64_t, bool>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
};
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kRaggedTensorToSparse, RaggedTensorToSparseCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OpInputList {
|
||||
public:
|
||||
OpInputList() : ctx_(nullptr), start_(0), stop_(0) {}
|
||||
OpInputList(CpuKernelContext *ctx, uint32_t start, uint32_t stop) : ctx_(ctx), start_(start), stop_(stop) {}
|
||||
OpInputList(const OpInputList &) = default;
|
||||
OpInputList &operator=(const OpInputList &other) = default;
|
||||
Tensor *operator[](uint32_t i) const { return ctx_->Input(start_ + i); }
|
||||
uint32_t size() const { return stop_ - start_; }
|
||||
|
||||
private:
|
||||
CpuKernelContext *ctx_; // not owned
|
||||
uint32_t start_;
|
||||
uint32_t stop_;
|
||||
};
|
||||
|
||||
class RaggedTensorToSparseCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RaggedTensorToSparseCpuKernel() : type1(DT_DOUBLE), n_(1) {}
|
||||
~RaggedTensorToSparseCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
uint32_t ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits, const Tensor *rt_dense_values_in);
|
||||
|
||||
std::vector<std::vector<int64_t>> MakeIndexSuffixes(const TensorShape &values_shape);
|
||||
|
||||
template <typename T1>
|
||||
bool IsCompleted(const std::vector<int64_t> &pos, int dim,
|
||||
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits);
|
||||
|
||||
void input_list(CpuKernelContext &ctx, OpInputList *list);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
uint32_t Update(CpuKernelContext &ctx, std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
|
||||
|
||||
template <typename T2>
|
||||
void OutPutSparseValues(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
void OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
|
||||
|
||||
private:
|
||||
DataType type1;
|
||||
int64_t n_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,617 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_tensor_to_tensor.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kInputNum = 4;
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
const char *kRaggedTensorToTensor = "RaggedTensorToTensor";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedTensorToTensorCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"RaggedTensorToTensor check input and output number failed.");
|
||||
DataType type1 = ctx.Input(1)->GetDataType();
|
||||
DataType SplitType = ctx.Input(0)->GetDataType();
|
||||
switch (SplitType) {
|
||||
case DT_INT32:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int32_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int32_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int32_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int32_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int32_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int32_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int32_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int32_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int32_t, bool>(ctx);
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
};
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int64_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int64_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int64_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int64_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int64_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int64_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int64_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int64_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int64_t, bool>(ctx);
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
};
|
||||
break;
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::GetRowPartitionTypes(CpuKernelContext &ctx) {
|
||||
std::vector<std::string> partition_types;
|
||||
AttrValue *row_part = ctx.GetAttr("row_partition_types");
|
||||
int64_t N = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
row_partition_types_.reserve(N);
|
||||
partition_types.reserve(N);
|
||||
if (!row_part) {
|
||||
KERNEL_LOG_ERROR("row_partition_types error.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
partition_types = row_part->GetListString();
|
||||
const std::unordered_map<std::string, RowPartitionType> string_to_type{
{"FIRST_DIM_SIZE", RowPartitionType::FIRST_DIM_SIZE}, {"VALUE_ROWIDS", RowPartitionType::VALUE_ROWIDS},
{"ROW_LENGTHS", RowPartitionType::ROW_LENGTHS},       {"ROW_SPLITS", RowPartitionType::ROW_SPLITS},
{"ROW_LIMITS", RowPartitionType::ROW_LIMITS},         {"ROW_STARTS", RowPartitionType::ROW_STARTS}};

for (const std::string &type_str : partition_types) {
const auto iter = string_to_type.find(type_str);
if (iter == string_to_type.end()) {
KERNEL_LOG_ERROR("Unknown string for partition info type.");
return GRAPH_FAILED;
}
row_partition_types_.push_back(iter->second);
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t RaggedTensorToTensorCpuKernel::GetRaggedRank(const std::vector<RowPartitionType> &partition_types) {
|
||||
if (partition_types.empty()) {
|
||||
return 0;
|
||||
}
|
||||
if (partition_types[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
return partition_types.size() - 1;
|
||||
}
|
||||
return partition_types.size();
|
||||
}
|
||||
|
||||
RowPartitionType RaggedTensorToTensorCpuKernel::GetRowPartitionTypeByDimension(int dimension) {
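// FIRST_DIM_SIZE, when present, occupies slot 0 and only describes the
// outermost dimension, so the per-dimension partition entries are shifted by one.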
|
||||
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
return row_partition_types_[dimension + 1];
|
||||
} else {
|
||||
return row_partition_types_[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the relationship between dimension and dimension + 1.
|
||||
template <typename INDEX_TYPE>
|
||||
typename TTypes<INDEX_TYPE>::Flat RaggedTensorToTensorCpuKernel::GetRowPartitionTensor(CpuKernelContext &c,
|
||||
int64_t dimension) {
|
||||
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
Tensor *row_partition = c.Input(dimension + 1 + kFirstPartitionInputIndex);
|
||||
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
|
||||
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
|
||||
return flat_tensor;
|
||||
} else {
|
||||
Tensor *row_partition = c.Input(dimension + kFirstPartitionInputIndex);
|
||||
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
|
||||
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
|
||||
return flat_tensor;
|
||||
}
|
||||
}
|
||||
|
||||
string RaggedTensorToTensorCpuKernel::RowPartitionTypeToString(RowPartitionType row_partition_type) {
|
||||
switch (row_partition_type) {
|
||||
case RowPartitionType::FIRST_DIM_SIZE:
|
||||
return "FIRST_DIM_SIZE";
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
return "VALUE_ROWIDS";
|
||||
case RowPartitionType::ROW_LENGTHS:
|
||||
return "ROW_LENGTHS";
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
return "ROW_SPLITS";
|
||||
case RowPartitionType::ROW_LIMITS:
|
||||
return "ROW_LIMITS";
|
||||
case RowPartitionType::ROW_STARTS:
|
||||
return "ROW_STARTS";
|
||||
default:
|
||||
return "UNKNOWN ROW PARTITION TYPE";
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
|
||||
const TensorShapeProto &value_shape,
|
||||
const char *op_name) {
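// default_value must be broadcastable to a row of values: each of its known
// dimensions must either be 1 or match the corresponding value dimension
// (offset by one, since dim 0 of values is the ragged row axis).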
|
||||
if (default_value_shape.unknown_rank || value_shape.unknown_rank) {
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (default_value_shape.dims.size() > value_shape.dims.size()) {
|
||||
KERNEL_LOG_ERROR("default_value must have less dimensions than the values.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
for (size_t i = 0; i < std::min(default_value_shape.dims.size(), value_shape.dims.size() - 1); ++i) {
|
||||
if (default_value_shape.dims[i].size >= 0 && value_shape.dims[i + 1].size >= 0 &&
|
||||
default_value_shape.dims[i].size != 1 && default_value_shape.dims[i].size != value_shape.dims[i + 1].size) {
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const {
|
||||
proto->dims.clear();
|
||||
if (name == "shape") {
|
||||
if (tshape->GetTensorShape()) {
|
||||
if ((tshape->GetDataType() == DT_INT32 &&
|
||||
static_cast<int32_t *>(tshape->GetData())[0] == static_cast<int32_t>(-1)) ||
|
||||
(tshape->GetDataType() == DT_INT64 &&
|
||||
static_cast<int64_t *>(tshape->GetData())[0] == static_cast<int64_t>(-1))) {
|
||||
proto->unknown_rank = true;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
if (tshape->GetDataType() == DT_INT32) {
|
||||
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
|
||||
Dim tdim;
|
||||
proto->dims.reserve(dimsnum);
|
||||
auto dd = static_cast<int32_t *>(tshape->GetData());
|
||||
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
|
||||
tdim.size = dd[i];
|
||||
proto->dims.push_back(tdim);
|
||||
proto->unknown_rank = false;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else if (tshape->GetDataType() == DT_INT64) {
|
||||
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
|
||||
Dim tdim;
|
||||
proto->dims.reserve(dimsnum);
|
||||
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
|
||||
tdim.size = static_cast<int64_t *>(tshape->GetData())[i];
|
||||
proto->dims.push_back(tdim);
|
||||
proto->unknown_rank = false;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_LOG_ERROR("Expected an int32 or int64 shape tensor.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else {
|
||||
if (tshape->GetTensorShape()->GetUnknownRank()) {
|
||||
proto->unknown_rank = true;
|
||||
} else {
|
||||
for (int i = 0; i < tshape->GetTensorShape()->GetDims(); i++) {
|
||||
Dim dim;
|
||||
dim.size = tshape->GetTensorShape()->GetDimSizes()[i];
|
||||
proto->dims.push_back(dim);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::CombineRaggedTensorToTensorShapes(int32_t ragged_rank,
|
||||
const TensorShapeProto &shape,
|
||||
const TensorShapeProto &value_shape,
|
||||
TensorShapeProto &output_shape,
|
||||
const char *op_name) {
|
||||
if (value_shape.unknown_rank && shape.unknown_rank) {
|
||||
output_shape.dims.clear();
|
||||
output_shape.unknown_rank = true;
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (shape.unknown_rank) {
|
||||
while (output_shape.dims.size() < ragged_rank + value_shape.dims.size()) {
|
||||
Dim temp_dim;
|
||||
temp_dim.size = -1;
|
||||
output_shape.dims.emplace_back(temp_dim);
|
||||
}
|
||||
} else {
|
||||
output_shape = shape;
|
||||
}
|
||||
if (value_shape.unknown_rank) {
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (ragged_rank + value_shape.dims.size() != output_shape.dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"error:ragged_rank plus value_shape dims should be equal to output dim "
|
||||
"sizes.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < value_shape.dims.size(); ++i) {
|
||||
const Dim value_dim = value_shape.dims[i];
|
||||
Dim &output_shape_dim = output_shape.dims.at(output_shape.dims.size() - value_shape.dims.size() + i);
|
||||
if (value_dim.size >= 0) {
|
||||
if (output_shape_dim.size >= 0 && output_shape_dim.size != value_dim.size) {
|
||||
KERNEL_LOG_ERROR("Value and shape dimension are inconsistent.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
if (output_shape_dim.size < 0) {
|
||||
output_shape_dim.size = value_dim.size;
|
||||
}
|
||||
}
|
||||
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
TensorShapeProto value_shape_proto;
|
||||
Tensor *value_ptr = c.Input(kValueInputIndex);
|
||||
AsProto(value_ptr, &value_shape_proto, "value");
|
||||
TensorShapeProto default_value_shape_proto;
|
||||
Tensor *default_value_ptr = c.Input(kDefaultValueInputIndex);
|
||||
AsProto(default_value_ptr, &default_value_shape_proto, "default_value");
|
||||
TensorShapeProto output_shape_proto;
|
||||
Tensor *output_ptr = c.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
KERNEL_CHECK_FALSE(
|
||||
(ValidateDefaultValueShape(default_value_shape_proto, value_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
|
||||
KERNEL_STATUS_PARAM_INVALID, "ValidateDefaultValueShape error.");
|
||||
TensorShapeProto shape_proto;
|
||||
{
|
||||
Tensor *shape_ptr = c.Input(kShapeInputIndex);
|
||||
AsProto(shape_ptr, &shape_proto, "shape");
|
||||
}
|
||||
KERNEL_CHECK_FALSE((CombineRaggedTensorToTensorShapes(ragged_rank_, shape_proto, value_shape_proto,
|
||||
output_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
|
||||
KERNEL_STATUS_PARAM_INVALID, "CombineRaggedTensorToTensorShapes error.");
|
||||
result->reserve(output_shape_proto.dims.size());
|
||||
for (unsigned int dim = 0; dim < output_shape_proto.dims.size(); dim++) {
|
||||
// Note that this may be -1 (if dimension size is unknown).
|
||||
result->push_back(output_shape_proto.dims[dim].size);
|
||||
}
|
||||
if ((*result)[0] < 0) {
|
||||
(*result)[0] = first_dim;
|
||||
}
|
||||
for (int i = 1; i <= ragged_rank_; ++i) {
|
||||
KERNEL_CHECK_FALSE(((*result)[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "Output dims within ragged_rank must be non-negative.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* The output_index represents the index in the output tensor
|
||||
* where the first element of a particular dimension would be written.
|
||||
* If it is -1, it indicates that the index is out of scope.
|
||||
* Example, given first_dimension = 10, first_dimension_output = 6,
|
||||
* and output_index_multiplier = 100:
|
||||
* result = [0 100 200 300 400 500 -1 -1 -1 -1]
|
||||
* If first_dimension_output = 11 instead, then:
|
||||
* result = [0 100 200 300 400 500 600 700 800 900]
|
||||
*/
|
||||
template <typename INDEX_TYPE>
|
||||
vector<INDEX_TYPE> RaggedTensorToTensorCpuKernel::CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension,
|
||||
INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE first_dimension_output) {
|
||||
const INDEX_TYPE min_dimension = std::min(first_dimension, first_dimension_output);
|
||||
vector<INDEX_TYPE> result;
|
||||
result.reserve(first_dimension);
|
||||
int current_output_index = 0;
|
||||
for (INDEX_TYPE i = 0; i < min_dimension; ++i, current_output_index += output_index_multiplier) {
|
||||
result.push_back(current_output_index);
|
||||
}
|
||||
for (INDEX_TYPE i = min_dimension; i < first_dimension; ++i) {
|
||||
result.push_back(-1);
|
||||
}
|
||||
unsigned int first_dim = static_cast<unsigned int>(first_dimension);
if (result.size() < first_dim) KERNEL_LOG_ERROR("Result size should be greater than or equal to first dim.");
|
||||
return result;
|
||||
}
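// Illustrative sketch (not part of the kernel): replaying the example from the comment above with
// INDEX_TYPE = int32_t would give
//   CalculateFirstParentOutputIndex<int32_t>(10, 100, 6) == {0, 100, 200, 300, 400, 500, -1, -1, -1, -1}
// since only min(first_dimension, first_dimension_output) rows fit inside the output.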
|
||||
|
||||
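// Expands row_split boundaries into one output index per value: within a row, consecutive values
// advance by output_index_multiplier; values beyond output_size columns (or rows whose parent index
// is -1) are marked -1.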
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
INDEX_TYPE row_split_size = row_split.size();
|
||||
if (row_split_size > 0) {
|
||||
result->reserve(row_split(row_split_size - 1));
|
||||
}
|
||||
for (INDEX_TYPE i = 0; i < row_split_size - 1; ++i) {
|
||||
INDEX_TYPE row_length = row_split(i + 1) - row_split(i);
|
||||
INDEX_TYPE real_length = std::min(output_size, row_length);
|
||||
INDEX_TYPE parent_output_index_current = parent_output_index[i];
|
||||
if (parent_output_index_current == -1) {
|
||||
real_length = 0;
|
||||
}
|
||||
for (INDEX_TYPE j = 0; j < real_length; ++j) {
|
||||
result->push_back(parent_output_index_current);
|
||||
parent_output_index_current += output_index_multiplier;
|
||||
}
|
||||
for (INDEX_TYPE j = 0; j < row_length - real_length; ++j) {
|
||||
result->push_back(-1);
|
||||
}
|
||||
}
|
||||
if (row_split_size > 0) {
|
||||
unsigned int row_split_size1 = row_split(row_split_size - 1);
|
||||
KERNEL_CHECK_FALSE((result->size() >= row_split_size1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Result size should be greater equal row split size.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// Calculate the output index of the first element of a list.
|
||||
// The parent_output_index is the same computation for the previous list.
|
||||
// -1 indicates an element or list that is out of range.
|
||||
// The output_index_multiplier is the number of output indices one moves
|
||||
// forward for each column.
|
||||
// E.g., given:
|
||||
// value_rowids:[0 1 2 2 2 3 5 5 6]
|
||||
// parent_output_index:[1000 1100 2000 2100 -1 3000 4000]
|
||||
// output_index_multiplier: 10
|
||||
// output_size: 2
|
||||
// You get:
|
||||
// result = [1000 1100 2000 2010 -1 2100 -1 -1 3000]
|
||||
// result[0] = parent_output_index[value_rowids[0]]
|
||||
// result[1] = parent_output_index[value_rowids[1]]
|
||||
// result[2] = parent_output_index[value_rowids[2]]
|
||||
// result[3] = parent_output_index[value_rowids[2] + 10]
|
||||
// result[4] = -1 because it is the third element of its row and output_size is 2.
|
||||
// result[5] = parent_output_index[value_rowids[3]]
|
||||
// result[6] = -1 because parent_output_index[value_rowids[6]] == -1
|
||||
// result[7] = -1 because parent_output_index[value_rowids[6]] == -1
|
||||
// result[8] = parent_output_index[value_rowids[7]]
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexValueRowID(
|
||||
const typename TTypes<INDEX_TYPE>::Flat &value_rowids, const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size, vector<INDEX_TYPE> *result) {
|
||||
const INDEX_TYPE index_size = value_rowids.size();
|
||||
result->reserve(index_size);
|
||||
KERNEL_CHECK_FALSE((index_size != 0), KERNEL_STATUS_PARAM_INVALID, "Index size should not be zero.");
|
||||
INDEX_TYPE current_output_column = 0;
|
||||
unsigned int current_value_rowid = value_rowids(0);
|
||||
KERNEL_CHECK_FALSE((current_value_rowid < parent_output_index.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Current value rowid should be less than parent output index size.");
|
||||
INDEX_TYPE current_output_index = parent_output_index[current_value_rowid];
|
||||
result->push_back(current_output_index);
|
||||
for (INDEX_TYPE i = 1; i < index_size; ++i) {
|
||||
unsigned int next_value_rowid = value_rowids(i);
|
||||
if (next_value_rowid == current_value_rowid && current_output_index >= 0) {
|
||||
++current_output_column;
|
||||
if (current_output_column < output_size) {
|
||||
current_output_index += output_index_multiplier;
|
||||
} else {
|
||||
current_output_index = -1;
|
||||
}
|
||||
}
|
||||
if (next_value_rowid != current_value_rowid) {
|
||||
current_output_column = 0;
|
||||
current_value_rowid = next_value_rowid;
|
||||
if (next_value_rowid >= parent_output_index.size()) {
|
||||
KERNEL_LOG_ERROR("Next value rowid should be less than parent output index size.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
current_output_index = parent_output_index[next_value_rowid];
|
||||
}
|
||||
result->push_back(current_output_index);
|
||||
}
|
||||
size_t result_size = result->size();
|
||||
size_t value_rowid_size = value_rowids.size();
|
||||
KERNEL_CHECK_FALSE((result_size == value_rowid_size), KERNEL_STATUS_PARAM_INVALID, "Invalid row ids.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
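// Illustrative sketch (hypothetical values, mirroring the comment above): with value_rowids = {0, 0, 1},
// parent_output_index = {100, 200}, output_index_multiplier = 10 and output_size = 2, the result is
// {100, 110, 200}: the second value of row 0 advances by the multiplier, and row 1 restarts at its
// parent index.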
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndex(CpuKernelContext &ctx, int64_t dimension,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
const typename TTypes<INDEX_TYPE>::Flat row_partition_tensor = GetRowPartitionTensor<INDEX_TYPE>(ctx, dimension);
|
||||
auto partition_type = GetRowPartitionTypeByDimension(dimension);
|
||||
switch (partition_type) {
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
return CalculateOutputIndexValueRowID(row_partition_tensor, parent_output_index, output_index_multiplier,
|
||||
output_size, result);
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
return CalculateOutputIndexRowSplit(row_partition_tensor, parent_output_index, output_index_multiplier,
|
||||
output_size, result);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported partition type:[%s]", RowPartitionTypeToString(partition_type));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::GetFirstDimensionSize(CpuKernelContext &ctx, INDEX_TYPE *result) {
|
||||
const Tensor *first_partition_tensor = ctx.Input(kFirstPartitionInputIndex);
|
||||
const RowPartitionType first_partition_type = row_partition_types_[0];
|
||||
|
||||
switch (first_partition_type) {
|
||||
case RowPartitionType::FIRST_DIM_SIZE:
|
||||
*result = static_cast<INDEX_TYPE *>(first_partition_tensor->GetData())[0];
|
||||
return KERNEL_STATUS_OK;
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
KERNEL_LOG_ERROR("Cannot handle VALUE_ROWIDS in first dimension.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
*result = first_partition_tensor->GetTensorShape()->GetDimSizes()[0] - 1;
|
||||
return KERNEL_STATUS_OK;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Cannot handle type [%s]", RowPartitionTypeToString(first_partition_type));
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((GetRowPartitionTypes(ctx) != GRAPH_FAILED), KERNEL_STATUS_PARAM_INVALID,
|
||||
"GetRowPartitionTypes error");
|
||||
ragged_rank_ = GetRaggedRank(row_partition_types_);
|
||||
INDEX_TYPE first_dimension;
|
||||
KERNEL_CHECK_FALSE((GetFirstDimensionSize(ctx, &first_dimension) == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"GetFirstDimensionSize error.");
|
||||
vector<INDEX_TYPE> output_size;
|
||||
KERNEL_CHECK_FALSE((CalculateOutputSize(first_dimension, ctx, &output_size) == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CalculateOutputSize error.");
|
||||
|
||||
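// multiplier[i] is the row-major stride of output dimension i, i.e. the product of the sizes of all
// later output dimensions, so a flat output index advances by multiplier[i] per step along dim i.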
vector<INDEX_TYPE> multiplier;
|
||||
multiplier.resize(output_size.size());
|
||||
multiplier[multiplier.size() - 1] = 1;
|
||||
for (int i = output_size.size() - 2; i >= 0; --i) {
|
||||
multiplier[i] = multiplier[i + 1] * output_size[i + 1];
|
||||
}
|
||||
|
||||
Tensor *output_tensor = nullptr;
|
||||
output_tensor = ctx.Output(0);
|
||||
auto output_shape = output_tensor->GetTensorShape();
|
||||
auto output_shape_dims = output_shape->GetDimSizes();
|
||||
for (unsigned int i = 0; i < output_size.size(); i++) {
|
||||
output_shape_dims[i] = output_size[i];
|
||||
}
|
||||
|
||||
const INDEX_TYPE full_size = multiplier[0] * output_size[0];
|
||||
if (full_size > 0) {
|
||||
vector<INDEX_TYPE> output_index = CalculateFirstParentOutputIndex(first_dimension, multiplier[0], output_size[0]);
|
||||
for (int i = 1; i <= ragged_rank_; ++i) {
|
||||
vector<INDEX_TYPE> new_output_index;
|
||||
KERNEL_CHECK_FALSE(
|
||||
(CalculateOutputIndex(ctx, i - 1, output_index, multiplier[i], output_size[i], &new_output_index) == 0),
|
||||
KERNEL_STATUS_PARAM_INVALID, "CalculateOutputIndex error.");
|
||||
output_index = new_output_index;
|
||||
}
|
||||
return SetOutput<INDEX_TYPE, VALUE_TYPE>(ctx, output_index, output_tensor);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::SetOutput(CpuKernelContext &ctx, const vector<INDEX_TYPE> &output_index,
|
||||
Tensor *output_tensor) {
|
||||
EigenTensor outputET(output_tensor, reinterpret_cast<INDEX_TYPE *>(output_tensor->GetData()));
|
||||
typename aicpu::TTypes<VALUE_TYPE>::Flat output_flat = outputET.flat<VALUE_TYPE>();
|
||||
const auto value_tensor = ctx.Input(kValueInputIndex);
|
||||
const auto default_value_tensor = ctx.Input(kDefaultValueInputIndex);
|
||||
if (value_tensor->GetTensorShape()->GetDims() == 1) {
|
||||
// Initialize tensor to default_value.
|
||||
VALUE_TYPE *base_output = output_flat.data();
|
||||
VALUE_TYPE *default_value_pt = static_cast<VALUE_TYPE *>(default_value_tensor->GetData());
|
||||
VALUE_TYPE default_value = default_value_pt[0];
|
||||
std::fill(base_output, base_output + output_flat.size(), default_value);
|
||||
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
|
||||
auto values = valuesET.flat<VALUE_TYPE>();
|
||||
unsigned int values_size = values.size();
|
||||
KERNEL_CHECK_FALSE((values_size == output_index.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
for (unsigned int i = 0; i < values_size; ++i) {
|
||||
if (output_index[i] >= 0) {
|
||||
output_flat(output_index[i]) = values(i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto default_value_shape = default_value_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t output_element_size = 1;
|
||||
for (const int64_t &d : output_shape) {
|
||||
output_element_size *= d;
|
||||
}
|
||||
// Initialize tensor to default_value.
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
auto ret = GetBroadcastShape(default_value_shape, output_shape, broadcast_shape);
|
||||
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Broadcast failed.");
|
||||
KERNEL_CHECK_FALSE(broadcast_shape == output_shape, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Unable to broadcast shape of default_value to result.");
|
||||
BroadcastIterator iter(default_value_shape, output_shape, broadcast_shape);
|
||||
auto default_value_addr = reinterpret_cast<VALUE_TYPE *>(default_value_tensor->GetData());
|
||||
auto output_addr = reinterpret_cast<VALUE_TYPE *>(output_tensor->GetData());
|
||||
iter.SetPos(0);
|
||||
for (int i = 0; i < output_element_size; ++i) {
|
||||
output_addr[i] = default_value_addr[iter.GetInputPosA()];
|
||||
iter.GenNextPos();
|
||||
}
|
||||
VALUE_TYPE *base_output = output_flat.data();
|
||||
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
|
||||
auto values = valuesET.flat<VALUE_TYPE>();
|
||||
size_t values_size = values.size();
|
||||
size_t output_index_size = output_index.size();
|
||||
// A value "element" is a group of values that are arranged together.
|
||||
// For example, if the value shape is [3,4,5], then 20 values are in a
|
||||
// value element.
|
||||
unsigned int value_element_size;
|
||||
if (output_index_size != 0) {
|
||||
value_element_size = values_size / output_index_size;
|
||||
} else {
|
||||
KERNEL_LOG_DEBUG("Values and indices must be equal");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
unsigned int value_element_bytesize = value_element_size * sizeof(VALUE_TYPE);
|
||||
const VALUE_TYPE *values_base = values.data();
|
||||
unsigned int values_dimsize = value_tensor->GetTensorShape()->GetDimSizes()[0];
|
||||
KERNEL_CHECK_FALSE((values_dimsize == output_index_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
KERNEL_CHECK_FALSE((values_size == output_index_size * value_element_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
|
||||
INDEX_TYPE value_index = 0;
|
||||
for (unsigned int i = 0; i < output_index_size; ++i, value_index += value_element_size) {
|
||||
if (output_index[i] >= 0) {
|
||||
VALUE_TYPE *dst = base_output + output_index[i];
|
||||
const VALUE_TYPE *src = values_base + value_index;
|
||||
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, src, value_element_size, value_element_bytesize);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRaggedTensorToTensor, RaggedTensorToTensorCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,150 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "Eigen/Core"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include <unordered_map>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
namespace aicpu {
|
||||
struct DimStruct {
|
||||
int64_t size = 1;
|
||||
};
|
||||
using Dim = DimStruct;
|
||||
|
||||
struct TensorShapeProtoStruct {
|
||||
std::vector<Dim> dims;
|
||||
bool unknown_rank = false;
|
||||
};
|
||||
using TensorShapeProto = TensorShapeProtoStruct;
|
||||
|
||||
enum class RowPartitionType { FIRST_DIM_SIZE, VALUE_ROWIDS, ROW_LENGTHS, ROW_SPLITS, ROW_LIMITS, ROW_STARTS };
|
||||
const int kShapeInputIndex = 0;
|
||||
const int kValueInputIndex = 1;
|
||||
const int kDefaultValueInputIndex = 2;
|
||||
const int kFirstPartitionInputIndex = 3;
|
||||
using graphStatus = uint32_t;
|
||||
const graphStatus GRAPH_FAILED = 0xFFFFFFFF;
|
||||
const graphStatus GRAPH_SUCCESS = 0;
|
||||
|
||||
template <typename VALUE_TYPE, typename INDEX_TYPE>
|
||||
void slow_copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size) {
|
||||
for (INDEX_TYPE index = 0; index < size; ++index) {
|
||||
dst[index] = src[index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VALUE_TYPE, typename INDEX_TYPE>
|
||||
void copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size, size_t bytes) {
|
||||
memcpy(dst, src, bytes);
|
||||
}
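// The full specializations below fall back to element-wise copying: std::string is not trivially
// copyable, so memcpy would be unsafe for it, and the same slow path is kept for Eigen::half.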
|
||||
|
||||
template <>
|
||||
inline void copy_array<string, int64_t>(std::string *dst, const string *src, int64_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<string, int32_t>(string *dst, const string *src, int32_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<Eigen::half, int64_t>(Eigen::half *dst, const Eigen::half *src, int64_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<Eigen::half, int32_t>(Eigen::half *dst, const Eigen::half *src, int32_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
class RaggedTensorToTensorCpuKernel : public CpuKernel {
|
||||
public:
|
||||
graphStatus GetRowPartitionTypes(CpuKernelContext &ctx);
|
||||
int32_t GetRaggedRank(const std::vector<RowPartitionType> &partition_types);
|
||||
RowPartitionType GetRowPartitionTypeByDimension(int dimension);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
typename TTypes<INDEX_TYPE>::Flat GetRowPartitionTensor(CpuKernelContext &c, int64_t dimension);
|
||||
|
||||
string RowPartitionTypeToString(RowPartitionType row_partition_type);
|
||||
|
||||
graphStatus ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
|
||||
const TensorShapeProto &value_shape, const char *op_name);
|
||||
|
||||
graphStatus AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const;
|
||||
|
||||
graphStatus CombineRaggedTensorToTensorShapes(int32_t ragged_rank, const TensorShapeProto &shape,
|
||||
const TensorShapeProto &value_shape, TensorShapeProto &output_shape,
|
||||
const char *op_name);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c, vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
vector<INDEX_TYPE> CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension, INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE first_dimension_output);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndexValueRowID(const typename TTypes<INDEX_TYPE>::Flat &value_rowids,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndex(CpuKernelContext &context, int64_t dimension,
|
||||
const vector<INDEX_TYPE> &parent_output_index, INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE output_size, vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t GetFirstDimensionSize(CpuKernelContext &context, INDEX_TYPE *result);
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t DoCompute(CpuKernelContext &context);
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t SetOutput(CpuKernelContext &context, const vector<INDEX_TYPE> &output_index, Tensor *output_tensor);
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
std::vector<RowPartitionType> row_partition_types_;
|
||||
int ragged_rank_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reciprocal.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <complex>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kReciprocal = "Reciprocal";
|
||||
const size_t kReciprocalInputNum = 1;
|
||||
const size_t kReciprocalOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 32 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReciprocalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *y = ctx.Output(0);
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalOutputNum, kReciprocalInputNum), "Check Reciprocal params failed.");
|
||||
if (x->GetDataType() != y->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(x->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (x->GetDataSize() != y->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu]",
|
||||
x->GetDataSize(), y->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
uint64_t data_num = x->NumElements();
|
||||
DataType data_type = x->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = ReciprocalCompute<float>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = ReciprocalCompute<double>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
res = ReciprocalCompute<Eigen::half>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = ReciprocalComputeComplex<std::complex<float>>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = ReciprocalComputeComplex<std::complex<double>>(x, y, data_num, ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel data type [%s] not support", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCpuKernel::ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(x->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(y->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
if (input_x[i] == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
output_y[i] = static_cast<T>(1) / (input_x[i]);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto shared_reciprocal = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (input_x[i] == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
output_y[i] = static_cast<T>(1) / (input_x[i]);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal),
"Reciprocal Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCpuKernel::ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(x->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(y->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
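// For a complex z, 1/z = conj(z) / |z|^2, with |z|^2 built from the real and imaginary parts.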
for (size_t i = 0; i < data_num; i++) {
|
||||
output_y[i] = conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_reciprocal = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_y[i] =
|
||||
conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal),
"Reciprocal Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReciprocal, ReciprocalCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReciprocalCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~ReciprocalCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,155 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "reciprocal_grad.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <complex>
|
||||
#include <math.h>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kReciprocalGrad = "ReciprocalGrad";
|
||||
const size_t kReciprocalGradInputNum = 2;
|
||||
const size_t kReciprocalGradOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 64 * 1024;
|
||||
constexpr int64_t kParallelComplexDataNums = 16 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReciprocalGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalGradInputNum, kReciprocalGradOutputNum),
|
||||
"Check ReciprocalGrad params failed.");
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
if (y->GetDataType() != dy->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input2 [%s] need be the same as the input1 [%s]",
|
||||
DTypeStr(dy->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (y->GetDataSize() != dy->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input2 [%llu] need be the same as the input1 "
|
||||
"[%llu]",
|
||||
dy->GetDataSize(), y->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
uint64_t data_num = y->NumElements();
|
||||
DataType data_type = y->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
res = ReciprocalGradCompute<Eigen::half>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
res = ReciprocalGradCompute<float>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = ReciprocalGradCompute<double>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = ReciprocalGradComputeComplex<std::complex<float>>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = ReciprocalGradComputeComplex<std::complex<double>>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ReciprocalGrad invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCpuKernel::ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
|
||||
CpuKernelContext &ctx) {
|
||||
auto input_y = reinterpret_cast<T *>(y->GetData());
|
||||
auto input_dy = reinterpret_cast<T *>(dy->GetData());
|
||||
auto output_z = reinterpret_cast<T *>(z->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
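// With y = 1/x, the gradient is dz = dy * d(1/x)/dx = -dy * y^2.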
for (size_t i = 0; i < data_num; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad),
"ReciprocalGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCpuKernel::ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
|
||||
CpuKernelContext &ctx) {
|
||||
auto input_y = reinterpret_cast<T *>(y->GetData());
|
||||
auto input_dy = reinterpret_cast<T *>(dy->GetData());
|
||||
auto output_z = reinterpret_cast<T *>(z->GetData());
|
||||
if (data_num <= kParallelComplexDataNums) {
|
||||
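// Complex variant: the gradient is taken against the conjugate, dz = -dy * conj(y^2).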
for (size_t i = 0; i < data_num; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad),
"ReciprocalGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReciprocalGrad, ReciprocalGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReciprocalGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~ReciprocalGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,487 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reduce_mean.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "algorithm"
|
||||
#include "iostream"
|
||||
|
||||
namespace {
|
||||
const char *kReduceMean = "ReduceMean";
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceMeanCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceMeanCompute_Complex<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T ComplexDiv(T sum, int64_t num) {
|
||||
T res;
|
||||
auto real = sum.real();
|
||||
auto imag = sum.imag();
|
||||
res.real(real / num);
|
||||
res.imag(imag / num);
|
||||
return res;
|
||||
}
|
||||
|
||||
uint32_t ReduceMeanCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
uint32_t input_num = ctx.GetInputsSize();
|
||||
uint32_t output_num = ctx.GetOutputsSize();
|
||||
if (input_num != 2 || output_num != 1) {
|
||||
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
DataType axes_type = ctx.Input(1)->GetDataType();
|
||||
switch (axes_type) {
|
||||
case DT_INT32:
|
||||
switch (data_type) {
|
||||
REDUCEMEAN_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (data_type) {
|
||||
REDUCEMEAN_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
Calculate the mean of the corresponding dimension data
|
||||
Rule: except for the specified dimension, a set of data with other
|
||||
dimensions unchanged participate in the calculation of a mean.
|
||||
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
|
||||
axes : [1 , 2]
|
||||
output:[2.5, 6.5]
|
||||
2.5 is calculated from array[0][0][0], array[0][0][1],
|
||||
array[0][1][0] and array[0][1][1]
|
||||
The same group of data addresses involved in calculating the
|
||||
mean consists of one same base address and different offset addresses
|
||||
input_data_address = base_address + offset_address
|
||||
*/
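// Continuing the example above (added for clarity): output[0] = 2.5 uses base address 0 with offsets
// {0, 1, 2, 3} (elements 1, 2, 3, 4), and output[1] = 6.5 uses base address 4 with the same offsets
// (elements 5, 6, 7, 8).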
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
// Collect the sorted axes without duplicates; axes_data_num becomes the number of unique axes.
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num; i++) {
T2 value = axes_data_addr[i];
if (axes_data_norepeat.empty() || axes_data_norepeat.back() != value) {
axes_data_norepeat.push_back(value);
}
}
axes_data_num = static_cast<int64_t>(axes_data_norepeat.size());
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_sum = static_cast<T1>(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_sum += input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = data_sum / input_data_num;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> addr_stride(axes_data_num_const);
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_sum / offset_num;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceMean Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_sum / offset_num;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute_Complex(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
// Collect the sorted axes without duplicates; axes_data_num becomes the number of unique axes.
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num; i++) {
T2 value = axes_data_addr[i];
if (axes_data_norepeat.empty() || axes_data_norepeat.back() != value) {
axes_data_norepeat.push_back(value);
}
}
axes_data_num = static_cast<int64_t>(axes_data_norepeat.size());
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_sum = static_cast<T1>(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_sum += input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = ComplexDiv<T1>(data_sum, input_data_num);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> addr_stride(axes_data_num_const);
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceMean Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReduceMean, ReduceMeanCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReduceMeanCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReduceMeanCpuKernel() = default;
|
||||
~ReduceMeanCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceMeanCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceMeanCompute_Complex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,496 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reduce_prod.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "algorithm"
|
||||
#include "iostream"
|
||||
|
||||
namespace {
|
||||
const char *kReduceProd = "ReduceProd";
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceProdCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceProdCompute_Complex<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T ReduceProdCpuKernel::ComputeMul(T num_1, T num_2) {
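// Complex multiplication written out component-wise:
// (a + b*i) * (x + y*i) = (a*x - b*y) + (b*x + a*y)*i.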
|
||||
T res;
|
||||
auto a = num_1.real();
|
||||
auto b = num_1.imag();
|
||||
auto x = num_2.real();
|
||||
auto y = num_2.imag();
|
||||
auto real_res = a * x - b * y;
|
||||
auto imag_res = b * x + a * y;
|
||||
res.real(real_res);
|
||||
res.imag(imag_res);
|
||||
return res;
|
||||
}
|
||||
|
||||
uint32_t ReduceProdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
uint32_t input_num = ctx.GetInputsSize();
|
||||
uint32_t output_num = ctx.GetOutputsSize();
|
||||
if (input_num != 2 || output_num != 1) {
|
||||
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
DataType axes_type = ctx.Input(1)->GetDataType();
|
||||
switch (axes_type) {
|
||||
case DT_INT32:
|
||||
switch (data_type) {
|
||||
REDUCEPROD_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (data_type) {
|
||||
REDUCEPROD_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
Calculate the prod of the corresponding dimension data
|
||||
Rule: all elements that share the same coordinates on the unspecified
(non-axes) dimensions are multiplied together to form one output element.
|
||||
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
|
||||
axes : [1 , 2]
|
||||
output:[24, 1680]
24 is calculated from array[0][0][0], array[0][0][1],
array[0][1][0] and array[0][1][1]
|
||||
The same group of data addresses involved in calculating the
|
||||
prod consists of one same base address and different offset addresses
|
||||
input_data_address = base_address + offset_address
|
||||
*/
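/*
 A minimal worked example of the rule above (illustrative only, not part of
 the kernel): with the [2][2][2] input {1,2,3,4,5,6,7,8} and axes [1, 2],
 the row-major strides are {4, 2, 1}, the base addresses of the two groups
 are 0 and 4, and the offsets within each group are {0, 1, 2, 3}, giving
   output[0] = 1 * 2 * 3 * 4 = 24
   output[1] = 5 * 6 * 7 * 8 = 1680
*/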
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceProdCpuKernel::ReduceProdCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
std::vector<T2> axes_data_norepeat;
|
||||
for (int64_t i = 0; i < axes_data_num - 1; i++) {
|
||||
T2 value = axes_data_addr[i];
|
||||
if (value == axes_data_addr[i + 1]) {
|
||||
axes_data_num--;
|
||||
continue;
|
||||
}
|
||||
axes_data_norepeat.push_back(value);
|
||||
}
|
||||
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_prod = static_cast<T1>(1);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_prod *= input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = data_prod;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
int64_t addr_stride[axes_data_num_const];
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_prod *= input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceProd Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_prod *= input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceProdCpuKernel::ReduceProdCompute_Complex(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
std::vector<T2> axes_data_norepeat;
|
||||
for (int64_t i = 0; i < axes_data_num - 1; i++) {
|
||||
T2 value = axes_data_addr[i];
|
||||
if (value == axes_data_addr[i + 1]) {
|
||||
axes_data_num--;
|
||||
continue;
|
||||
}
|
||||
axes_data_norepeat.push_back(value);
|
||||
}
|
||||
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_prod;
|
||||
data_prod.real(1);
|
||||
data_prod.imag(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
T1 data_cur = input_data_addr[i];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[0] = data_prod;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
int64_t addr_stride[axes_data_num_const];
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceProd Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReduceProd, ReduceProdCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReduceProdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReduceProdCpuKernel() = default;
|
||||
~ReduceProdCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static T ComputeMul(T num_1, T num_2);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceProdCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceProdCompute_Complex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,107 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "relu.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kRelu = "Relu";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define RELU_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReluCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Relu kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReluCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Relu check input and output number failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
RELU_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
RELU_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
RELU_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Relu kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReluCpuKernel::DoCompute(int64_t start, int64_t end, const T *input1, T *output) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
T v = *(input1 + i);
|
||||
bool p = v > static_cast<T>(0);
|
||||
*(output + i) = p ? v : static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReluCpuKernel::ReluCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_relu = [&](int64_t start, int64_t end) { DoCompute<T>(start, end, in0, out); };
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_relu),
|
||||
"Relu Compute failed.");
|
||||
} else {
|
||||
DoCompute<T>(0, data_num, in0, out);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRelu, ReluCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RELU_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RELU_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReluCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReluCpuKernel() = default;
|
||||
~ReluCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void DoCompute(int64_t start, int64_t end, const T *input1, T *output);
|
||||
template <typename T>
|
||||
uint32_t ReluCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,186 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reversev2.h"
|
||||
#include <securec.h>
|
||||
#include "Eigen/Core"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
using namespace std;
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kReverseV2 = "ReverseV2";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReverseV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
int x_max_dim = 8;
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ReverseV2 check input and output number failed.");
|
||||
DataType axis_type = ctx.Input(1)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((axis_type == DT_INT32 || axis_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of [axis] need be DT_INT32 or DT_INT64.")
|
||||
auto x_shape = ctx.Input(0)->GetTensorShape();
|
||||
auto axis_shape = ctx.Input(1)->GetTensorShape();
|
||||
DataType data_type = DataType(ctx.Input(0)->GetDataType());
|
||||
std::vector<int64_t> reverse_shape;
|
||||
for (int i = 0; i < x_shape->GetDims(); i++) {
|
||||
reverse_shape.push_back(false);
|
||||
}
|
||||
// dims check
|
||||
if (x_shape->GetDims() == 0 || axis_shape->GetDims() == 0) {
|
||||
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((x_shape->GetDims() > 0 && x_shape->GetDims() <= x_max_dim), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Shapes of x is not support.")
|
||||
KERNEL_CHECK_FALSE((axis_shape->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID, "Shape of axis is not supported.")
|
||||
|
||||
auto input0_datasize = ctx.Input(0)->GetDataSize();
|
||||
auto output_datasize = ctx.Output(0)->GetDataSize();
|
||||
KERNEL_CHECK_FALSE((input0_datasize == output_datasize), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data size of input0 [%d] need be same with "
|
||||
"output0 [%d].",
|
||||
input0_datasize, output_datasize)
|
||||
int64_t dim = x_shape->GetDims();
|
||||
auto input_axis = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
|
||||
int64_t axis_element = axis_shape->NumElements();
|
||||
for (int j = 0; j < axis_element; j++) {
|
||||
int64_t realdim = *(input_axis + j) < 0 ? dim + *(input_axis + j) : *(input_axis + j);
|
||||
KERNEL_CHECK_FALSE((realdim >= 0 && realdim < dim), KERNEL_STATUS_PARAM_INVALID, "[%d] is invalid", realdim)
|
||||
KERNEL_CHECK_FALSE((!reverse_shape[realdim]), KERNEL_STATUS_PARAM_INVALID, "axis [%d] specified more than once.",
|
||||
realdim)
|
||||
reverse_shape[realdim] = true;
|
||||
}
|
||||
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t ReverseV2CpuKernel::ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape,
|
||||
CpuKernelContext &ctx) {
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeReverseV2<Eigen::half>(reverse_shape, ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeReverseV2<float>(reverse_shape, ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeReverseV2<double>(reverse_shape, ctx);
|
||||
case DT_UINT8:
|
||||
return ComputeReverseV2<uint8_t>(reverse_shape, ctx);
|
||||
case DT_INT8:
|
||||
return ComputeReverseV2<int8_t>(reverse_shape, ctx);
|
||||
case DT_UINT16:
|
||||
return ComputeReverseV2<uint16_t>(reverse_shape, ctx);
|
||||
case DT_INT16:
|
||||
return ComputeReverseV2<int16_t>(reverse_shape, ctx);
|
||||
case DT_INT32:
|
||||
return ComputeReverseV2<int32_t>(reverse_shape, ctx);
|
||||
case DT_INT64:
|
||||
return ComputeReverseV2<int64_t>(reverse_shape, ctx);
|
||||
case DT_BOOL:
|
||||
return ComputeReverseV2<bool>(reverse_shape, ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeReverseV2<std::complex<float>>(reverse_shape, ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeReverseV2<std::complex<double>>(reverse_shape, ctx);
|
||||
case DT_STRING:
|
||||
return ComputeReverseV2<string>(reverse_shape, ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ReverseV2 invalid input type[%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReverseV2CpuKernel::ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx) {
|
||||
auto x_shape = ctx.Input(0)->GetTensorShape();
|
||||
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
if (x_shape->GetDims() == 0) {
|
||||
*(output_data) = *(input_data);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
auto axis_shape = ctx.Input(1)->GetTensorShape();
|
||||
if (axis_shape->GetDims() == 0) {
|
||||
for (int i = 0; i < x_shape->NumElements(); i++) {
|
||||
*(output_data + i) = *(input_data + i);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
int64_t front = 1;
|
||||
int64_t shape_element = x_shape->NumElements();
|
||||
int64_t dim = x_shape->GetDims();
|
||||
std::vector<int64_t> dims = x_shape->GetDimSizes();
|
||||
bool redo = false;
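// For each axis flagged in reverse_shape, contiguous blocks of row_size
// elements are copied from the input back-to-front into the output; when a
// later axis also needs reversing, the partial result is first copied back
// into input_data (redo == true) so the passes compose.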
|
||||
for (int j = 0; j < dim; j++) {
|
||||
front = front * dims[j];
|
||||
if (j != dim - 1 && reverse_shape[j] == true) {
|
||||
if (redo == true) {
|
||||
auto copy_size = shape_element * sizeof(T);
|
||||
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
|
||||
}
|
||||
int64_t row_size = shape_element / front;
|
||||
int64_t input_forward = (dims[j] - 1) * row_size;
|
||||
int64_t save = input_forward;
|
||||
int64_t output_forward = 0;
|
||||
int64_t behind = shape_element / (front / dims[j]);
|
||||
for (int k = 0; k < front / dims[j]; k++) {
|
||||
int64_t remain = dims[j];
|
||||
while (remain > 0) {
|
||||
auto copy_size = row_size * sizeof(T);
|
||||
auto cur_output = output_data + output_forward;
|
||||
auto cur_input = input_data + input_forward;
|
||||
auto ret_mem = memcpy_s(cur_output, copy_size, cur_input, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy size[%zu] from input to output failed.",
|
||||
copy_size);
|
||||
input_forward = input_forward - row_size;
|
||||
output_forward = output_forward + row_size;
|
||||
remain--;
|
||||
}
|
||||
save = save + behind;
|
||||
input_forward = save;
|
||||
}
|
||||
redo = true;
|
||||
} else if (j == dim - 1 && reverse_shape[j] == true) {
|
||||
if (redo == true) {
|
||||
auto copy_size = shape_element * sizeof(T);
|
||||
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
|
||||
}
|
||||
int64_t output_forward = 0;
|
||||
for (int k = 0; k < shape_element / dims[j]; k++) {
|
||||
for (int i = dims[j] - 1; i >= 0; i--) {
|
||||
*(output_data + output_forward) = *(input_data + i + k * dims[j]);
|
||||
output_forward++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kReverseV2, ReverseV2CpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReverseV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReverseV2CpuKernel() = default;
|
||||
~ReverseV2CpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,161 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "rgb_to_hsv.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr size_t kInputShapeRank = 3;
|
||||
constexpr size_t kOutputShapeRank = 3;
|
||||
constexpr int64_t kImageChannels = 3;
|
||||
const char *kInputStr = "input";
|
||||
const char *kOutputStr = "output";
|
||||
const char *kRGBToHSV = "RGBToHSV";
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
const std::map<std::string, RGBToHSVCpuKernel::KernelFunction> RGBToHSVCpuKernel::kernels_ = {
|
||||
{"(DT_FLOAT16,DT_FLOAT16)", &RGBToHSVCpuKernel::DoCompute<Eigen::half, Eigen::half>},
|
||||
{"(DT_FLOAT,DT_FLOAT)", &RGBToHSVCpuKernel::DoCompute<float, float>},
|
||||
{"(DT_DOUBLE,DT_DOUBLE)", &RGBToHSVCpuKernel::DoCompute<double, double>}};
|
||||
|
||||
const std::vector<std::string> RGBToHSVCpuKernel::kernels_name_ = {"(DT_FLOAT16,DT_FLOAT16)", "(DT_FLOAT,DT_FLOAT)",
|
||||
"(DT_DOUBLE,DT_DOUBLE)"};
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t RGBToHSVCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_tensor = ctx.Input(0);
|
||||
Tensor *output_tensor = ctx.Output(0);
|
||||
auto input_shape = input_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input_tensor->NumElements();
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input_data = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto out = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
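// The loop below applies the usual per-pixel RGB -> HSV mapping:
// V = max(R, G, B); S = (V - min(R, G, B)) / V (0 when V <= 0);
// H is chosen by which channel equals V and wrapped into [0, 1).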
|
||||
|
||||
for (int64_t i = 0; i < input0_elements_nums; i = i + 3) {
|
||||
auto t_red = *(input_data + i);
|
||||
auto t_green = *(input_data + i + 1);
|
||||
auto t_blue = *(input_data + i + 2);
|
||||
auto t_value = std::max(std::max(t_red, t_blue), t_green);
|
||||
auto t_minimum = std::min(std::min(t_red, t_blue), t_green);
|
||||
auto range = t_value - t_minimum;
|
||||
auto t_saturation = t_value > static_cast<T1>(0) ? (range / t_value) : static_cast<T1>(0);
|
||||
auto norm = static_cast<T1>(1.0) / static_cast<T1>(6.0) / range;
|
||||
auto t_hue = t_green == t_value ? (norm * (t_blue - t_red) + static_cast<T1>(2.0) / static_cast<T1>(6.0))
|
||||
: (norm * (t_red - t_green) + static_cast<T1>(4.0) / static_cast<T1>(6.0));
|
||||
t_hue = t_red == t_value ? (norm * (t_green - t_blue)) : t_hue;
|
||||
t_hue = range > static_cast<T1>(0) ? t_hue : static_cast<T1>(0);
|
||||
t_hue = t_hue < static_cast<T1>(0) ? (t_hue + static_cast<T1>(1)) : t_hue;
|
||||
*(out + i) = t_hue;
|
||||
*(out + i + 1) = t_saturation;
|
||||
*(out + i + 2) = t_value;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index,
|
||||
size_t rank) {
|
||||
Tensor *param = nullptr;
|
||||
if (in_or_out == kInputStr) {
|
||||
param = ctx.Input(index);
|
||||
} else if (in_or_out == kOutputStr) {
|
||||
param = ctx.Output(index);
|
||||
}
|
||||
std::string err_header = ConcatString(kRGBToHSV, " op ", in_or_out, "[", index, "]");
|
||||
|
||||
KERNEL_CHECK_NULLPTR(param, KERNEL_STATUS_PARAM_INVALID, "%s tensor is nullptr.", err_header.c_str());
|
||||
|
||||
auto param_shape = param->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(param_shape, KERNEL_STATUS_PARAM_INVALID, "%s tensor shape is nullptr.", err_header.c_str());
|
||||
auto param_dim_sizes = param_shape->GetDimSizes();
|
||||
if (param_dim_sizes.size() < 1) {
|
||||
KERNEL_LOG_ERROR("%s shape rank must be at least 1, but got shape[%zu].", err_header.c_str(),
|
||||
VectorToString(param_dim_sizes).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (param->GetData() == nullptr) {
|
||||
KERNEL_CHECK_NULLPTR(param, KERNEL_STATUS_PARAM_INVALID, "%s tensor data is nullptr.", err_header.c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckShapes(CpuKernelContext &ctx) {
|
||||
auto input0_shape = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes();
|
||||
if (input0_shape.back() != kImageChannels) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"%s op input[0] shape last dim should be [%d], but got "
|
||||
"shape[%s].",
|
||||
kRGBToHSV, kImageChannels, VectorToString(input0_shape).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckParams(CpuKernelContext &ctx) {
|
||||
auto ret = CheckParam(ctx, kInputStr, kFirstInputIndex, kInputShapeRank);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = CheckShapes(ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
auto input0 = ctx.Input(kFirstInputIndex);
|
||||
KERNEL_CHECK_NULLPTR(input0, KERNEL_STATUS_PARAM_INVALID, "%s input[0] tensor is nullptr.", kRGBToHSV);
|
||||
DataType input0_data_type = input0->GetDataType();
|
||||
KERNEL_LOG_DEBUG("%s op input[0] data type is [%s].", kRGBToHSV, DTypeStr(input0_data_type).c_str());
|
||||
|
||||
auto output = ctx.Output(kFirstOutputIndex);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "%s output[0] tensor is nullptr.", kRGBToHSV);
|
||||
DataType output_data_type = output->GetDataType();
|
||||
KERNEL_LOG_DEBUG("%s op output[0] data type is [%s].", kRGBToHSV, DTypeStr(output_data_type).c_str());
|
||||
|
||||
std::string kernel_name = ConcatString("(", DTypeStr(input0_data_type), ",", DTypeStr(output_data_type), ")");
|
||||
|
||||
auto it = kernels_.find(kernel_name);
|
||||
if (it != kernels_.end()) {
|
||||
auto ret = CheckParams(ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
auto kernel = it->second;
|
||||
ret = kernel(ctx);
|
||||
KERNEL_LOG_DEBUG("%s op end.", kRGBToHSV);
|
||||
return ret;
|
||||
}
|
||||
|
||||
KERNEL_LOG_ERROR("%s op only support data type [%s], but got [%s].", kRGBToHSV, VectorToString(kernels_name_).c_str(),
|
||||
kernel_name.c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRGBToHSV, RGBToHSVCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class RGBToHSVCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RGBToHSVCpuKernel() = default;
|
||||
|
||||
~RGBToHSVCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename TInput, typename TOutput>
|
||||
static uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t CheckParams(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index, size_t rank);
|
||||
|
||||
uint32_t CheckShapes(CpuKernelContext &ctx);
|
||||
|
||||
private:
|
||||
using KernelFunction = uint32_t (*)(CpuKernelContext &ctx);
|
||||
static const std::map<std::string, KernelFunction> kernels_;
|
||||
static const std::vector<std::string> kernels_name_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,163 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "rsqrt_grad.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
#include "utils/eigen_tensor.h"
|
||||
|
||||
namespace {
|
||||
const char *kRsqrtGrad = "RsqrtGrad";
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
constexpr uint32_t kInputNum = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RsqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_0 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *input_1 = ctx.Input(kSecondInputIndex);
|
||||
if ((input_0->GetDataSize() == 0) || (input_1->GetDataSize() == 0)) {
|
||||
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
// choose compute function depend on dataType
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return RsqrtGradComputeFP16<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return RsqrtGradCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return RsqrtGradCompute<double>(ctx);
|
||||
case DT_INT8:
|
||||
return RsqrtGradCompute<int8_t>(ctx);
|
||||
case DT_INT32:
|
||||
return RsqrtGradCompute<int32_t>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return RsqrtGradComputeComplex<std::complex<double>>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return RsqrtGradComputeComplex<std::complex<float>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
aicpu::DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
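// All of the element-wise paths below implement the same backward rule for
// y = rsqrt(x): dy/dx = -y^3 / 2, so each output element is computed as
// dy * y^3 / (-2), with conj(y) used in the complex variant.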
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeFP16(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] =
|
||||
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
|
||||
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = (T)(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] =
|
||||
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
|
||||
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
|
||||
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
KERNEL_LOG_INFO(
|
||||
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
|
||||
"data size is [%llu].",
|
||||
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = (T)(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeComplex(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
|
||||
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
KERNEL_LOG_INFO(
|
||||
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
|
||||
"data size is [%llu].",
|
||||
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = static_cast<T>(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRsqrtGrad, RsqrtGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
|
||||
#define EIGEN_USE_THREADS
|
||||
#define EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include <Eigen/Dense>
|
||||
|
||||
namespace aicpu {
|
||||
class RsqrtGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RsqrtGradCpuKernel() = default;
|
||||
~RsqrtGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradComputeComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradComputeFP16(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif  // AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
|
|
@@ -0,0 +1,421 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sample_distorted_bounding_box_ext2.h"
|
||||
|
||||
#include <random>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 3;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kSDBBExt2 = "SampleDistortedBoundingBoxExt2";
|
||||
|
||||
#define SDBBExt2CpuKernel_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SDBBExt2Compute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint64_t SDBBExt2CpuKernel::New64() {
|
||||
std::random_device device("/dev/urandom");
|
||||
static std::mt19937_64 rng = std::mt19937_64(device());
|
||||
return (rng)();
|
||||
}
|
||||
|
||||
void SDBBExt2CpuKernel::InitPhiloxRandom(int64_t seed, int64_t seed2) {
|
||||
if (seed == 0 && seed2 == 0) {
|
||||
seed = New64();
|
||||
seed2 = New64();
|
||||
}
|
||||
generator_ = PhiloxRandom(seed, seed2);
|
||||
}
|
||||
|
||||
float SDBBExt2CpuKernel::RandFloat() {
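// Pair 23 random mantissa bits with exponent 127 to build a float in [1, 2), then subtract 1.0f to get a uniform value in [0, 1).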
|
||||
uint32_t x = GenerateSingle();
|
||||
const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
|
||||
const uint32_t exp = static_cast<uint32_t>(127);
|
||||
const uint32_t val = (exp << 23) | man;
|
||||
|
||||
float result;
|
||||
memcpy(&result, &val, sizeof(val));
|
||||
return result - 1.0f;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::Uniform(uint32_t n) {
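// Unbiased draw from [0, n): powers of two are masked directly; other values reject samples below the remainder bound before taking the modulus.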
|
||||
if (n == 0) {
|
||||
return GenerateSingle() * n;
|
||||
} else if (0 == (n & (n - 1))) {
|
||||
return GenerateSingle() & (n - 1);
|
||||
} else {
|
||||
const uint32_t range = ~static_cast<uint32_t>(0);
|
||||
const uint32_t rem = (range % n) + 1;
|
||||
uint32_t rnd;
|
||||
do {
|
||||
rnd = GenerateSingle();
|
||||
} while (rnd < rem);
|
||||
return rnd % n;
|
||||
}
|
||||
}
|
||||
|
||||
SDBBExt2CpuKernel::ResultElementType SDBBExt2CpuKernel::GenerateSingle() {
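// Philox yields kResultElementCount values per invocation; cache one block and hand the elements out one at a time.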
|
||||
if (used_result_index_ == PhiloxRandom::kResultElementCount) {
|
||||
unused_results_ = generator_();
|
||||
used_result_index_ = 0;
|
||||
}
|
||||
return unused_results_[used_result_index_++];
|
||||
}
|
||||
|
||||
bool SDBBExt2CpuKernel::SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
|
||||
const std::vector<Rectangle> &bounding_boxes) {
|
||||
const float kMinArea = 1.0;
|
||||
if (crop.Area() < kMinArea) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_object_covered = false;
|
||||
for (const auto &bbox : bounding_boxes) {
|
||||
const float object_area = bbox.Area();
|
||||
if (object_area < kMinArea) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (object_area == 0) {
|
||||
continue;
|
||||
}
|
||||
const float object_covered = crop.Intersect(bbox).Area() / object_area;
|
||||
if (object_covered >= minimum_object_covered) {
|
||||
is_object_covered = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return is_object_covered;
|
||||
}
|
||||
|
||||
bool SDBBExt2CpuKernel::GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
|
||||
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect) {
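// Sample a crop height whose area lies in [min, max] * image_area at the requested aspect ratio, derive the width,
// nudge the pair back inside the area bounds, then place the crop at a random offset within the image.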
|
||||
if (max_relative_crop_area <= 0.0 || aspect_ratio <= 0.0 || original_width <= 0 || original_height <= 0 ||
|
||||
min_relative_crop_area > max_relative_crop_area) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const float min_area = min_relative_crop_area * original_width * original_height;
|
||||
const float max_area = max_relative_crop_area * original_width * original_height;
|
||||
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
int height = static_cast<int>(lrintf(std::sqrt(min_area / aspect_ratio)));
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
int max_height = static_cast<int>(lrintf(std::sqrt(max_area / aspect_ratio)));
|
||||
if (lrintf(max_height * aspect_ratio) > original_width) {
|
||||
const float kEps = 0.0000001;
|
||||
const float kBias = 0.5;
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
max_height = static_cast<int>((original_width + kBias - kEps) / aspect_ratio);
|
||||
if (lrintf(max_height * aspect_ratio) > original_width) {
|
||||
max_height -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_height > original_height) {
|
||||
max_height = original_height;
|
||||
}
|
||||
|
||||
if (height >= max_height) {
|
||||
height = max_height;
|
||||
}
|
||||
|
||||
if (height < max_height) {
|
||||
height += Uniform(max_height - height + 1);
|
||||
}
|
||||
int width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
float area = static_cast<float>(width * height);
|
||||
if (area < min_area) {
|
||||
height += 1;
|
||||
width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
area = width * height;
|
||||
}
|
||||
|
||||
if (area > max_area) {
|
||||
height -= 1;
|
||||
width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
area = width * height;
|
||||
}
|
||||
|
||||
if (area < min_area || area > max_area || width > original_width || height > original_height || width <= 0 ||
|
||||
height <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int y = 0;
|
||||
if (height < original_height) {
|
||||
y = Uniform(original_height - height);
|
||||
}
|
||||
int x = 0;
|
||||
if (width < original_width) {
|
||||
x = Uniform(original_width - width);
|
||||
}
|
||||
|
||||
crop_rect->min_x_ = x;
|
||||
crop_rect->min_y_ = y;
|
||||
crop_rect->max_x_ = x + width;
|
||||
crop_rect->max_y_ = y + height;
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SampleDistortedBoundingBoxExt2 check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(SDBBExt2Check(ctx), "SampleDistortedBoundingBoxExt2 check params or bcast failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SDBBExt2CpuKernel_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::SDBBExt2Check(CpuKernelContext &ctx) {
|
||||
auto image_size = ctx.Input(0);
|
||||
auto bounding_boxes = ctx.Input(1);
|
||||
auto min_object_covered = ctx.Input(2);
|
||||
auto begin = ctx.Output(0);
|
||||
auto size = ctx.Output(1);
|
||||
auto bboxes = ctx.Output(2);
|
||||
KERNEL_CHECK_NULLPTR(image_size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(bounding_boxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(min_object_covered->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(begin->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed")
|
||||
KERNEL_CHECK_NULLPTR(size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 1 data failed")
|
||||
KERNEL_CHECK_NULLPTR(bboxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 2 data failed")
|
||||
|
||||
auto attr_seed = ctx.GetAttr("seed");
|
||||
KERNEL_CHECK_NULLPTR(attr_seed, KERNEL_STATUS_PARAM_INVALID, "Get seed attr failed.")
|
||||
seed = attr_seed->GetInt();
|
||||
|
||||
auto attr_seed2 = ctx.GetAttr("seed2");
|
||||
KERNEL_CHECK_NULLPTR(attr_seed2, KERNEL_STATUS_PARAM_INVALID, "Get seed2 attr failed.")
|
||||
seed2 = attr_seed2->GetInt();
|
||||
|
||||
auto attr_aspect_ratio_range = ctx.GetAttr("aspect_ratio_range");
|
||||
KERNEL_CHECK_NULLPTR(attr_aspect_ratio_range, KERNEL_STATUS_PARAM_INVALID, "Get aspect_ratio_range attr failed.")
|
||||
aspect_ratio_range = attr_aspect_ratio_range->GetListFloat();
|
||||
|
||||
auto attr_area_range = ctx.GetAttr("area_range");
|
||||
KERNEL_CHECK_NULLPTR(attr_area_range, KERNEL_STATUS_PARAM_INVALID, "Get area_range attr failed.")
|
||||
area_range = attr_area_range->GetListFloat();
|
||||
|
||||
auto attr_max_attempts = ctx.GetAttr("max_attempts");
|
||||
KERNEL_CHECK_NULLPTR(attr_max_attempts, KERNEL_STATUS_PARAM_INVALID, "Get max_attempts attr failed.")
|
||||
max_attempts = attr_max_attempts->GetInt();
|
||||
|
||||
auto attr_use_image_if_no_bounding_boxes = ctx.GetAttr("use_image_if_no_bounding_boxes");
|
||||
KERNEL_CHECK_NULLPTR(attr_use_image_if_no_bounding_boxes, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get use_image_if_no_bounding_boxes attr failed.")
|
||||
use_image_if_no_bounding_boxes = attr_use_image_if_no_bounding_boxes->GetBool();
|
||||
|
||||
KERNEL_CHECK_NULLPTR(image_size->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input image_size shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(bounding_boxes->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input bounding_boxes shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(min_object_covered->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input min_object_covered shape failed.")
|
||||
|
||||
std::vector<int64_t> shape_image_size = image_size->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_bounding_boxes = bounding_boxes->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_CHECK_FALSE((shape_image_size.size() == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"image_size must be 1-dimensional, got: [%d].", shape_image_size.size())
|
||||
const int image_size_num = 3;
|
||||
KERNEL_CHECK_FALSE((shape_image_size.at(0) == image_size_num), KERNEL_STATUS_PARAM_INVALID,
"image_size must contain 3 elements, got: [%d].", shape_image_size.at(0))
|
||||
|
||||
const int shape_bounding_boxes_size = 3;
|
||||
KERNEL_CHECK_FALSE((shape_bounding_boxes.size() == shape_bounding_boxes_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"input boxes must be 3-dimensional [batch, num_boxes, "
|
||||
"coords], got: [%d].",
|
||||
shape_bounding_boxes.size())
|
||||
const int bounding_boxes_size = 4;
|
||||
KERNEL_CHECK_FALSE((shape_bounding_boxes.at(shape_bounding_boxes.size() - 1) == bounding_boxes_size),
|
||||
KERNEL_STATUS_PARAM_INVALID, "bounding boxes must have shape [4], got: [%d].",
|
||||
shape_bounding_boxes.at(shape_bounding_boxes.size() - 1))
|
||||
|
||||
const int aspect_ratio_range_size = 2;
|
||||
KERNEL_CHECK_FALSE((aspect_ratio_range.size() == aspect_ratio_range_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Aspect ratio range field must specify 2 dimensions.")
|
||||
KERNEL_CHECK_FALSE((aspect_ratio_range[0] > 0 && aspect_ratio_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Aspect ratio range must be positive: [%f], [%f].", aspect_ratio_range[0], aspect_ratio_range[1])
|
||||
|
||||
const int area_range_size = 2;
|
||||
KERNEL_CHECK_FALSE((area_range.size() == area_range_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range field must specify 2 dimensions.")
|
||||
KERNEL_CHECK_FALSE((area_range[0] > 0 && area_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range must be positive: [%f], [%f].", area_range[0], area_range[1])
|
||||
KERNEL_CHECK_FALSE((area_range[0] <= 1 && area_range[1] <= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range must be less then or equal to 1.0: [%f], [%f].", area_range[0], area_range[1])
|
||||
|
||||
KERNEL_CHECK_FALSE((max_attempts > 0), KERNEL_STATUS_PARAM_INVALID, "Max attempts must be positive: [%d]",
|
||||
max_attempts)
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SDBBExt2CpuKernel::SDBBExt2Compute(CpuKernelContext &ctx) {
|
||||
auto image_size = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto bounding_boxes = reinterpret_cast<float *>(ctx.Input(1)->GetData());
|
||||
auto min_object_covered = reinterpret_cast<float *>(ctx.Input(2)->GetData());
|
||||
auto begin = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto size = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto bboxes = reinterpret_cast<float *>(ctx.Output(2)->GetData());
|
||||
|
||||
const int32_t height = static_cast<int32_t>(image_size[0]);
|
||||
const int32_t width = static_cast<int32_t>(image_size[1]);
|
||||
if (!(height > 0 && width > 0)) {
|
||||
KERNEL_LOG_ERROR("Image height and width must be positive, got: [%d] and [%d]", height, width);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
float min_object_covered_val = *min_object_covered;
|
||||
if (min_object_covered_val < 0.0 || min_object_covered_val > 1.0) {
|
||||
KERNEL_LOG_ERROR("min_object_covered must be in [0.0, 1.0], got: [%f]", min_object_covered_val);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
const int index_y_min = 0;
|
||||
const int index_x_min = 1;
|
||||
const int index_y_max = 2;
|
||||
const int index_x_max = 3;
|
||||
const int kBBoxSize = 4;
|
||||
std::vector<Rectangle> boxes;
|
||||
int64_t size_bounding_boxes = ctx.Input(1)->NumElements();
|
||||
if (size_bounding_boxes > 0) {
|
||||
for (int b = 0; b < size_bounding_boxes / kBBoxSize; ++b) {
|
||||
if (!(bounding_boxes[b * kBBoxSize + index_x_min] < bounding_boxes[b * kBBoxSize + index_x_max])) {
|
||||
KERNEL_LOG_ERROR("x_min must be less than x_max, got: [%f] and [%f]",
|
||||
bounding_boxes[b * kBBoxSize + index_x_min], bounding_boxes[b * kBBoxSize + index_x_max]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
if (!(bounding_boxes[b * kBBoxSize + index_y_min] < bounding_boxes[b * kBBoxSize + index_y_max])) {
|
||||
KERNEL_LOG_ERROR("y_min must be less than y_max, got: [%f] and [%f]",
|
||||
bounding_boxes[b * kBBoxSize + index_y_min], bounding_boxes[b * kBBoxSize + index_y_max]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
for (int i = 0; i < kBBoxSize; ++i) {
|
||||
if (bounding_boxes[b * kBBoxSize + i] < 0.0 || bounding_boxes[b * kBBoxSize + i] > 1.0) {
|
||||
KERNEL_LOG_ERROR("All bounding box coordinates must be in [0.0, 1.0], got: [%f]",
|
||||
bounding_boxes[b * kBBoxSize + i]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
const int32_t x_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_min] * width);
|
||||
const int32_t y_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_min] * height);
|
||||
const int32_t x_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_max] * width);
|
||||
const int32_t y_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_max] * height);
|
||||
boxes.push_back(Rectangle(x_min, y_min, x_max, y_max));
|
||||
}
|
||||
}
|
||||
|
||||
const Rectangle image_rect(0, 0, width, height);
|
||||
if (boxes.empty()) {
|
||||
if (!use_image_if_no_bounding_boxes) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"No bounding boxes provided as input. One must "
|
||||
"enable use_image_if_no_bounding_boxes if you wish "
|
||||
"to not provide any bounding boxes.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
boxes.push_back(image_rect);
|
||||
}
|
||||
|
||||
const float min_sample_area = area_range[0];
|
||||
const float max_sample_area = area_range[1];
|
||||
const float min_sample_aspect_ratio = aspect_ratio_range[0];
|
||||
const float max_sample_aspect_ratio = aspect_ratio_range[1];
|
||||
|
||||
InitPhiloxRandom(seed, seed2);
|
||||
|
||||
Rectangle crop_rect;
|
||||
bool sample_generated = false;
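// Try up to max_attempts random crops and keep the first one that also meets the minimum object coverage; otherwise fall back to the whole image.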
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
const float sample_aspect_ratio =
|
||||
RandFloat() * (max_sample_aspect_ratio - min_sample_aspect_ratio) + min_sample_aspect_ratio;
|
||||
if (GenerateRandomCrop(width, height, min_sample_area, max_sample_area, sample_aspect_ratio, &crop_rect)) {
|
||||
if (SatisfiesOverlapConstraints(crop_rect, min_object_covered_val, boxes)) {
|
||||
sample_generated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!sample_generated) {
|
||||
crop_rect = image_rect;
|
||||
}
|
||||
|
||||
// Determine the cropping parameters from the bounding box.
|
||||
const int target_width = crop_rect.max_x_ - crop_rect.min_x_;
|
||||
const int target_height = crop_rect.max_y_ - crop_rect.min_y_;
|
||||
const int offset_width = crop_rect.min_x_;
|
||||
const int offset_height = crop_rect.min_y_;
|
||||
|
||||
if (width < target_width + offset_width) {
|
||||
KERNEL_LOG_ERROR("width must be >= target_width + offset_width: [%d] vs [%d] + [%d]", width, target_width,
|
||||
offset_width);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
if (height < target_height + offset_height) {
|
||||
KERNEL_LOG_ERROR("height must be >= target_height + offset_height: [%d] vs [%d] + [%d]", height, target_height,
|
||||
offset_height);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
begin[0] = static_cast<T>(offset_height);
|
||||
size[0] = static_cast<T>(target_height);
|
||||
begin[1] = static_cast<T>(offset_width);
|
||||
size[1] = static_cast<T>(target_width);
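// Also report the sampled window as normalized [y_min, x_min, y_max, x_max] coordinates relative to the image.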
|
||||
|
||||
bboxes[index_y_min] = static_cast<float>(crop_rect.min_y_) / static_cast<float>(height);
|
||||
bboxes[index_x_min] = static_cast<float>(crop_rect.min_x_) / static_cast<float>(width);
|
||||
bboxes[index_y_max] = static_cast<float>(crop_rect.max_y_) / static_cast<float>(height);
|
||||
bboxes[index_x_max] = static_cast<float>(crop_rect.max_x_) / static_cast<float>(width);
|
||||
|
||||
// Retain all of the channels.
|
||||
const int32_t begin_channels = 3;
|
||||
const int32_t size_channels = 3;
|
||||
begin[begin_channels - 1] = static_cast<T>(0);
|
||||
size[size_channels - 1] = static_cast<T>(-1);
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSDBBExt2, SDBBExt2CpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,101 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/philox_random.h"
|
||||
|
||||
class Rectangle {
|
||||
public:
|
||||
Rectangle() { Set(0, 0, 0, 0); }
|
||||
Rectangle(int xmin, int ymin, int xmax, int ymax) { Set(xmin, ymin, xmax, ymax); }
|
||||
|
||||
void Set(int xmin, int ymin, int xmax, int ymax) {
|
||||
min_x_ = xmin;
|
||||
min_y_ = ymin;
|
||||
max_x_ = xmax;
|
||||
max_y_ = ymax;
|
||||
}
|
||||
|
||||
bool IsEmpty() const { return min_x_ > max_x_ || min_y_ > max_y_; }
|
||||
float Area() const { return static_cast<float>((max_x_ - min_x_) * (max_y_ - min_y_)); }
|
||||
|
||||
Rectangle Intersect(const Rectangle &r) const {
|
||||
const int pmin_x = std::max(min_x_, r.min_x_);
|
||||
const int pmin_y = std::max(min_y_, r.min_y_);
|
||||
const int pmax_x = std::min(max_x_, r.max_x_);
|
||||
const int pmax_y = std::min(max_y_, r.max_y_);
|
||||
if (pmin_x > pmax_x || pmin_y > pmax_y) {
|
||||
return Rectangle();
|
||||
} else {
|
||||
return Rectangle(pmin_x, pmin_y, pmax_x, pmax_y);
|
||||
}
|
||||
}
|
||||
|
||||
int min_x_;
|
||||
int min_y_;
|
||||
int max_x_;
|
||||
int max_y_;
|
||||
};
|
||||
|
||||
namespace aicpu {
|
||||
class SDBBExt2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SDBBExt2CpuKernel() = default;
|
||||
~SDBBExt2CpuKernel() override = default;
|
||||
|
||||
static const int kResultTypeNum = 4;
|
||||
static const int kKeyNum = 2;
|
||||
using ResultType = Array<uint32_t, kResultTypeNum>;
|
||||
using ResultElementType = uint32_t;
|
||||
using Key = Array<uint32_t, kKeyNum>;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
int seed;
|
||||
int seed2;
|
||||
std::vector<float> aspect_ratio_range;
|
||||
std::vector<float> area_range;
|
||||
int max_attempts;
|
||||
bool use_image_if_no_bounding_boxes;
|
||||
|
||||
PhiloxRandom generator_;
|
||||
|
||||
float RandFloat();
|
||||
uint32_t Uniform(uint32_t n);
|
||||
|
||||
uint64_t New64();
|
||||
void InitPhiloxRandom(int64_t seed, int64_t seed2);
|
||||
ResultType unused_results_;
|
||||
int used_result_index_ = PhiloxRandom::kResultElementCount;
|
||||
ResultElementType GenerateSingle();
|
||||
|
||||
// Image
|
||||
bool SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
|
||||
const std::vector<Rectangle> &bounding_boxes);
|
||||
bool GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
|
||||
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect);
|
||||
|
||||
uint32_t SDBBExt2Check(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SDBBExt2Compute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,196 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "scatter_nd.h"
|
||||
|
||||
#include <complex>
|
||||
|
||||
#include "eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kScatterNd = "ScatterNd";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ScatterNdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNd Input and Output failed.");
|
||||
|
||||
Tensor *input_indices = ctx.Input(0);
|
||||
Tensor *input_x = ctx.Input(1);
|
||||
Tensor *input_shape = ctx.Input(2);
|
||||
|
||||
auto shape_x = input_x->GetTensorShape();
|
||||
auto shape_indices = input_indices->GetTensorShape();
|
||||
auto shape_shape = input_shape->GetTensorShape();
|
||||
int64_t indices_shape_m = shape_indices->GetDimSize(shape_indices->GetDims() - 1);
|
||||
|
||||
if (shape_x->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_x's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_indices->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_shape's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices_shape_m > shape_shape->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_shape&input_indices ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < shape_indices->GetDims() - 1; i++) {
|
||||
if (shape_indices->GetDimSize(i) != shape_x->GetDimSize(i)) {
|
||||
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
auto data_type_x = input_x->GetDataType();
|
||||
auto data_type_indices = input_indices->GetDataType();
|
||||
auto data_type_shape = input_shape->GetDataType();
|
||||
if (data_type_shape != DT_INT32 && data_type_shape != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_shape).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type_indices != data_type_shape) {
|
||||
KERNEL_LOG_ERROR("Indices and shape must have the same type.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (data_type_x) {
|
||||
case DT_INT8:
|
||||
return DTYPE_CHOOSE<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DTYPE_CHOOSE<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DTYPE_CHOOSE<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DTYPE_CHOOSE<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DTYPE_CHOOSE<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DTYPE_CHOOSE<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DTYPE_CHOOSE<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DTYPE_CHOOSE<uint64_t>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DTYPE_CHOOSE<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DTYPE_CHOOSE<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DTYPE_CHOOSE<double>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DTYPE_CHOOSE<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DTYPE_CHOOSE<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_x).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename data_type_x>
|
||||
uint32_t ScatterNdCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
|
||||
auto indices_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
|
||||
switch (indices_type) {
|
||||
case DT_INT32:
|
||||
return ScatterNdComputeRealKernel<int32_t, data_type_x>(ctx);
|
||||
case DT_INT64:
|
||||
return ScatterNdComputeRealKernel<int64_t, data_type_x>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(indices_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indices_type, typename data_type_x>
|
||||
uint32_t ScatterNdCpuKernel::ScatterNdComputeRealKernel(CpuKernelContext &ctx) {
|
||||
int64_t n_slices = 1;
|
||||
int64_t slice_size = 1;
|
||||
|
||||
const int64_t outer_dims = ctx.Input(0)->GetTensorShape()->GetDims() - 1;
|
||||
const int64_t indices_nd = ctx.Input(0)->GetTensorShape()->GetDimSize(outer_dims);
|
||||
const int64_t updates_dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
|
||||
auto shape_indices = ctx.Input(0)->GetTensorShape();
|
||||
auto data_shape = reinterpret_cast<indices_type *>(ctx.Input(2)->GetData());
|
||||
auto dims_shape = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
auto updates_shape = ctx.Input(1)->GetTensorShape();
|
||||
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
|
||||
if (updates_shape->GetDimSize(i + shape_indices->GetDims() - 1) != data_shape[i + indices_nd]) {
|
||||
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < outer_dims; ++i) {
|
||||
n_slices *= ctx.Input(0)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int64_t i = outer_dims; i < updates_dims; ++i) {
|
||||
slice_size *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
const int kNumberInputTwo = 2;
|
||||
int64_t output_flat_size = 1;
|
||||
int64_t num_shape = ctx.Input(kNumberInputTwo)->NumElements();
|
||||
for (int64_t i = 0; i < num_shape; i++) {
|
||||
output_flat_size *= data_shape[i];
|
||||
}
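// dims_to_count[j] is the flat stride of output dimension j: an N-d index maps to a flat offset via sum(idx[j] * dims_to_count[j]).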
|
||||
int64_t remain_flat_size = output_flat_size;
|
||||
std::vector<int64_t> dims_to_count(indices_nd, 0);
|
||||
for (int64_t i = 0; i < indices_nd; ++i) {
|
||||
dims_to_count[i] = remain_flat_size / data_shape[i];
|
||||
remain_flat_size = dims_to_count[i];
|
||||
}
|
||||
|
||||
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(0)->GetData());
|
||||
auto Updates_data = reinterpret_cast<data_type_x *>(ctx.Input(1)->GetData());
|
||||
auto Output_data = reinterpret_cast<data_type_x *>(ctx.Output(0)->GetData());
|
||||
|
||||
memset(Output_data, 0, sizeof(data_type_x) * output_flat_size);
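// The output starts at zero and every indexed slice accumulates its update, so duplicate indices add together.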
|
||||
for (int64_t i = 0; i < n_slices; ++i) {
|
||||
int64_t to_pos = 0;
|
||||
for (int64_t j = 0; j < indices_nd; ++j) {
|
||||
int64_t idx = Indices_data[i * indices_nd + j];
|
||||
|
||||
if (idx < 0 || idx >= data_shape[j]) {
|
||||
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
to_pos += idx * dims_to_count[j];
|
||||
}
|
||||
for (int64_t j = 0; j < slice_size; j++) {
|
||||
Output_data[to_pos + j] += Updates_data[i * slice_size + j];
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kScatterNd, ScatterNdCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERND_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SCATTERND_H_
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ScatterNdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ScatterNdCpuKernel() = default;
|
||||
~ScatterNdCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename data_type0>
|
||||
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
|
||||
|
||||
template <typename indices_type, typename data_type0>
|
||||
uint32_t ScatterNdComputeRealKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,211 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "scatter_nd_update.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include "eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kScatterNdUpdate = "ScatterNdUpdate";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ScatterNdUpdateCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNdUpdate Input and Output failed.");
|
||||
|
||||
Tensor *input_var = ctx.Input(0);
|
||||
Tensor *input_indices = ctx.Input(1);
|
||||
Tensor *input_updates = ctx.Input(2);
|
||||
|
||||
auto shape_var = input_var->GetTensorShape();
|
||||
auto shape_indices = input_indices->GetTensorShape();
|
||||
auto shape_updates = input_updates->GetTensorShape();
|
||||
|
||||
if (shape_var->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_var's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_indices->GetDims() < 2) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 2.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_updates->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_updates's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto index_size = shape_indices->GetDims() - 1;
|
||||
auto index_depth = shape_indices->GetDimSize(index_size);
|
||||
|
||||
if (index_depth > shape_var->GetDims()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_var&input_indices ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
std::vector<int64_t> batch_shape;
|
||||
for (int64_t i = 0; i < index_size; ++i) {
|
||||
batch_shape.push_back(shape_indices->GetDimSize(i));
|
||||
}
|
||||
|
||||
for (int64_t i = index_depth; i <= shape_var->GetDims() - 1; ++i) {
|
||||
batch_shape.push_back(shape_var->GetDimSize(i));
|
||||
}
|
||||
|
||||
if (batch_shape != shape_updates->GetDimSizes()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor indices's & updates' and var's shape are dismatch .", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < index_size; i++) {
|
||||
if (shape_indices->GetDimSize(i) != shape_updates->GetDimSize(i)) {
|
||||
KERNEL_LOG_ERROR("[%s], Tensor indices and updates should have the same batch number.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
auto data_type_var = input_var->GetDataType();
|
||||
auto data_type_indices = input_indices->GetDataType();
|
||||
|
||||
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (data_type_var) {
|
||||
case DT_INT8:
|
||||
return DTYPE_CHOOSE<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DTYPE_CHOOSE<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DTYPE_CHOOSE<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DTYPE_CHOOSE<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DTYPE_CHOOSE<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DTYPE_CHOOSE<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DTYPE_CHOOSE<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DTYPE_CHOOSE<uint64_t>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DTYPE_CHOOSE<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DTYPE_CHOOSE<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DTYPE_CHOOSE<double>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DTYPE_CHOOSE<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DTYPE_CHOOSE<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_var).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename var_type>
|
||||
uint32_t ScatterNdUpdateCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
|
||||
auto indices_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
|
||||
switch (indices_type) {
|
||||
case DT_INT32:
|
||||
return ScatterNdUpdateComputeRealKernel<var_type, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return ScatterNdUpdateComputeRealKernel<var_type, int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(indices_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename var_type, typename indices_type>
|
||||
uint32_t ScatterNdUpdateCpuKernel::ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx) {
|
||||
int64_t n_slices = 1;
|
||||
int64_t slice_size = 1;
|
||||
|
||||
const int64_t indices_dims = ctx.Input(1)->GetTensorShape()->GetDims() - 1;
|
||||
const int64_t indices_nd = ctx.Input(1)->GetTensorShape()->GetDimSize(indices_dims);
|
||||
const int64_t updates_dims = ctx.Input(2)->GetTensorShape()->GetDims();
|
||||
|
||||
auto shape_var = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
auto shape_indices = ctx.Input(1)->GetTensorShape();
|
||||
auto dims_shape = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
|
||||
if (ctx.Input(2)->GetTensorShape()->GetDimSize(i + shape_indices->GetDims() - 1) != shape_var[i + indices_nd]) {
|
||||
KERNEL_LOG_ERROR("[%s] shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < indices_dims; ++i) {
|
||||
n_slices *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int i = indices_dims; i < updates_dims; ++i) {
|
||||
slice_size *= ctx.Input(2)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
|
||||
const int64_t var_flat_size = ctx.Input(0)->GetTensorShape()->NumElements();
|
||||
std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
int64_t remain_flat_size = var_flat_size;
|
||||
std::vector<int64_t> dims_to_count(indices_nd, 0);
|
||||
for (int64_t i = 0; i < indices_nd; ++i) {
|
||||
dims_to_count[i] = remain_flat_size / output_shape[i];
|
||||
remain_flat_size = dims_to_count[i];
|
||||
}
|
||||
|
||||
auto Var_data = reinterpret_cast<var_type *>(ctx.Input(0)->GetData());
|
||||
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(1)->GetData());
|
||||
auto Updates_data = reinterpret_cast<var_type *>(ctx.Input(2)->GetData());
|
||||
auto Output_data = reinterpret_cast<var_type *>(ctx.Output(0)->GetData());
|
||||
|
||||
for (int64_t i = 0; i < var_flat_size; ++i) {
|
||||
Output_data[i] = Var_data[i];
|
||||
}
|
||||
for (int64_t i = 0; i < n_slices; ++i) {
|
||||
int64_t to_pos = 0;
|
||||
for (int64_t j = 0; j < indices_nd; ++j) {
|
||||
int64_t idx = Indices_data[i * indices_nd + j];
|
||||
|
||||
if (idx < 0 || idx >= output_shape[j]) {
|
||||
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
to_pos += idx * dims_to_count[j];
|
||||
}
|
||||
for (int64_t j = 0; j < slice_size; j++) {
|
||||
Output_data[to_pos + j] = Updates_data[i * slice_size + j];
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kScatterNdUpdate, ScatterNdUpdateCpuKernel);
|
||||
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include <string.h>
|
||||
|
||||
namespace aicpu {
|
||||
class ScatterNdUpdateCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ScatterNdUpdateCpuKernel() = default;
|
||||
~ScatterNdUpdateCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename var_type>
|
||||
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
|
||||
|
||||
template <typename var_type, typename indices_type>
|
||||
uint32_t ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,151 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "select.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kSelect = "Select";
|
||||
|
||||
#define SELECT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SelectCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Select kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SelectCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Select check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(SelectParamCheck(ctx), "Select check params failed.");
|
||||
auto data_type = ctx.Input(1)->GetDataType();
|
||||
switch (data_type) {
|
||||
SELECT_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_BOOL, bool, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
SELECT_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Select kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SelectCpuKernel::SelectParamCheck(CpuKernelContext &ctx) {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
DataType input2_type = input_2->GetDataType();
|
||||
|
||||
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
if (input0_type != DT_BOOL) {
|
||||
KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(input0_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
KERNEL_CHECK_FALSE((input1_type == input2_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input1 [%s] need be same with "
|
||||
"input2 [%s].",
|
||||
DTypeStr(input1_type).c_str(), DTypeStr(input2_type).c_str())
|
||||
|
||||
if (input_shape_a != input_shape_b) {
|
||||
KERNEL_LOG_ERROR("The shape of X1 must equal X2.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SelectCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
|
||||
output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelectCpuKernel::SelectCompute(CpuKernelContext &ctx) {
|
||||
bool *condition = static_cast<bool *>(ctx.Input(0)->GetData());
|
||||
T *x1 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
T *x2 = static_cast<T *>(ctx.Input(2)->GetData());
|
||||
T *y = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_mask = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_shape;
|
||||
int64_t tensor_size = 1;
|
||||
int64_t position = 0;
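// Fast path: when the mask shares x1's shape, select element-wise; otherwise broadcast the mask against x1's shape.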
|
||||
if (input_shape_a == input_shape_mask) {
|
||||
for (const int64_t &d : input_shape_a) {
|
||||
tensor_size *= d;
|
||||
}
|
||||
for (int64_t i = 0; i < tensor_size; ++i) {
|
||||
if (condition[i]) {
|
||||
y[position++] = x1[i];
|
||||
} else {
|
||||
y[position++] = x2[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto ret = GetBroadcastShape(input_shape_a, input_shape_mask, output_shape);
|
||||
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Shape of x and mask can't be broadcast.");
|
||||
for (const int64_t &d : output_shape) {
|
||||
tensor_size *= d;
|
||||
}
|
||||
BroadcastIterator iter(input_shape_a, input_shape_mask, output_shape);
|
||||
iter.SetPos(0);
|
||||
for (int64_t i = 0; i < tensor_size; ++i) {
|
||||
if (condition[iter.GetInputPosB()]) {
y[position++] = x1[iter.GetInputPosA()];
} else {
y[position++] = x2[iter.GetInputPosA()];
|
||||
}
|
||||
iter.GenNextPos();
|
||||
}
|
||||
}
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes({position});
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSelect, SelectCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SELECT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SELECT_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SelectCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SelectCpuKernel() = default;
|
||||
~SelectCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t SelectParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelectCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "self_adjoint_eig.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_util.h"
|
||||
#include <complex>
|
||||
#include "utils/kernel_util.h"
|
||||
#include "Eigen/Core"
|
||||
#include <iostream>
|
||||
#include <Eigen/Dense>
|
||||
|
||||
using namespace std;
|
||||
namespace {
|
||||
const char *kSelfAdjointEig = "SelfAdjointEig";
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
uint32_t SelfAdjointEigCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input0 = ctx.Input(0);
|
||||
if ((input0->GetDataSize() == 0)) {
|
||||
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
uint32_t ret = KERNEL_STATUS_OK;
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
ret = SelfAdjointEigCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
ret = SelfAdjointEigCompute<double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
ret = SelfAdjointEigCompute<complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
ret = SelfAdjointEigCompute<complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(data_type).c_str());
|
||||
ret = KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelfAdjointEigCpuKernel::SelfAdjointEigCompute(CpuKernelContext &ctx) {
|
||||
auto input_tensor = ctx.Input(0);
|
||||
auto output_tensor0 = ctx.Output(0);
|
||||
auto output_tensor1 = ctx.Output(1);
|
||||
auto input_tensor_shape = input_tensor->GetTensorShape();
|
||||
auto inputData = reinterpret_cast<T *>(input_tensor->GetData());
|
||||
int64_t rank = input_tensor_shape->GetDims();
|
||||
std::vector<int64_t> input_dims = input_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input_dims[rank - 1];
|
||||
int64_t num_array = input_tensor_shape->NumElements() / (m * m);
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
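// rank <= 2 maps the single m x m matrix in place; larger ranks copy each of the num_array batches out, solve it, and write the results back.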
|
||||
|
||||
if (rank <= 2) {
|
||||
MatrixMap input0(inputData, m, m);
|
||||
MatrixMap output0(reinterpret_cast<T *>(output_tensor0->GetData()), m, 1);
|
||||
MatrixMap output1(reinterpret_cast<T *>(output_tensor1->GetData()), m, m);
|
||||
AttrValue *attr = ctx.GetAttr("compute_v");
|
||||
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
|
||||
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
|
||||
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
|
||||
output0 = es.eigenvalues().template cast<T>();
|
||||
if (attr_) {
|
||||
output1 = es.eigenvectors();
|
||||
}
|
||||
} else {
|
||||
auto outputData0 = reinterpret_cast<T *>(output_tensor0->GetData());
|
||||
auto outputData1 = reinterpret_cast<T *>(output_tensor1->GetData());
|
||||
for (int64_t batch = 0; batch < num_array; ++batch) {
|
||||
AttrValue *attr = ctx.GetAttr("compute_v");
|
||||
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
|
||||
T *inputDataMap = reinterpret_cast<T *>(new T[m * m]);
|
||||
T *outputDataMap0 = reinterpret_cast<T *>(new T[m]);
|
||||
T *outputDataMap1 = reinterpret_cast<T *>(new T[m * m]);
|
||||
for (int64_t i = 0; i < m * m; ++i) {
|
||||
inputDataMap[i] = inputData[batch * m * m + i];
|
||||
outputDataMap1[i] = outputData1[batch * m * m + i];
|
||||
}
|
||||
for (int64_t i = 0; i < m; ++i) {
|
||||
outputDataMap0[i] = outputData0[batch * m + i];
|
||||
}
|
||||
MatrixMap input0(inputDataMap, m, m);
|
||||
MatrixMap output0(outputDataMap0, m, 1);
|
||||
MatrixMap output1(outputDataMap1, m, m);
|
||||
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
|
||||
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
|
||||
output0 = es.eigenvalues().template cast<T>();
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
*(outputData0 + batch * m + i) = output0(i, 0);
|
||||
}
|
||||
if (attr_) {
|
||||
output1 = es.eigenvectors();
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
for (int64_t j = 0; j < m; j++) {
|
||||
*(outputData1 + batch * m * m + i * m + j) = output1(i, j);
|
||||
}
|
||||
}
|
||||
}
// Free the per-batch scratch buffers allocated with new[] above.
delete[] inputDataMap;
delete[] outputDataMap0;
delete[] outputDataMap1;
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSelfAdjointEig, SelfAdjointEigCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "Eigen/Eigenvalues"
|
||||
#include <iostream>
|
||||
namespace aicpu {
|
||||
|
||||
class SelfAdjointEigCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SelfAdjointEigCpuKernel() = default;
|
||||
~SelfAdjointEigCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SelfAdjointEigCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif  // AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
@@ -0,0 +1,152 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sign.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kSign = "Sign";
|
||||
constexpr int64_t kParallelDataNums = 128 * 1024;
|
||||
|
||||
#define SIGN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SignCompute<TYPE>(CTX); \
|
||||
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
|
||||
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define SIGN_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SignComputeComplex<TYPE>(CTX); \
|
||||
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
|
||||
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SignCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSign);
|
||||
KERNEL_HANDLE_ERROR(static_cast<uint32_t>(SignCheck(ctx)), "[%s] check params failed.", kSign);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SIGN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SIGN_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
SIGN_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Sign kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
|
||||
KernelStatus SignCpuKernel::SignCheck(const CpuKernelContext &ctx) const {
|
||||
auto input_0 = ctx.Input(0);
|
||||
auto output_0 = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCpuKernel::SignCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
if (*(input_x + i) > static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(1);
|
||||
} else if (*(input_x + i) == static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(-1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_sign = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (*(input_x + i) > static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(1);
|
||||
} else if (*(input_x + i) == static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(-1);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
|
||||
"Sign Compute failed.");
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCpuKernel::SignComputeComplex(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
if (*(input_x + i) != static_cast<T>(0)) {
|
||||
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_num = 1;
|
||||
int64_t max_core_num = std::max(min_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_sign = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (*(input_x + i) != static_cast<T>(0)) {
|
||||
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
|
||||
"Sign Compute failed.");
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSign, SignCpuKernel);
|
||||
} // namespace aicpu
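For the complex branch above, the sign is the input divided by its modulus (and 0 for a zero input); a minimal illustrative check, not part of the kernel:

#include <complex>
#include <cstdio>

int main() {
  std::complex<float> z(3.0f, 4.0f);
  std::complex<float> s = z / std::abs(z);     // same formula as SignComputeComplex for a non-zero input
  std::printf("%f %f\n", s.real(), s.imag());  // 0.600000 0.800000, a point on the unit circle
  return 0;
}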
@@ -0,0 +1,40 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SIGN_H
|
||||
#define AICPU_KERNELS_NORMALIZED_SIGN_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel/common/status.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SignCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SignCpuKernel() = default;
|
||||
~SignCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
KernelStatus SignCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignComputeComplex(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,148 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sin.h"
|
||||
|
||||
#include <complex>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kSinInputNum{1};
|
||||
const std::uint32_t kSinOutputNum{1};
|
||||
const char *kSin{"Sin"};
|
||||
} // namespace
|
||||
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
inline T ScalarSin(T x) {
|
||||
return std::sin(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Eigen::half ScalarSin(Eigen::half x) {
|
||||
const Eigen::half val{static_cast<Eigen::half>(Eigen::numext::sin(x))};
|
||||
return val;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinKernel(const CpuKernelContext &ctx) {
|
||||
using i64 = std::int64_t;
|
||||
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
|
||||
const auto ScalarSin = internal::ScalarSin<T>;
|
||||
auto input = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
i64 total = ctx.Input(0)->NumElements();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
i64 num = 1024;
|
||||
if (total > num) {
|
||||
i64 per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
|
||||
return ParallelFor(ctx, total, per_unit_size, [&](i64 begin, i64 end) {
|
||||
std::transform(input + begin, input + end, output + begin, ScalarSin);
|
||||
});
|
||||
} else if (cores != 0) {
|
||||
std::transform(input, input + total, output, ScalarSin);
|
||||
} else {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSin(const CpuKernelContext &ctx) {
|
||||
uint32_t result = ComputeSinKernel<T>(ctx);
|
||||
if (result != 0) {
|
||||
KERNEL_LOG_ERROR("Sin compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t SinExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::uint32_t SinCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
|
||||
return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinExtraCheck(ctx);
|
||||
}
|
||||
|
||||
std::uint32_t SinCompute(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeSin<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeSin<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeSin<std::double_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeSin<std::complex<std::float_t>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeSin<std::complex<std::double_t>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t SinCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::SinCheck(ctx, kSinInputNum, kSinOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinCompute(ctx);
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSin, SinCpuKernel);
|
||||
} // namespace aicpu
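The sharding above reserves two cores and splits the remaining work evenly; a minimal sketch of that arithmetic with hypothetical counts (total = 10000 elements, cores = 8), illustrative only:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  std::int64_t total = 10000;  // hypothetical element count
  std::int64_t cores = 8;      // hypothetical CPU count
  // Mirrors total / std::min(std::max(1L, cores - 2L), total) from ComputeSinKernel.
  std::int64_t per_unit_size = total / std::min(std::max(std::int64_t{1}, cores - 2), total);
  std::cout << per_unit_size << std::endl;  // 1666 elements per parallel unit
  return 0;
}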
@@ -0,0 +1,26 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SIN_H_
#define AICPU_KERNELS_NORMALIZED_SIN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SinCpuKernel final : public CpuKernel {
|
||||
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,267 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sinc.h"
|
||||
|
||||
#include <complex>
|
||||
#include <set>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr double kPI = 3.14159265358979323846L;
|
||||
constexpr uint32_t kSincInputNum = 1;
|
||||
constexpr uint32_t kSincOutputNum = 1;
|
||||
const int64_t paralled_data_size = 64 * 1024;
|
||||
const char *kSinc = "Sinc";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincTypeSameCompute(CpuKernelContext &ctx) {
|
||||
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = T(1.0f);
|
||||
} else {
|
||||
T product = T(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = T(1.0f);
|
||||
} else {
|
||||
T product = T(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincTypeChangeCompute(CpuKernelContext &ctx) {
|
||||
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = float(1.0f);
|
||||
} else {
|
||||
float product = static_cast<float>(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = float(1.0f);
|
||||
} else {
|
||||
float product = static_cast<float>(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincBoolCompute(CpuKernelContext &ctx) {
|
||||
bool *x_addr = static_cast<bool *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
  if (x_addr[i]) {
    // sinc(1) = sin(pi) / pi
    float product = static_cast<float>(kPI);
    y_addr[i] = sin(product) / product;
  } else {
    // sinc(0) is defined as 1; computing sin(0) / 0 here would yield NaN.
    y_addr[i] = 1.0f;
  }
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
  if (x_addr[i]) {
    float product = static_cast<float>(kPI);
    y_addr[i] = sin(product) / product;
  } else {
    y_addr[i] = 1.0f;  // sinc(0) = 1; avoids the 0/0 division
  }
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
inline std::uint32_t SincExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType in_dtype = ctx.Input(0)->GetDataType();
|
||||
DataType out_dtype = ctx.Output(0)->GetDataType();
|
||||
std::set<DataType> dtypes;
|
||||
dtypes.insert(DT_FLOAT16);
|
||||
dtypes.insert(DT_FLOAT);
|
||||
dtypes.insert(DT_DOUBLE);
|
||||
dtypes.insert(DT_COMPLEX64);
|
||||
dtypes.insert(DT_COMPLEX128);
|
||||
if (dtypes.count(in_dtype) == 1) {
|
||||
if (out_dtype != in_dtype) {
|
||||
KERNEL_LOG_ERROR("The data type of the output need be the same as the input when input is [%s], but got [%s].",
|
||||
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
if (out_dtype != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The data type of the output must be float32 when the dtype of input is [%s], but got [%s].",
|
||||
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SincCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSincInputNum, kSincOutputNum), "[%s] check params failed.", kSinc);
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
res = SincExtraCheck(ctx);
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return res;
|
||||
}
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
res = SincTypeSameCompute<Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
res = SincTypeSameCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = SincTypeSameCompute<double>(ctx);
|
||||
break;
|
||||
case DT_INT8:
|
||||
res = SincTypeChangeCompute<int8_t>(ctx);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
res = SincTypeChangeCompute<uint8_t>(ctx);
|
||||
break;
|
||||
case DT_INT16:
|
||||
res = SincTypeChangeCompute<int16_t>(ctx);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
res = SincTypeChangeCompute<uint16_t>(ctx);
|
||||
break;
|
||||
case DT_INT32:
|
||||
res = SincTypeChangeCompute<int32_t>(ctx);
|
||||
break;
|
||||
case DT_UINT32:
|
||||
res = SincTypeChangeCompute<uint32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
res = SincTypeChangeCompute<int64_t>(ctx);
|
||||
break;
|
||||
case DT_UINT64:
|
||||
res = SincTypeChangeCompute<uint64_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = SincTypeSameCompute<std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = SincTypeSameCompute<std::complex<double>>(ctx);
|
||||
break;
|
||||
case DT_BOOL:
|
||||
res = SincBoolCompute<bool>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Sinc invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSinc, SincCpuKernel);
|
||||
} // namespace aicpu
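The kernel computes the normalized sinc, sin(pi * x) / (pi * x) with sinc(0) defined as 1; a scalar reference sketch, illustrative only and not part of the kernel:

#include <cmath>
#include <cstdio>

const double kPi = 3.14159265358979323846;

double Sinc(double x) {
  if (x == 0.0) {
    return 1.0;  // removable singularity at x = 0
  }
  double product = kPi * x;
  return std::sin(product) / product;
}

int main() {
  std::printf("%f %f\n", Sinc(0.0), Sinc(0.5));  // 1.000000 0.636620
  return 0;
}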
@@ -0,0 +1,41 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SINC_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SINC_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
namespace aicpu {
|
||||
class SincCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SincCpuKernel() = default;
|
||||
|
||||
~SincCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SincTypeSameCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincTypeChangeCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincBoolCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,148 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sinh.h"
|
||||
|
||||
#include <complex>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kSinhInputNum{1};
|
||||
const std::uint32_t kSinhOutputNum{1};
|
||||
const std::uint32_t ParallelNum{4096};
|
||||
const char *kSinh{"Sinh"};
|
||||
} // namespace
|
||||
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
inline T ScalarSinh(T x) {
|
||||
return Eigen::numext::sinh(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Eigen::half ScalarSinh(Eigen::half x) {
|
||||
const Eigen::half val{Eigen::numext::sinh(static_cast<float>(x))};
|
||||
return Eigen::half_impl::isnan(val) ? Eigen::half{0.0f} : val;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinhKernel(const CpuKernelContext &ctx) {
|
||||
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
|
||||
const auto ScalarSinh = internal::ScalarSinh<T>;
|
||||
auto input = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::int64_t total = ctx.Input(0)->NumElements();
|
||||
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
if (total_size > ParallelNum * sizeof(T)) {
|
||||
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
|
||||
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
|
||||
std::transform(input + begin, input + end, output + begin, ScalarSinh);
|
||||
});
|
||||
} else if (cores != 0) {
|
||||
std::transform(input, input + total, output, ScalarSinh);
|
||||
} else {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinh(const CpuKernelContext &ctx) {
|
||||
uint32_t result = ComputeSinhKernel<T>(ctx);
|
||||
if (result != 0) {
|
||||
KERNEL_LOG_ERROR("Sinh compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t SinhExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::uint32_t SinhCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
|
||||
return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinhExtraCheck(ctx);
|
||||
}
|
||||
|
||||
std::uint32_t SinhCompute(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeSinh<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeSinh<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeSinh<std::double_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeSinh<std::complex<std::float_t> >(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeSinh<std::complex<std::double_t> >(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t SinhCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::SinhCheck(ctx, kSinhInputNum, kSinhOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinhCompute(ctx);
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSinh, SinhCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,26 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SINH_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SINH_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SinhCpuKernel final : public CpuKernel {
|
||||
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,387 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "smooth_l1_loss_grad_v2.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kSmoothL1LossGradV2 = "SmoothL1LossGradV2";
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
float sigma = 1.0;
|
||||
std::string reduction = "mean";
|
||||
std::mutex mtx;
|
||||
|
||||
#define SmoothL1LossGradV2_COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
KERNEL_LOG_INFO("Compute [%s]", DTypeStr(data_type).c_str()); \
|
||||
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
|
||||
if ((REDUCTION) == "mean") { \
|
||||
result = ComputeMean<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "sum") { \
|
||||
result = ComputeSum<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "none") { \
|
||||
result = ComputeNone<TYPE>(CTX); \
|
||||
} \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SmoothL1LossGradV2 check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "SmoothL1LossGradV2 check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
|
||||
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
|
||||
    SmoothL1LossGradV2_COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
    default:
      KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
DataType predict_type = predict_tensor->GetDataType();
|
||||
DataType label_type = label_tensor->GetDataType();
|
||||
DataType dout_type = dout_tensor->GetDataType();
|
||||
DataType gradient_type = gradient_tensor->GetDataType();
|
||||
KERNEL_CHECK_FALSE((predict_type == label_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"label [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(label_type).c_str());
|
||||
KERNEL_CHECK_FALSE((predict_type == dout_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"dout [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(dout_type).c_str());
|
||||
KERNEL_CHECK_FALSE((predict_type == gradient_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"gradient [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(gradient_type).c_str());
|
||||
auto predict_shape = predict_tensor->GetTensorShape();
|
||||
auto label_shape = label_tensor->GetTensorShape();
|
||||
auto gradient_shape = gradient_tensor->GetTensorShape();
|
||||
int32_t predict_dims = predict_shape->GetDims();
|
||||
int32_t label_dims = label_shape->GetDims();
|
||||
int32_t gradient_dims = gradient_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((predict_dims == label_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"label [%d].",
|
||||
predict_dims, label_dims);
|
||||
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d].",
|
||||
predict_dims, gradient_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == label_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"label [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), label_shape->GetDimSize(i), i);
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SmoothL1LossGradV2CpuKernel[%s], predict: size[%llu];"
|
||||
"label: size[%llu], dout: size[%llu], gradient: size[%llu].",
|
||||
ctx.GetOpType().c_str(), predict_tensor->GetDataSize(), label_tensor->GetDataSize(), dout_tensor->GetDataSize(),
|
||||
gradient_tensor->GetDataSize());
|
||||
return AttributesCheck(ctx);
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::AttributesCheck(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
auto predict_shape = predict_tensor->GetTensorShape();
|
||||
auto dout_shape = dout_tensor->GetTensorShape();
|
||||
auto gradient_shape = gradient_tensor->GetTensorShape();
|
||||
int32_t predict_dims = predict_shape->GetDims();
|
||||
int32_t dout_dims = dout_shape->GetDims();
|
||||
int32_t gradient_dims = gradient_shape->GetDims();
|
||||
auto sigma_attr = ctx.GetAttr("sigma");
|
||||
auto reduction_attr = ctx.GetAttr("reduction");
|
||||
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
|
||||
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
|
||||
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"the sigma value must greater than or equal to 0 "
|
||||
"when value of input sigma is [%f].",
|
||||
sigma);
|
||||
KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the reduction value must be a value in a range of ['none','mean','sum'].", reduction);
|
||||
if (reduction == "none" || reduction == "mean" || reduction == "sum") {
|
||||
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d].",
|
||||
predict_dims, gradient_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] must be same with "
|
||||
"gradient [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
|
||||
}
|
||||
}
|
||||
if (reduction == "none") {
|
||||
KERNEL_CHECK_FALSE((predict_dims == dout_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"dout [%d].",
|
||||
predict_dims, dout_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == dout_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"dout [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), dout_shape->GetDimSize(i), i);
|
||||
}
|
||||
} else if (reduction == "sum" || reduction == "mean") {
|
||||
KERNEL_CHECK_FALSE((dout_dims == 0) || ((dout_dims == 1) && (dout_tensor->NumElements() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "the dout shape dim of dout [%d] need be a scalar.", dout_dims);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// 1 * dout if x >= sigma
|
||||
// -1 * dout if x <= -sigma
|
||||
// x / sigma * dout if |x| < sigma
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) * dout;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) * dout;
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum end");
|
||||
}
|
||||
|
||||
// Mean's result is Sum's result divided by the total number of elements per
|
||||
// element
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
if (data_num == 0) {
|
||||
KERNEL_LOG_ERROR("data_num cannot be 0.");
|
||||
}
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) / data_num * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) / data_num * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) / data_num * dout;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) / data_num * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) / data_num * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) / data_num * dout;
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean end");
|
||||
}
|
||||
|
||||
// "None" takes grad_output as a parameter,
|
||||
// and the end result is that result of "Sum" is multiplied by the grad_output
|
||||
// one by one, that is, the weight is increased
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T x = predict - label;
|
||||
T dout = *(dout_val + i);
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = dout * x / T(sigma);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T x = predict - label;
|
||||
T dout = *(dout_val + i);
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = dout * x / T(sigma);
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone end");
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSmoothL1LossGradV2, SmoothL1LossGradV2CpuKernel);
|
||||
} // namespace aicpu
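A scalar sketch of the piecewise gradient described in the comments above, for reduction "none" and with illustrative values (mean divides the same expression by the element count, sum broadcasts a scalar dout); not part of the kernel:

#include <cstdio>

// x = predict - label; dout is the incoming gradient.
double SmoothL1Grad(double x, double sigma, double dout) {
  if (x <= -sigma) {
    return -1.0 * dout;
  }
  if (x >= sigma) {
    return 1.0 * dout;
  }
  return x / sigma * dout;  // |x| < sigma
}

int main() {
  std::printf("%f %f %f\n", SmoothL1Grad(-2.0, 1.0, 1.0), SmoothL1Grad(0.25, 1.0, 1.0),
              SmoothL1Grad(3.0, 1.0, 1.0));  // -1.000000 0.250000 1.000000
  return 0;
}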
@@ -0,0 +1,44 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
|
||||
|
||||
#include <string>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SmoothL1LossGradV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SmoothL1LossGradV2CpuKernel() = default;
|
||||
~SmoothL1LossGradV2CpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ParamCheck(CpuKernelContext &ctx);
|
||||
uint32_t AttributesCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeMean(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSum(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeNone(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
@@ -0,0 +1,278 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "smooth_l1_loss_v2.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *SmoothL1LossV2 = "SmoothL1LossV2";
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
const float opHalf = 0.5;
|
||||
float sigma = 1.0;
|
||||
std::string reduction = "mean";
|
||||
std::mutex mtx;
|
||||
|
||||
#define COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
KERNEL_LOG_DEBUG("Compute [%s]", DTypeStr(data_type).c_str()); \
|
||||
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
|
||||
if ((REDUCTION) == "mean") { \
|
||||
result = ComputeMean<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "sum") { \
|
||||
result = ComputeSum<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "none") { \
|
||||
result = ComputeNone<TYPE>(CTX); \
|
||||
} \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SmoothL1LossV2 compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SmoothL1LossV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check SmoothL1LossV2 params failed.");
|
||||
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "Check SmoothL1LossV2 params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SmoothL1LossV2 data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
DataType output0_type = output_0->GetDataType();
|
||||
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str());
|
||||
KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"output0 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(output0_type).c_str());
|
||||
auto input0_shape = input_0->GetTensorShape();
|
||||
auto input1_shape = input_1->GetTensorShape();
|
||||
int32_t input0_dims = input0_shape->GetDims();
|
||||
int32_t input1_dims = input1_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((input0_dims == input1_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of input0 [%d] need be same with "
|
||||
"input1 [%d].",
|
||||
input0_dims, input1_dims);
|
||||
for (int32_t i = 0; i < input0_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == input1_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of input0 [%d] need be same with "
|
||||
"input1 [%d] where dim in [%d].",
|
||||
input0_shape->GetDimSize(i), input1_shape->GetDimSize(i), i);
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SmoothL1LossV2CpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output_0->GetDataSize());
|
||||
|
||||
return AttributeCheck(ctx);
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossV2CpuKernel::AttributeCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
auto input0_shape = input_0->GetTensorShape();
|
||||
auto output0_shape = output_0->GetTensorShape();
|
||||
int32_t input0_dims = input0_shape->GetDims();
|
||||
int32_t output0_dims = output0_shape->GetDims();
|
||||
|
||||
auto sigma_attr = ctx.GetAttr("sigma");
|
||||
auto reduction_attr = ctx.GetAttr("reduction");
|
||||
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
|
||||
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
|
||||
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"the sigma value need to greater than or equal to 0 "
|
||||
"when input sigma value is [%f].",
|
||||
sigma);
|
||||
KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the reduction value need be the member of ['none','mean','sum'] "
|
||||
"when input reduction value is [%s].",
|
||||
reduction.c_str());
|
||||
if (reduction == "none") {
|
||||
KERNEL_CHECK_FALSE((input0_dims == output0_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of input0 [%d] need be same with "
|
||||
"output0 [%d].",
|
||||
input0_dims, output0_dims);
|
||||
for (int32_t i = 0; i < input0_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == output0_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of input0 [%d] need be same with "
|
||||
"output0 [%d] where dim in [%d].",
|
||||
input0_shape->GetDimSize(i), output0_shape->GetDimSize(i), i);
|
||||
}
|
||||
} else if (reduction == "sum" || reduction == "mean") {
|
||||
KERNEL_CHECK_FALSE((output0_dims == 0) || ((output0_dims == 1) && (output_0->NumElements() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "the output shape dim of output0 [%d] need be [1] or a scalar.",
|
||||
output0_dims);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
|
||||
uint32_t compute_sum_res = ComputeSum<T>(ctx);
|
||||
if (compute_sum_res != KERNEL_STATUS_OK) {
|
||||
return compute_sum_res;
|
||||
}
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
T *res = loss_val;
|
||||
if (data_num == 0) {
|
||||
*(res) = T(0);
return KERNEL_STATUS_OK;
|
||||
}
|
||||
*(res) = *(res) / static_cast<T>(data_num);
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
|
||||
double res = 0;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
res += static_cast<double>(z);
|
||||
} else {
|
||||
res += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
|
||||
}
|
||||
}
|
||||
*(loss_val) = static_cast<T>(res);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
auto shared_smoothl1lossv2 = [&](size_t start, size_t end) -> double {
|
||||
double sum = 0;
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
sum += static_cast<double>(z);
|
||||
} else {
|
||||
sum += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
|
||||
}
|
||||
}
|
||||
mtx.lock();
|
||||
res = res + sum;
|
||||
mtx.unlock();
|
||||
|
||||
};
|
||||
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
auto result = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
|
||||
*(loss_val) = static_cast<T>(res);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
|
||||
T *res = loss_val;
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
*(res + i) = z;
|
||||
} else {
|
||||
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
auto shared_smoothl1lossv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
*(res + i) = z;
|
||||
} else {
|
||||
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SmoothL1LossV2, SmoothL1LossV2CpuKernel);
|
||||
} // namespace aicpu
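For cross-checking outside the AICPU framework, the sketch below restates the elementwise rule implemented by ComputeSum/ComputeNone above: with z = |predict - label|, the loss is 0.5 * z * z / sigma when z < sigma and z - 0.5 * sigma otherwise (plain z when sigma is 0), followed by the requested reduction. It is a minimal standalone reference for float inputs; the function name and the use of std::vector are illustrative and not part of this change.

// Minimal reference implementation of SmoothL1LossV2 (illustrative, not kernel code).
#include <cmath>
#include <cstddef>
#include <string>
#include <vector>

std::vector<float> SmoothL1LossV2Reference(const std::vector<float> &predict, const std::vector<float> &label,
                                           float sigma, const std::string &reduction) {
  std::vector<float> loss(predict.size());
  for (std::size_t i = 0; i < predict.size(); ++i) {
    const float z = std::fabs(predict[i] - label[i]);
    // Same branch structure as the kernel: quadratic inside the sigma band, linear outside.
    loss[i] = (sigma == 0.0f || z >= sigma) ? z - 0.5f * sigma : 0.5f * z * z / sigma;
  }
  if (reduction == "none") {
    return loss;
  }
  double sum = 0.0;
  for (const float v : loss) {
    sum += v;
  }
  if (reduction == "mean" && !loss.empty()) {
    sum /= static_cast<double>(loss.size());
  }
  return {static_cast<float>(sum)};  // "sum" and "mean" reduce to a single element
}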
|
|
@@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SmoothL1LossV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SmoothL1LossV2CpuKernel() = default;
|
||||
~SmoothL1LossV2CpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ParamCheck(CpuKernelContext &ctx);
|
||||
uint32_t AttributeCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeMean(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSum(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeNone(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
|
@@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
|
||||
#define _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "cpu_kernel/common/status.h"
|
||||
|
||||
/**
|
||||
* A class that represents an inline array.
|
||||
* Arguments:
|
||||
* T: the array element type;
|
||||
* ElementCount: the fixed size of the array;
|
||||
*/
|
||||
template <typename T, int ElementCount>
|
||||
class Array {
|
||||
public:
|
||||
static constexpr int kElementCount = ElementCount;
|
||||
Array() {
|
||||
for (int i = 0; i < ElementCount; ++i) {
|
||||
data_[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
const T &operator[](int index) const { return data_[index]; }
|
||||
|
||||
T &operator[](int index) { return data_[index]; }
|
||||
|
||||
size_t size() const { return ElementCount; }
|
||||
|
||||
private:
|
||||
T data_[ElementCount];
|
||||
};
|
||||
|
||||
class PhiloxRandom {
|
||||
public:
|
||||
using ResultType = Array<uint32_t, 4>;
|
||||
using ResultElementType = uint32_t;
|
||||
// The number of elements that will be returned.
|
||||
static constexpr int kResultElementCount = 4;
|
||||
// Cost of generation of a single element (in cycles).
|
||||
static constexpr int kElementCost = 10;
|
||||
/*
|
||||
* The type for the 64-bit key stored in the form of two 32-bit uint
|
||||
* that are used in the diffusion process.
|
||||
*/
|
||||
using Key = Array<uint32_t, 2>;
|
||||
|
||||
PhiloxRandom() {}
|
||||
|
||||
PhiloxRandom(int64_t seed, uint64_t offset) {
|
||||
const uint32_t seed_low_index = 0;
|
||||
const uint32_t seed_high_index = 1;
|
||||
const uint32_t offset_low_index = 2;
|
||||
const uint32_t offset_high_index = 3;
|
||||
key_[seed_low_index] = static_cast<uint32_t>(seed);
|
||||
key_[seed_high_index] = static_cast<uint32_t>(seed >> 32);
|
||||
counter_[offset_low_index] = static_cast<uint32_t>(offset);
|
||||
counter_[offset_high_index] = static_cast<uint32_t>(offset >> 32);
|
||||
}
|
||||
|
||||
ResultType const &counter() const { return counter_; }
|
||||
|
||||
Key const &key() const { return key_; }
|
||||
|
||||
// Skip the specified number of 128-bit samples in the current stream.
|
||||
void Skip(uint64_t count) {
|
||||
const uint32_t count_lo = static_cast<uint32_t>(count);
|
||||
uint32_t count_hi = static_cast<uint32_t>(count >> 32);
|
||||
|
||||
counter_[0] += count_lo;
|
||||
if (counter_[0] < count_lo) {
|
||||
++count_hi;
|
||||
}
|
||||
|
||||
counter_[1] += count_hi;
|
||||
if (counter_[1] < count_hi) {
|
||||
if (++counter_[2] == 0) {
|
||||
++counter_[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Returns a group of four random numbers using the underlying Philox
|
||||
* algorithm.
|
||||
*/
|
||||
ResultType operator()() {
|
||||
ResultType counter = counter_;
|
||||
Key key = key_;
|
||||
/*
|
||||
* Run the single round ten times. The loop is manually unrolled
|
||||
* for better performance.
|
||||
*/
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
SkipOne();
|
||||
return counter;
|
||||
}
|
||||
|
||||
private:
|
||||
// We use the same constants as recommended by the original paper.
|
||||
static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
|
||||
static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
|
||||
static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
|
||||
static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
|
||||
|
||||
// Helper function to skip the next 128-bit sample in the current stream.
|
||||
void SkipOne() {
|
||||
if (++counter_[0] == 0) {
|
||||
if (++counter_[1] == 0) {
|
||||
if (++counter_[2] == 0) {
|
||||
++counter_[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Helper function to return the lower and higher 32-bits from two 32-bit
|
||||
* integer multiplications.
|
||||
*/
|
||||
static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high) {
|
||||
const uint64_t product = static_cast<uint64_t>(a) * b;
|
||||
*result_low = static_cast<uint32_t>(product);
|
||||
*result_high = static_cast<uint32_t>(product >> 32);
|
||||
}
|
||||
|
||||
// Helper function for a single round of the underlying Philox algorithm.
|
||||
static ResultType ComputeSingleRound(const ResultType &counter, const Key &key) {
|
||||
uint32_t lo0;
|
||||
uint32_t hi0;
|
||||
MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
|
||||
|
||||
uint32_t lo1;
|
||||
uint32_t hi1;
|
||||
MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
|
||||
|
||||
ResultType result;
|
||||
result[0] = hi1 ^ counter[1] ^ key[0];
|
||||
result[1] = lo1;
|
||||
result[2] = hi0 ^ counter[3] ^ key[1];
|
||||
result[3] = lo0;
|
||||
return result;
|
||||
}
|
||||
|
||||
void RaiseKey(Key *key) {
|
||||
(*key)[0] += kPhiloxW32A;
|
||||
(*key)[1] += kPhiloxW32B;
|
||||
}
|
||||
|
||||
private:
|
||||
ResultType counter_;
|
||||
Key key_;
|
||||
};
|
||||
#endif  // _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
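A brief usage sketch of the PhiloxRandom class above; the demo function name and the seed/offset values are arbitrary. Each call to operator() returns four 32-bit samples and advances the counter, while Skip() jumps ahead in the stream without generating output.

#include <cstdint>
#include <cstdio>

// Assumes the Array/PhiloxRandom declarations above are visible in this translation unit.
void PhiloxRandomDemo() {
  PhiloxRandom gen(/*seed=*/2023, /*offset=*/0);
  gen.Skip(10);  // jump over the first ten 128-bit blocks of the stream
  PhiloxRandom::ResultType block = gen();  // four uint32_t samples
  for (int i = 0; i < PhiloxRandom::kResultElementCount; ++i) {
    std::printf("sample[%d] = %u\n", i, block[i]);
  }
}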
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sampling_kernels.h"
|
||||
#include <algorithm>
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace aicpu {
|
||||
SamplingKernelType SamplingKernelTypeFromString(std::string str) {
|
||||
if (str == "lanczos1") return Lanczos1Kernel;
|
||||
if (str == "lanczos3") return Lanczos3Kernel;
|
||||
if (str == "lanczos5") return Lanczos5Kernel;
|
||||
if (str == "gaussian") return GaussianKernel;
|
||||
if (str == "box") return BoxKernel;
|
||||
if (str == "triangle") return TriangleKernel;
|
||||
if (str == "keyscubic") return KeysCubicKernel;
|
||||
if (str == "mitchellcubic") return MitchellCubicKernel;
|
||||
return SamplingKernelTypeEnd;
|
||||
}
|
||||
} // namespace aicpu
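A hedged example of how the SamplingKernelTypeEnd sentinel returned above is typically consumed; the helper name is illustrative and assumes the declarations from sampling_kernels.h.

#include <string>

// Returns true only when the attribute string names a known sampling kernel.
bool IsSupportedSamplingKernel(const std::string &name) {
  return aicpu::SamplingKernelTypeFromString(name) != aicpu::SamplingKernelTypeEnd;
}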
|
|
@@ -0,0 +1,199 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_UTILS_SAMPLING_KERNELS_H_
|
||||
#define AICPU_UTILS_SAMPLING_KERNELS_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <stdio.h>
|
||||
#include "cpu_context.h"
|
||||
|
||||
namespace aicpu {
|
||||
// Defines functions for different types of sampling kernels.
|
||||
enum SamplingKernelType {
|
||||
// Lanczos kernel with radius 1. Aliases but does not ring.
|
||||
Lanczos1Kernel,
|
||||
|
||||
/**
|
||||
* Lanczos kernel with radius 3. High-quality practical filter but may have
|
||||
* some ringing especially on synthetic images.
|
||||
*/
|
||||
Lanczos3Kernel,
|
||||
|
||||
/**
|
||||
* Lanczos kernel with radius 5. Very-high-quality filter but may have
|
||||
* stronger ringing.
|
||||
*/
|
||||
Lanczos5Kernel,
|
||||
|
||||
// Gaussian kernel with radius 3, sigma = 1.5 / 3. Less commonly used.
|
||||
GaussianKernel,
|
||||
|
||||
/**
|
||||
* Rectangle function. Equivalent to "nearest" sampling when upscaling.
|
||||
* Has value 1 in interval (-0.5, 0.5), value 0.5 on edge, and 0 elsewhere.
|
||||
*/
|
||||
BoxKernel,
|
||||
|
||||
/**
|
||||
* Hat/tent function with radius 1. Equivalent to "bilinear" reconstruction
|
||||
* when upsampling.
|
||||
* Has value zero at -1.0 and 1.0.
|
||||
*/
|
||||
TriangleKernel,
|
||||
|
||||
/**
|
||||
* Cubic interpolant of Keys. Equivalent to Catmull-Rom kernel. Reasonably
|
||||
* good quality and faster than Lanczos3Kernel.
|
||||
*/
|
||||
KeysCubicKernel,
|
||||
|
||||
/**
|
||||
* Cubic non-interpolating scheme. For synthetic images (especially those
|
||||
* lacking proper prefiltering), less ringing than Keys cubic kernel but less
|
||||
* sharp.
|
||||
*/
|
||||
MitchellCubicKernel,
|
||||
|
||||
// Always insert new kernel types before this.
|
||||
SamplingKernelTypeEnd
|
||||
};
|
||||
|
||||
/**
|
||||
* Converts a string into the corresponding kernel type.
|
||||
* Returns SamplingKernelTypeEnd if the string couldn't be converted.
|
||||
*/
|
||||
SamplingKernelType SamplingKernelTypeFromString(std::string str);
|
||||
|
||||
// A function object for a Lanczos kernel.
|
||||
struct LanczosKernelFunc {
|
||||
// Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc.
|
||||
explicit LanczosKernelFunc(float _radius) : radius(_radius) {}
|
||||
float operator()(float x) const {
|
||||
constexpr float kPI = 3.14159265359;
|
||||
x = std::abs(x);
|
||||
if (x > radius) {
|
||||
return 0.0;
|
||||
}
|
||||
// Need to special case the limit case of sin(x) / x when x is zero.
|
||||
if (x <= 1e-3) {
|
||||
return 1.0;
|
||||
}
|
||||
return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) / (kPI * kPI * x * x);
|
||||
}
|
||||
float Radius() const { return radius; }
|
||||
const float radius;
|
||||
};
|
||||
|
||||
struct GaussianKernelFunc {
|
||||
static constexpr float kRadiusMultiplier = 3.0f;
|
||||
/**
|
||||
* https://en.wikipedia.org/wiki/Gaussian_function
|
||||
* We use sigma = 0.5, as suggested on p. 4 of Ken Turkowski's "Filters
|
||||
* for Common Resampling Tasks" for kernels with a support of 3 pixels:
|
||||
* www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf
|
||||
* This implies a radius of 1.5,
|
||||
*/
|
||||
explicit GaussianKernelFunc(float _radius = 1.5f) : radius(_radius), sigma(_radius / kRadiusMultiplier) {}
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= radius) {
|
||||
return 0.0;
|
||||
}
|
||||
return std::exp(-x * x / (2.0 * sigma * sigma));
|
||||
}
|
||||
float Radius() const { return radius; }
|
||||
const float radius;
|
||||
// Gaussian standard deviation
|
||||
const float sigma;
|
||||
};
|
||||
|
||||
struct BoxKernelFunc {
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
return x < 0.5f ? 1.0f : x == 0.5f ? 0.5f : 0.0f;
|
||||
}
|
||||
float Radius() const { return 1.f; }
|
||||
};
|
||||
|
||||
struct TriangleKernelFunc {
|
||||
// https://en.wikipedia.org/wiki/Triangle_function
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
return x < 1.0f ? 1.0f - x : 0.0f;
|
||||
}
|
||||
float Radius() const { return 1.f; }
|
||||
};
|
||||
|
||||
struct KeysCubicKernelFunc {
|
||||
/**
|
||||
* http://ieeexplore.ieee.org/document/1163711/
|
||||
* R. G. Keys. Cubic convolution interpolation for digital image
|
||||
* processing. IEEE Transactions on Acoustics, Speech, and Signal
|
||||
* Processing, 29(6):1153–1160, 1981.
|
||||
*/
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= 2.0f) {
|
||||
return 0.0f;
|
||||
} else if (x >= 1.0f) {
|
||||
return ((-0.5f * x + 2.5f) * x - 4.0f) * x + 2.0f;
|
||||
} else {
|
||||
return ((1.5f * x - 2.5f) * x) * x + 1.0f;
|
||||
}
|
||||
}
|
||||
float Radius() const { return 2.f; }
|
||||
};
|
||||
|
||||
struct MitchellCubicKernelFunc {
|
||||
/**
|
||||
* https://doi.org/10.1145/378456.378514
|
||||
* D. P. Mitchell and A. N. Netravali. Reconstruction filters in computer
|
||||
* graphics. Computer Graphics (Proceedings of ACM SIGGRAPH 1988),
|
||||
* 22(4):221–228, 1988.
|
||||
*/
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= 2.0f) {
|
||||
return 0.0f;
|
||||
} else if (x >= 1.0f) {
|
||||
return (((-7.0f / 18.0f) * x + 2.0f) * x - 10.0f / 3.0f) * x + 16.0f / 9.0f;
|
||||
} else {
|
||||
return (((7.0f / 6.0f) * x - 2.0f) * x) * x + 8.0f / 9.0f;
|
||||
}
|
||||
}
|
||||
float Radius() const { return 2.f; }
|
||||
};
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos1Kernel() { return LanczosKernelFunc(1.0); }
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos3Kernel() { return LanczosKernelFunc(3.0); }
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos5Kernel() { return LanczosKernelFunc(5.0); }
|
||||
|
||||
inline GaussianKernelFunc CreateGaussianKernel() { return GaussianKernelFunc(1.5); }
|
||||
|
||||
inline BoxKernelFunc CreateBoxKernel() { return BoxKernelFunc(); }
|
||||
|
||||
inline TriangleKernelFunc CreateTriangleKernel() { return TriangleKernelFunc(); }
|
||||
|
||||
inline KeysCubicKernelFunc CreateKeysCubicKernel() { return KeysCubicKernelFunc(); }
|
||||
|
||||
inline MitchellCubicKernelFunc CreateMitchellCubicKernel() { return MitchellCubicKernelFunc(); }
|
||||
|
||||
} // namespace aicpu
|
||||
|
||||
#endif  // AICPU_UTILS_SAMPLING_KERNELS_H_
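For context, a small sketch of how these functors are typically used to build resampling weights (illustrative helper, not code from this commit): evaluate the kernel at each contributing input position's distance from the sample center, then normalize so the weights sum to one.

#include <cmath>
#include <vector>

// Computes normalized Lanczos3 weights for one output sample centered at
// `center` (in input-pixel coordinates). Illustrative only.
std::vector<float> Lanczos3Weights(float center) {
  aicpu::LanczosKernelFunc kernel = aicpu::CreateLanczos3Kernel();
  const int lo = static_cast<int>(std::ceil(center - kernel.Radius()));
  const int hi = static_cast<int>(std::floor(center + kernel.Radius()));
  std::vector<float> weights;
  float total = 0.0f;
  for (int x = lo; x <= hi; ++x) {
    const float w = kernel(static_cast<float>(x) - center);
    weights.push_back(w);
    total += w;
  }
  if (total != 0.0f) {
    for (float &w : weights) {
      w /= total;  // weights now sum to 1
    }
  }
  return weights;
}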
|
|
@@ -81,8 +81,49 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
|
|||
mindspore::kQuantileOpName,
|
||||
mindspore::kSparseSegmentSqrtNOpName,
|
||||
mindspore::kUnsortedSegmentProdOpName,
|
||||
mindspore::kMulOpName,
|
||||
mindspore::kExpOpName};
|
||||
mindspore::kExpOpName,
|
||||
mindspore::kMatrixTriangularSolveOpName,
|
||||
mindspore::kMaximumGradGradOpName,
|
||||
mindspore::kMaxPoolOpName,
|
||||
mindspore::kMinimumGradGradOpName,
|
||||
mindspore::kMulNoNanOpName,
|
||||
mindspore::kMultilabelMarginLossGradOpName,
|
||||
mindspore::kNthElementOpName,
|
||||
mindspore::kNonMaxSuppressionWithOverlapsOpName,
|
||||
mindspore::kOneHotOpName,
|
||||
mindspore::kOrgqrOpName,
|
||||
mindspore::kPackOpName,
|
||||
mindspore::kParameterizedTruncatedNormalOpName,
|
||||
mindspore::kPolarOpName,
|
||||
mindspore::kPdistGradOpName,
|
||||
mindspore::kRaggedRangeOpName,
|
||||
mindspore::kRaggedTensorToSparseOpName,
|
||||
mindspore::kRaggedTensorToTensorOpName,
|
||||
mindspore::kReciprocalOpName,
|
||||
mindspore::kReciprocalGradOpName,
|
||||
mindspore::kReduceMeanOpName,
|
||||
mindspore::kReduceProdOpName,
|
||||
mindspore::kReluOpName,
|
||||
mindspore::kReverseV2OpName,
|
||||
mindspore::kRGBToHSVOpName,
|
||||
mindspore::kRsqrtGradOpName,
|
||||
mindspore::kSampleDistortedBoundingBoxExt2OpName,
|
||||
mindspore::kScaleAndTranslateGradOpName,
|
||||
mindspore::kScatterNdOpName,
|
||||
mindspore::kScatterNdUpdateOpName,
|
||||
mindspore::kSelectOpName,
|
||||
mindspore::kSelfAdjointEigOpName,
|
||||
mindspore::kSinOpName,
|
||||
mindspore::kSincOpName,
|
||||
mindspore::kSinhOpName,
|
||||
mindspore::kSmoothL1LossGradV2OpName,
|
||||
mindspore::kSmoothL1LossV2OpName,
|
||||
mindspore::kSignOpName,
|
||||
mindspore::kCheckNumericsOpName,
|
||||
mindspore::kFloorDivOpName,
|
||||
mindspore::kLog1pOpName,
|
||||
mindspore::kMulOpName};
|
||||
|
||||
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
|
||||
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
|
||||
|
||||
|
|
|
@@ -185,3 +185,38 @@ from .qr import _qr_aicpu
|
|||
from .col2im import _col2im_aicpu
|
||||
from .matrix_solve_ls import _matrix_solve_ls_aicpu
|
||||
from .exp import _exp_aicpu
|
||||
from .matrix_triangular_solve import _matrix_triangular_solve_aicpu
|
||||
from .maximum_grad_grad import _maximum_grad_grad_aicpu
|
||||
from .maxpool_v1 import _maxpool_v1_aicpu
|
||||
from .minimum_grad_grad import _minimum_grad_grad_aicpu
|
||||
from .mul_no_nan import _mul_no_nan_aicpu
|
||||
from .multilabel_margin_loss_grad import _multilabel_margin_loss_grad_aicpu
|
||||
from .nth_element import _nth_element_aicpu
|
||||
from .non_max_suppression_with_overlaps import _non_max_suppression_with_overlaps_aicpu
|
||||
from .one_hot import _one_hot_aicpu
|
||||
from .orgqr import _orgqr_aicpu
|
||||
from .parameterized_truncated_normal import _parameterized_truncated_normal_aicpu
|
||||
from .polar import _polar_aicpu
|
||||
from .pdist_grad import _pdist_grad_aicpu
|
||||
from .ragged_range import _raggedrange_aicpu
|
||||
from .ragged_tensor_to_sparse import _ragged_tensor_to_sparse_aicpu
|
||||
from .ragged_tensor_to_tensor import _ragged_tensor_to_tensor_aicpu
|
||||
from .reciprocal import _reciprocal_aicpu
|
||||
from .reciprocal_grad import _reciprocal_grad_aicpu
|
||||
from .reduce_mean import _reduce_mean_aicpu
|
||||
from .reduce_prod import _reduce_prod_aicpu
|
||||
from .relu_v3 import _relu_v3_aicpu
|
||||
from .reversev2 import _reversev2_aicpu
|
||||
from .rgb_to_hsv import _rgb_to_hsv_aicpu
|
||||
from .rsqrt_grad import _rsqrt_grad_aicpu
|
||||
from .sample_distorted_bounding_box_v2 import _sample_distorted_bounding_box_v2_aicpu
|
||||
from .scale_and_translate_grad import _scale_and_translate_grad_aicpu
|
||||
from .scatter_nd import _scatter_nd_aicpu
|
||||
from .scatter_nd_update import _scatter_nd_update_aicpu
|
||||
from .select import _select_aicpu
|
||||
from .self_adjoint_eig import _self_adjoint_eig_aicpu
|
||||
from .sin import _sin_aicpu
|
||||
from .sinc import _sinc_aicpu
|
||||
from .sinh import _sinh_aicpu
|
||||
from .smooth_l1_loss_grad import _smooth_l1_loss_grad_aicpu
|
||||
from .smooth_l1_loss import _smooth_l1_loss_aicpu
|
||||
|
|