forked from mindspore-Ecosystem/mindspore
!47329 migrates aicpu ops from CANN pr
Merge pull request !47329 from 李林杰/1229_migrates_aicpu_from_CAAN_pr
commit 76f46b52c4
@@ -83,18 +83,10 @@
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"

# AICPU migration
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "redundantAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "nullPointerRedundantCheck"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unreadVariable"
@@ -104,3 +96,4 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
@@ -280,3 +280,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
@@ -163,6 +163,7 @@ constexpr auto kClipBoxesDOpName = "kClipBoxesD";
constexpr auto kClipByNormNoDivSumOpName = "ClipByNormNoDivSum";
constexpr auto kClipByValueOpName = "ClipByValue";
constexpr auto kCoalesceOpName = "Coalesce";
constexpr auto kCol2imOpName = "Col2im";
constexpr auto kCombineMomentumOpName = "CombineMomentum";
constexpr auto kCombineMomentumWeightOpName = "CombineMomentumWeight";
constexpr auto kComputeAccidentalHitsOpName = "ComputeAccidentalHits";
@@ -209,6 +210,7 @@ constexpr auto kCumSumOpName = "CumSum";
constexpr auto kDataFormatDimMapOpName = "DataFormatDimMap";
constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
constexpr auto kDeadNodeName = "DeadNode";
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
@@ -346,6 +348,8 @@ constexpr auto kInplaceUpdateOpName = "InplaceUpdate";
constexpr auto kInplaceUpdateDOpName = "InplaceUpdateD";
constexpr auto kInstanceNorm = "InstanceNorm";
constexpr auto kInstanceNormGradOpName = "InstanceNormGrad";
constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
constexpr auto kInTopKOpName = "InTopK";
constexpr auto kInTopKDOpName = "InTopKD";
constexpr auto kIsInfOpName = "IsInf";
@@ -403,6 +407,7 @@ constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@@ -422,6 +427,7 @@ constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMultinomialOpName = "Multinomial";
@@ -439,6 +445,7 @@ constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kPadAndShiftOpName = "PadAndShift";
@@ -472,6 +479,7 @@ constexpr auto kPullWeightOpName = "PullWeight";
constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
@@ -591,6 +599,7 @@ constexpr auto kSparseSliceOpName = "SparseSlice";
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
constexpr auto kSparseSparseMinimumOpName = "SparseSparseMinimum";
constexpr auto kSparseSparseMaximumOpName = "SparseSparseMaximum";
constexpr auto kSparseTensorDenseMatMulOpName = "SparseTensorDenseMatMul";
constexpr auto kSplitOpName = "Split";
constexpr auto kSplitDOpName = "SplitD";
constexpr auto kSplitVOpName = "SplitV";
@@ -604,6 +613,7 @@ constexpr auto kStackDestroyOpName = "StackDestroy";
constexpr auto kStackInitOpName = "StackInit";
constexpr auto kStackOpName = "Stack";
constexpr auto kPackOpName = "Pack";
constexpr auto kSparseSegmentSqrtNOpName = "SparseSegmentSqrtN";
constexpr auto kStackPopOpName = "StackPop";
constexpr auto kStackPushOpName = "StackPush";
constexpr auto kStandardLaplaceOpName = "StandardLaplace";
@@ -0,0 +1,236 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "col2im.h"

#include <algorithm>
#include <vector>

#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kCol2imInputNum = 2;
const uint32_t kCol2imOutputNum = 1;
constexpr uint32_t kValue0 = 0;
constexpr uint32_t kValue1 = 1;
constexpr uint32_t kValue2 = 2;
constexpr uint32_t kValue4 = 4;
constexpr uint32_t kIndex0 = 0;
constexpr uint32_t kIndex1 = 1;
constexpr uint32_t kIndex2 = 2;
constexpr uint32_t kIndex3 = 3;
const char *kCol2im = "Col2im";
}  // namespace

namespace aicpu {
uint32_t Col2imCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(Col2imParamCheck(ctx), "[%s] check params failed.", kCol2im);
  auto data_type = ctx.Input(0)->GetDataType();
  uint32_t ret = KERNEL_STATUS_OK;
  switch (data_type) {
    case DT_FLOAT:
      ret = Col2imCompute<float>(ctx);
      break;
    case DT_FLOAT16:
      ret = Col2imCompute<Eigen::half>(ctx);
      break;
    default:
      KERNEL_LOG_ERROR("Col2im kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      ret = KERNEL_STATUS_PARAM_INVALID;
      break;
  }

  return ret;
}

template <typename T>
static inline T div_rtn(T x, T y) {
  // Floor division: round the quotient toward negative infinity.
  T q = x / y;
  T r = x % y;
  if ((r != 0) && ((r < 0) != (y < 0))) {
    --q;
  }
  return q;
}

uint32_t Col2imCpuKernel::Col2imParamCheck(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kCol2imInputNum, kCol2imOutputNum), "[%s] check params failed.", kCol2im);
  Tensor *input_ = ctx.Input(0);
  Tensor *output_size_ = ctx.Input(1);
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("kernel_size"), KERNEL_STATUS_PARAM_INVALID,
                       "Get ctx.GetAttr(\"kernel_size\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("dilation"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"dilation\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("padding"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"padding\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("stride"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"stride\") failed.");
  std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
  std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
  std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
  std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();
  auto output_size_shape = output_size_->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((output_size_shape.size() == kValue1 && output_size_->NumElements() == kValue2),
                     KERNEL_STATUS_PARAM_INVALID,
                     "Expected a 1D output_size tensor with non-zero dimensions and "
                     "2 elements, but got a %dD tensor for output_size with %d elements.",
                     output_size_shape.size(), output_size_->NumElements());
  KERNEL_CHECK_FALSE(kernel_size.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected kernel_size's size equals to 2, but got size %d.", kernel_size.size());
  KERNEL_CHECK_FALSE(dilation.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected dilation's size equals to 2, but got size %d.", dilation.size());
  KERNEL_CHECK_FALSE(padding.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected padding's size equals to 2, but got size %d.", padding.size());
  KERNEL_CHECK_FALSE(stride.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected stride's size equals to 2, but got size %d.", stride.size());
  int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
  std::vector<int64_t> output_size(kValue2, kValue0);
  output_size[kIndex0] = output_size_data[kIndex0];
  output_size[kIndex1] = output_size_data[kIndex1];
  const int64_t output_height = output_size.front();
  const int64_t output_width = output_size.back();
  const int64_t kernel_height = kernel_size.front();
  const int64_t kernel_width = kernel_size.back();
  const int64_t dilation_height = dilation.front();
  const int64_t dilation_width = dilation.back();
  const int64_t pad_height = padding.front();
  const int64_t pad_width = padding.back();
  const int64_t stride_height = stride.front();
  const int64_t stride_width = stride.back();
  KERNEL_CHECK_FALSE(output_width > kValue0 && output_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "output should be greater than zero, but got "
                     "output_height: %d output_width: %d.",
                     output_height, output_width);
  KERNEL_CHECK_FALSE(kernel_width > kValue0 && kernel_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "kernel should be greater than zero, but got "
                     "kernel_height: %d kernel_width: %d.",
                     kernel_height, kernel_width);
  KERNEL_CHECK_FALSE(dilation_width > kValue0 && dilation_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "dilation should be greater than zero, but got "
                     "dilation_height: %d dilation_width: %d.",
                     dilation_height, dilation_width);
  KERNEL_CHECK_FALSE(pad_width >= kValue0 && pad_height >= kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "padding should be non-negative, but got pad_height: "
                     "%d pad_width: %d.",
                     pad_height, pad_width);
  KERNEL_CHECK_FALSE(stride_width > kValue0 && stride_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "stride should be greater than zero, but got "
                     "stride_height: %d stride_width: %d.",
                     stride_height, stride_width);
  auto input_shape = input_->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE(
    (input_shape.size() == kValue4 && input_shape[kIndex0] != kValue0 && input_shape[kIndex1] != kValue0 &&
     input_shape[kIndex2] != kValue0 && input_shape[kIndex3] != kValue0),
    KERNEL_STATUS_PARAM_INVALID,
    "Expected 4D (batch mode) tensor for input with non-zero "
    "batch size and non-zero dimensions for input, but got %dD input: (%d %d "
    "%d %d).",
    input_shape.size(), input_shape[kIndex0], input_shape[kIndex1], input_shape[kIndex2], input_shape[kIndex3]);
  KERNEL_CHECK_FALSE(input_shape[kIndex2] == (kernel_width * kernel_height), KERNEL_STATUS_PARAM_INVALID,
                     "Expected size of input's dimension 2 to match the calculated "
                     "number of kernel_size, but got input_shape[2]=%d and kernel_size=(%d, "
                     "%d).",
                     input_shape[kIndex2], kernel_height, kernel_width);
  auto input_length = input_shape[kIndex3];
  int64_t n_blocks_height =
    div_rtn<int64_t>(output_height + 2 * pad_height - dilation_height * (kernel_height - 1) - 1, stride_height) + 1;
  int64_t n_blocks_width =
    div_rtn<int64_t>(output_width + 2 * pad_width - dilation_width * (kernel_width - 1) - 1, stride_width) + 1;
  KERNEL_CHECK_FALSE(input_length == (n_blocks_height * n_blocks_width), KERNEL_STATUS_PARAM_INVALID,
                     "Given output_size=(%d, %d), kernel_size=(%d, %d), dilation=(%d, %d"
                     "), padding=(%d, %d), stride=(%d, %d), expected size of input's "
                     "dimension 3 to match the calculated "
                     "number of sliding blocks %d * %d = %d, but got input.size(3)=%d.",
                     output_height, output_width, kernel_height, kernel_width, dilation_height, dilation_width,
                     pad_height, pad_width, stride_height, stride_width, n_blocks_height, n_blocks_width,
                     (n_blocks_height * n_blocks_width), input_length);
  return KERNEL_STATUS_OK;
}

template <typename T>
void Col2imCpuKernel::InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data,
                                   T *output_data) {
  int64_t w_offset = c_col % kernel_width;
  int64_t h_offset = (c_col / kernel_width) % kernel_height;
  int64_t c_im = c_col / kernel_height / kernel_width;
  for (int64_t h_col = 0; h_col < height_col; ++h_col) {
    int64_t h_im = h_col * stride_height - pad_height + h_offset * dilation_height;
    for (int64_t w_col = 0; w_col < width_col; ++w_col) {
      int64_t w_im = w_col * stride_width - pad_width + w_offset * dilation_width;
      // Scatter-add each column element back into its image position, skipping padded positions.
      if (h_im >= 0 && h_im < output_height && w_im >= 0 && w_im < output_width) {
        output_data[output_offset + (c_im * output_height + h_im) * output_width + w_im] +=
          input_data[input_offset + (c_col * height_col + h_col) * width_col + w_col];
      }
    }
  }
}

template <typename T>
uint32_t Col2imCpuKernel::Col2imCompute(CpuKernelContext &ctx) {
  Tensor *input_ = ctx.Input(0);
  Tensor *output_size_ = ctx.Input(1);
  Tensor *output_ = ctx.Output(0);
  int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
  std::vector<int64_t> output_size(kValue2, kValue0);
  output_size[kIndex0] = output_size_data[kIndex0];
  output_size[kIndex1] = output_size_data[kIndex1];

  std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
  std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
  std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
  std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();

  output_height = output_size.front();
  output_width = output_size.back();
  kernel_height = kernel_size.front();
  kernel_width = kernel_size.back();
  dilation_height = dilation.front();
  dilation_width = dilation.back();
  pad_height = padding.front();
  pad_width = padding.back();
  stride_height = stride.front();
  stride_width = stride.back();

  auto input_shape = input_->GetTensorShape()->GetDimSizes();
  const int64_t batch_size = input_shape[kIndex0];
  const int64_t n_input_plane = input_shape[kIndex1];

  height_col =
    (output_height + kValue2 * pad_height - (dilation_height * (kernel_height - kValue1) + kValue1)) / stride_height +
    1;
  width_col =
    (output_width + kValue2 * pad_width - (dilation_width * (kernel_width - kValue1) + kValue1)) / stride_width + 1;

  T *input_data = reinterpret_cast<T *>(input_->GetData());
  T *output_data = reinterpret_cast<T *>(output_->GetData());
  std::fill_n(output_data, output_->NumElements(), T(0));
  channels_col = n_input_plane * kernel_height * kernel_width;
  batch_input_size = n_input_plane * kernel_height * kernel_width * height_col * width_col;
  batch_output_size = n_input_plane * output_height * output_width;
  for (int64_t elt = 0; elt < batch_size; ++elt) {
    int64_t input_offset = batch_input_size * elt;
    int64_t output_offset = batch_output_size * elt;
    for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
      InnerCompute<T>(c_col, input_offset, output_offset, input_data, output_data);
    }
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kCol2im, Col2imCpuKernel);
}  // namespace aicpu
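
The parameter check above follows the usual sliding-window bookkeeping: along each spatial axis the number of column blocks the input must carry is floor((output + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1, and InnerCompute then scatters column (c_col, h_col, w_col) back into image position (c_im, h_im, w_im), accumulating overlapping contributions. A minimal standalone sketch of that block count, assuming plain C++ outside the CpuKernel framework (the helper name is illustrative, not part of the kernel):

#include <cstdint>

// Number of sliding blocks along one spatial axis; matches the div_rtn-based
// expression in Col2imParamCheck when the numerator is non-negative.
inline int64_t NumSlidingBlocks(int64_t output, int64_t pad, int64_t dilation, int64_t kernel, int64_t stride) {
  return (output + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

// Example: output=4, pad=0, dilation=1, kernel=2, stride=1 gives 3 blocks per axis,
// so a valid Col2im input has input.size(3) == 3 * 3 = 9.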
@@ -0,0 +1,50 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_COL2IM_H_
#define AICPU_KERNELS_NORMALIZED_COL2IM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class Col2imCpuKernel : public CpuKernel {
 public:
  Col2imCpuKernel() = default;
  ~Col2imCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t Col2imParamCheck(CpuKernelContext &ctx);
  template <typename T>
  uint32_t Col2imCompute(CpuKernelContext &ctx);
  template <typename T>
  void InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data, T *output_data);

  int64_t output_height, output_width;
  int64_t kernel_height, kernel_width;
  int64_t dilation_height, dilation_width;
  int64_t pad_height, pad_width;
  int64_t stride_height, stride_width;

  int64_t height_col, width_col;

  int64_t channels_col, batch_input_size, batch_output_size;
};
}  // namespace aicpu
#endif
@@ -0,0 +1,211 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
#include "cumulativelogsumexp.h"

#include <cmath>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t KCumulativeLogsumexpInputNum = 2;
const uint32_t KCumulativeLogsumexpOutputNum = 1;
const float float16_exclusive_data = -65504e+0;
const float float_exclusive_data = -3.4028235e+38;
const double double_exclusive_data = -1.7976931348623157e+308;
const int64_t ParallelFor_size_float16 = 16 * 1024;
const int64_t ParallelFor_size_float32 = 32 * 1024;
const int64_t ParallelFor_size_double = 64 * 1024;
const char *KCumulativeLogsumexp = "CumulativeLogsumexp";
#define CUMULATIVELOGSUMEXP_COMPUTE_CASE(DTYPE, IN_TYPE, CTX)           \
  case (DTYPE): {                                                       \
    uint32_t result = CumulativeLogsumexpCompute<IN_TYPE>(CTX);         \
    if (result != KERNEL_STATUS_OK) {                                   \
      KERNEL_LOG_ERROR("CumulativeLogsumexp kernel compute failed.");   \
      return result;                                                    \
    }                                                                   \
    break;                                                              \
  }
}  // namespace
namespace aicpu {
uint32_t CumulativeLogsumexpCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, KCumulativeLogsumexpInputNum, KCumulativeLogsumexpOutputNum),
                      "[%s] check input and output failed.", KCumulativeLogsumexp);
  KERNEL_HANDLE_ERROR(CumulativeLogsumexpCheck(ctx), "[%s] check params failed.", KCumulativeLogsumexp);
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("CumulativeLogsumexp kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCheck(CpuKernelContext &ctx) {
  KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT16 || ctx.Input(1)->GetDataType() == DT_INT32),
                     KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not supported, axis data type is [%u].",
                     ctx.Input(1)->GetDataType())
  KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 1, KERNEL_STATUS_PARAM_INVALID, "axis is out of shape");
  auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
  int64_t axis = *axis_data;
  KERNEL_CHECK_FALSE((axis < ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
                     "axis is larger than input dims - 1")
  KERNEL_CHECK_FALSE((axis >= -ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
                     "axis is lower than -input dims")
  std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
                     "Input must be at least rank 1, got [%zu].", shape_input.size())
  KERNEL_CHECK_FALSE((shape_input.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
                     "The output shape size should be same as the input shape size")
  DataType input0_type = ctx.Input(0)->GetDataType();
  DataType output0_type = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with output0 [%s].", DTypeStr(input0_type).c_str(),
                     DTypeStr(output0_type).c_str())
  return KERNEL_STATUS_OK;
}
template <typename T>
void CumulativeProcess(size_t outer_start, size_t outer_end, uint32_t outer, uint32_t inner, uint32_t depth,
                       bool reverse, bool exclusive, T *input_data, T *output_data, DataType data_type) {
  // Process the [outer_start, outer_end) slice of the outer dimension; each (outer, inner)
  // pair owns an independent cumulative scan along depth, so slices can run in parallel.
  for (size_t outer_index = outer_start; outer_index < outer_end; ++outer_index) {
    size_t outer_index_adj;
    if (reverse) {
      outer_index_adj = (outer - 1) - outer_index;
    } else {
      outer_index_adj = outer_index;
    }
    for (size_t inner_index = 0; inner_index < inner; ++inner_index) {
      double one = 1;
      double temp = 0;
      size_t inner_index_adj;
      if (reverse) {
        inner_index_adj = (inner - 1) - inner_index;
      } else {
        inner_index_adj = inner_index;
      }
      for (size_t depth_index = 0; depth_index < depth; ++depth_index) {
        size_t depth_index_adj;
        if (reverse) {
          depth_index_adj = (depth - 1) - depth_index;
        } else {
          depth_index_adj = depth_index;
        }
        size_t index = outer_index_adj;
        index += inner_index_adj * depth * outer;
        index += depth_index_adj * outer;
        if (exclusive) {
          if (depth_index == 0) {
            if (data_type == DT_FLOAT16) {
              output_data[index] = static_cast<T>(float16_exclusive_data);
            } else if (data_type == DT_FLOAT) {
              output_data[index] = static_cast<T>(float_exclusive_data);
            } else {
              output_data[index] = static_cast<T>(double_exclusive_data);
            }
            temp = static_cast<double>(input_data[index]);
          } else {
            output_data[index] = static_cast<T>(temp);
            double a = temp;
            double b, min0, max0;
            b = static_cast<double>(input_data[index]);
            min0 = (a < b) ? a : b;
            max0 = (a > b) ? a : b;
            temp = std::log(one + std::exp(min0 - max0)) + max0;
          }
        } else {
          if (depth_index == 0) {
            output_data[index] = input_data[index];
            temp = static_cast<double>(input_data[index]);
          } else {
            double a = temp;
            double b, min0, max0;
            b = static_cast<double>(input_data[index]);
            min0 = (a < b) ? a : b;
            max0 = (a > b) ? a : b;
            output_data[index] = static_cast<T>(std::log(one + std::exp(min0 - max0)) + max0);
            temp = std::log(one + std::exp(min0 - max0)) + max0;
          }
        }
      }
    }
  }
}
template <typename T>
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCompute(CpuKernelContext &ctx) {
  auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
  auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
  bool exclusive = false;
  bool reverse = false;
  AttrValue *exclusive_attr = ctx.GetAttr("exclusive");
  if (exclusive_attr != nullptr) {
    exclusive = exclusive_attr->GetBool();
  }
  AttrValue *reverse_attr = ctx.GetAttr("reverse");
  if (reverse_attr != nullptr) {
    reverse = reverse_attr->GetBool();
  }
  int32_t axis = 0;
  if (axis_data != nullptr) {
    axis = *axis_data;
  }
  auto output_data = static_cast<T *>(ctx.Output(0)->GetData());
  auto shape = ctx.Input(0)->GetTensorShape();
  const int64_t rank = shape->GetDims();
  if (axis < 0) {
    axis += shape->GetDims();
  }
  uint32_t inner = 1;
  uint32_t outer = 1;
  uint32_t depth = 1;
  for (int32_t i = 0; i < rank; ++i) {
    if (i < axis) {
      inner *= shape->GetDimSize(i);
    } else if (i > axis) {
      outer *= shape->GetDimSize(i);
    } else {
      depth = shape->GetDimSize(i);
    }
  }  // end for
  auto data_type = ctx.Input(0)->GetDataType();
  int64_t data_num = ctx.Input(0)->NumElements();
  int64_t data_size = data_num * sizeof(T);
  if ((data_type == DT_FLOAT16 && data_size <= ParallelFor_size_float16) ||
      (data_type == DT_FLOAT && data_size <= ParallelFor_size_float32) ||
      (data_type == DT_DOUBLE && data_size <= ParallelFor_size_double)) {
    CumulativeProcess<T>(0, outer, outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > outer) {
      max_core_num = outer;
    }
    auto shard_cumulativelogsumexp = [&](size_t start, size_t end) {
      // Each shard handles only its own [start, end) range of the outer dimension.
      CumulativeProcess<T>(start, end, outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
    };
    if (max_core_num == 0) {
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, outer, outer / max_core_num, shard_cumulativelogsumexp),
                        "CumulativeLogsumexp Compute failed.");
  }  // end else
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(KCumulativeLogsumexp, CumulativeLogsumexpCpuKernel);
}  // namespace aicpu
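
The inner update in CumulativeProcess is the standard numerically stable log-add-exp recurrence, y_i = log(exp(y_{i-1}) + exp(x_i)), evaluated as max + log(1 + exp(min - max)) so the larger operand never overflows. A minimal 1-D sketch of the inclusive, forward case, assuming plain C++ outside the kernel framework (function names are illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable pairwise log-add-exp, the same identity used element-wise above.
inline double LogAddExp(double a, double b) {
  const double hi = std::max(a, b);
  const double lo = std::min(a, b);
  return hi + std::log1p(std::exp(lo - hi));
}

// Inclusive forward cumulative logsumexp over a 1-D vector; the kernel additionally
// handles the exclusive and reverse modes and an arbitrary axis.
std::vector<double> CumulativeLogSumExp1D(const std::vector<double> &x) {
  std::vector<double> y(x.size());
  double acc = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    acc = (i == 0) ? x[i] : LogAddExp(acc, x[i]);
    y[i] = acc;
  }
  return y;
}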
@@ -0,0 +1,38 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_
#define AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class CumulativeLogsumexpCpuKernel : public CpuKernel {
 public:
  CumulativeLogsumexpCpuKernel() = default;
  ~CumulativeLogsumexpCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t CumulativeLogsumexpCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t CumulativeLogsumexpCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,126 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "data_format_vec_permute.h"

#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
using namespace std;

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kDataFormatVecPermute = "DataFormatVecPermute";

#define DATAFORMATVECPERMUTE_COMPUTE_CASE(DTYPE, TYPE, DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX)    \
  case (DTYPE): {                                                                                         \
    uint32_t result = DataFormatVecPermuteCompute<TYPE>(DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX);  \
    if (result != KERNEL_STATUS_OK) {                                                                     \
      KERNEL_LOG_ERROR("DataFormatVecPermute kernel compute failed.");                                    \
      return result;                                                                                      \
    }                                                                                                     \
    break;                                                                                                \
  }

}  // namespace

namespace aicpu {
uint32_t DataFormatVecPermute::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check DataFormatVecPermute params failed.");
  AttrValue *src_format = ctx.GetAttr("src_format");
  std::string src_format_str = src_format->GetString();
  KERNEL_CHECK_FALSE((src_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
                     "src_format must be of length 4, but the length of src_format = [%d].", src_format_str.size());
  AttrValue *dst_format = ctx.GetAttr("dst_format");
  std::string dst_format_str = dst_format->GetString();
  KERNEL_CHECK_FALSE((dst_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
                     "dst_format must be of length 4, but the length of dst_format = [%d].", dst_format_str.size());
  Tensor *x = ctx.Input(0);
  auto x_shape = x->GetTensorShape();
  int32_t dim = x_shape->GetDims();
  KERNEL_CHECK_FALSE((dim == 1 || dim == 2), KERNEL_STATUS_PARAM_INVALID,
                     "Input dimension must be 1 or 2, but got dimension = [%d].", dim);
  Tensor *y = ctx.Output(0);
  auto y_shape = y->GetTensorShape();
  if (dim == 1) {
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "1D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "1D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
  } else if (dim == 2) {
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "First dimension of 2D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
                       "Second dimension of 2D Input must be of size 2, but got size %lld.", x_shape->GetDimSize(1));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "First dimension of 2D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
                       "Second dimension of 2D Output must be of size 2, but got size %lld.", y_shape->GetDimSize(1));
  }

  auto x_type = x->GetDataType();
  auto y_type = y->GetDataType();
  KERNEL_CHECK_FALSE((x_type == y_type), KERNEL_STATUS_PARAM_INVALID,
                     "Input[%s] and output[%s] must have the same DataType.", DTypeStr(x_type).c_str(),
                     DTypeStr(y_type).c_str());
  switch (x_type) {
    DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT32, int32_t, dim, src_format_str, dst_format_str, x, y, ctx)
    DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT64, int64_t, dim, src_format_str, dst_format_str, x, y, ctx)
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(x_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t DataFormatVecPermute::DataFormatVecPermuteCompute(const int32_t dim, const string &src_format_str,
                                                           const string &dst_format_str, Tensor *x, Tensor *y,
                                                           CpuKernelContext &ctx) {
  T *x_addrs = reinterpret_cast<T *>(x->GetData());
  T *y_addrs = reinterpret_cast<T *>(y->GetData());

  if (dim == 1) {
    for (uint64_t i = 0; i < dst_format_str.size(); i++) {
      for (uint64_t j = 0; j < src_format_str.size(); j++) {
        if (dst_format_str[i] == src_format_str[j]) {
          y_addrs[i] = x_addrs[j];
          break;
        }
      }
    }
  } else if (dim == 2) {
    for (uint64_t i = 0; i < dst_format_str.size(); i++) {
      for (uint64_t j = 0; j < src_format_str.size(); j++) {
        if (dst_format_str[i] == src_format_str[j]) {
          y_addrs[i * 2] = x_addrs[j * 2];
          y_addrs[i * 2 + 1] = x_addrs[j * 2 + 1];
          break;
        }
      }
    }
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kDataFormatVecPermute, DataFormatVecPermute);
}  // namespace aicpu
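
DataFormatVecPermuteCompute simply reorders entries by matching the characters of dst_format against src_format; in the 2-D case each matched row moves as a (begin, end) pair. A minimal 1-D sketch, assuming a plain std::vector input instead of Tensor (the function name is illustrative):

#include <cstdint>
#include <string>
#include <vector>

// Reorder a length-4 vector from the src layout to the dst layout by matching format characters.
std::vector<int64_t> PermuteByFormat(const std::vector<int64_t> &x, const std::string &src, const std::string &dst) {
  std::vector<int64_t> y(x.size());
  for (size_t i = 0; i < dst.size(); ++i) {
    for (size_t j = 0; j < src.size(); ++j) {
      if (dst[i] == src[j]) {
        y[i] = x[j];
        break;
      }
    }
  }
  return y;
}

// Example: PermuteByFormat({2, 224, 224, 3}, "NHWC", "NCHW") returns {2, 3, 224, 224}.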
@@ -0,0 +1,35 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_
#define AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_

#include <string>
#include "cpu_ops_kernel.h"

namespace aicpu {
class DataFormatVecPermute : public CpuKernel {
 public:
  DataFormatVecPermute() = default;
  ~DataFormatVecPermute() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DataFormatVecPermuteCompute(const int32_t dim, const std::string &src_format_str,
                                       const std::string &dst_format_str, Tensor *x, Tensor *y, CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,455 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "matrix_solve_ls.h"

#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <complex>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *MatrixSolveLs = "MatrixSolveLs";
const int64_t kNum2 = 2;
}  // namespace

namespace aicpu {
uint32_t MatrixSolveLsCpuKernel::Compute(CpuKernelContext &ctx) {
  bool qr_chole = (ctx.GetAttr("fast") == nullptr) ? true : ctx.GetAttr("fast")->GetBool();
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolveLs check input and output number failed.");

  Tensor *matrix = ctx.Input(kFirstInputIndex);
  Tensor *b = ctx.Input(kSecondInputIndex);
  Tensor *l2 = ctx.Input(2);
  Tensor *x = ctx.Output(0);
  if ((matrix->GetDataSize() == 0) || (b->GetDataSize() == 0)) {
    KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  auto shapea = matrix->GetTensorShape();
  auto shapeb = b->GetTensorShape();
  auto shapel2 = l2->GetTensorShape();
  auto shapex = x->GetTensorShape();
  auto dims = shapea->GetDims();

  if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
    if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(0)) {
      KERNEL_LOG_ERROR(
        "[%s] #Rows mismatch between A and rhs. "
        "#Rows of A = [%llu], #Rows of rhs = [%llu]",
        ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(0));
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(dims - kNum2)) {
      KERNEL_LOG_ERROR(
        "[%s] #Rows mismatch between A and rhs. "
        "#Rows of A = [%llu], #Rows of rhs = [%llu]",
        ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(dims - kNum2));
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  if (shapel2->GetDims() != 0) {
    KERNEL_LOG_ERROR("[%s] Tensor l2 should be a scalar.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
    if (shapex->GetDims() != shapeb->GetDims() || shapea->GetDimSize(dims - 1) != shapex->GetDimSize(0) ||
        shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(0)) {
      KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    if (shapex->GetDims() != shapeb->GetDims() ||
        shapea->GetDimSize(dims - 1) != shapex->GetDimSize(shapex->GetDims() - kNum2) ||
        shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(shapeb->GetDims() - 1)) {
      KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }

  auto a_data_type = matrix->GetDataType();
  auto b_data_type = b->GetDataType();
  if (a_data_type != b_data_type) {
    KERNEL_LOG_ERROR("[%s] Tensor data type mismatch.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (a_data_type != DT_FLOAT && a_data_type != DT_DOUBLE && a_data_type != DT_COMPLEX64 &&
      a_data_type != DT_COMPLEX128) {
    KERNEL_LOG_ERROR("MatrixSolveLs kernel data type [%s] not supported.", DTypeStr(a_data_type).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  if (qr_chole) {
    if (a_data_type == DT_COMPLEX64) {
      return ComplexCholesky<float>(ctx);
    }
    if (a_data_type == DT_COMPLEX128) {
      return ComplexCholesky<double>(ctx);
    }
    if (a_data_type == DT_DOUBLE) {
      return RealCholesky<double>(ctx);
    }
    if (a_data_type == DT_FLOAT) {
      return RealCholesky<float>(ctx);
    }
  } else {
    if (a_data_type == DT_COMPLEX64) {
      return ComplexQr<float>(ctx);
    }
    if (a_data_type == DT_COMPLEX128) {
      return ComplexQr<double>(ctx);
    }
    if (a_data_type == DT_DOUBLE) {
      return RealQr<double>(ctx);
    }
    if (a_data_type == DT_FLOAT) {
      return RealQr<float>(ctx);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(MatrixSolveLs, MatrixSolveLsCpuKernel);

template <typename T>
void MatrixSolveLsCpuKernel::RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k,
                                                       int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  if (m >= k) {
    a_copy =
      a.transpose() * a + ((T)*l2) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(k, k);
    a_b = a.transpose() * b;
  } else {
    a_copy = a * a.transpose();
    a_b = b;
  }
  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = a.transpose() * xi;
    }
    x.col(i) = xi;
  }
  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealCholesky(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr,
                                                          std::complex<T> *xptr, double *l2, int64_t m, int64_t k,
                                                          int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;
  auto l2value = std::abs(*l2);

  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  if (m >= k) {
    a_copy =
      A.transpose() * A +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * k, kNum2 * k);
    a_b = A.transpose() * b;
  } else {
    a_copy =
      A * A.transpose() +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * m, kNum2 * m);
    a_b = b;
  }

  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi;
  for (int64_t i = 0; i < n; i++) {
    xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = A.transpose() * xi;
    }
    x.col(i) = xi;
    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexCholesky(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
  auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(a);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;
  }
  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealQr(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr,
                                                    int64_t m, int64_t k, int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(A);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;

    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexQr(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

}  // namespace aicpu
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
|
||||
|
||||
#include <complex>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MatrixSolveLsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatrixSolveLsCpuKernel() = default;
|
||||
~MatrixSolveLsCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RealCholesky(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RealQr(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, double *l2,
|
||||
int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComplexCholesky(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, int64_t m, int64_t k,
|
||||
int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComplexQr(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,440 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "nuclear_norm.h"
|
||||
#include <string.h>
|
||||
#include <Eigen/Dense>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "kernel_util.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#define NoneN 1000
|
||||
using namespace Eigen;
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kNuclearNorm = "NuclearNorm";
|
||||
const size_t kNuclearNormInputNum = 1;
|
||||
const size_t kNuclearNormOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 1 * 1024;
|
||||
const size_t DIM_SIZE1 = 1;
|
||||
const size_t DIM_SIZE2 = 2;
|
||||
const size_t DIM_SIZE3 = 3;
|
||||
const size_t DIM_SIZE4 = 4;
|
||||
const size_t DIM_SIZE5 = 5;
|
||||
const size_t DIM_SIZE6 = 6;
|
||||
const size_t DIM_SIZE7 = 7;
|
||||
const size_t DIM_SIZE8 = 8;
|
||||
const size_t DIM_INDEX0 = 0;
|
||||
const size_t DIM_INDEX1 = 1;
|
||||
const size_t DIM_INDEX2 = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NuclearNormCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNuclearNormInputNum, kNuclearNormOutputNum),
|
||||
"NuclearNorm Check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(NuclearNormParamCheck(ctx), "NuclearNorm Check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case (DT_FLOAT): {
|
||||
res = NuclearNormCompute<float>(ctx);
|
||||
break;
|
||||
}
|
||||
case (DT_DOUBLE): {
|
||||
res = NuclearNormCompute<double>(ctx);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
KERNEL_LOG_ERROR("NuclearNorm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t NuclearNormCpuKernel::NuclearNormParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_FALSE((input->GetDataType() == output->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(input->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
|
||||
const size_t input_dimnum = input->GetTensorShape()->GetDims();
|
||||
KERNEL_CHECK_FALSE((input_dimnum >= DIM_SIZE2 && input_dimnum <= DIM_SIZE8), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of the dimension of the input tensor should be "
|
||||
"[%lld,%lld], but got input's dimension=%lld",
|
||||
DIM_SIZE2, DIM_SIZE8, input_dimnum);
|
||||
AttrValue *dim_ptr = ctx.GetAttr("dim");
|
||||
std::vector<int64_t> dim_temp = {0, 1};
|
||||
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
|
||||
if (dim_ptr == nullptr) {
|
||||
KERNEL_CHECK_FALSE((input_dimnum == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
|
||||
"When Attr dim is none, NuclearNorm expected a tensor with 2 "
|
||||
"dimensions, but got a tensor with [%lld] dimensions instead.",
|
||||
input_dimnum);
|
||||
}
|
||||
if (dim.size() == 1 && dim[0] == NoneN) {
|
||||
dim.clear();
|
||||
dim.push_back(0);
|
||||
dim.push_back(1);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((dim.size() == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Attr dim'size must equal to 2, but got dim's size : [%lld]", dim.size());
|
||||
int64_t lower_bound = -static_cast<int64_t>(input_dimnum);
|
||||
int64_t upper_bound = input_dimnum - 1;
|
||||
KERNEL_CHECK_FALSE((dim[0] >= lower_bound && dim[0] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of dim[0] should be [%lld,%lld], but got input dim[0]=%lld", lower_bound, upper_bound,
|
||||
dim[0]);
|
||||
KERNEL_CHECK_FALSE((dim[1] >= lower_bound && dim[1] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of dim[1] should be [%lld,%lld], but got input dim[1]=%lld", lower_bound, upper_bound,
|
||||
dim[1]);
|
||||
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
|
||||
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
|
||||
KERNEL_CHECK_FALSE((dim[0] != dim[1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The values in attr dim point to the same dimension.");
|
||||
KERNEL_LOG_DEBUG("NuclearNormCpuKernel[%s], input: size[%llu], output: size[%llu].", ctx.GetOpType().c_str(),
|
||||
input->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t NuclearNormCpuKernel::NuclearNormCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_ptr = ctx.Input(0);
|
||||
auto input_shape = input_ptr->GetTensorShape();
|
||||
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
switch (input_dims.size()) {
|
||||
case DIM_SIZE2:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE2>(ctx);
|
||||
break;
|
||||
case DIM_SIZE3:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE3>(ctx);
|
||||
break;
|
||||
case DIM_SIZE4:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE4>(ctx);
|
||||
break;
|
||||
case DIM_SIZE5:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE5>(ctx);
|
||||
break;
|
||||
case DIM_SIZE6:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE6>(ctx);
|
||||
break;
|
||||
case DIM_SIZE7:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE7>(ctx);
|
||||
break;
|
||||
case DIM_SIZE8:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE8>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR(
|
||||
"Only tensors with ranks between 2 and 8 are currently supported."
|
||||
"Tensor rank: [%d]",
|
||||
input_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
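// Computes the nuclear norm over the two dimensions selected by attr "dim": the input is
// reshaped to its static RANK, the two reduction axes are shuffled to the end, the result is
// viewed as (batch, d0, d1), and each d0 x d1 matrix is reduced to the sum of its singular
// values. Illustrative example (hypothetical input): a 2 x 2 identity matrix has singular
// values {1, 1}, so its nuclear norm is 2.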
template <typename T, int32_t RANK>
|
||||
uint32_t NuclearNormCpuKernel::ComputeTensorNuclearNorm(const CpuKernelContext &ctx) {
|
||||
Tensor *input_ptr = ctx.Input(0);
|
||||
auto input_shape = input_ptr->GetTensorShape();
|
||||
void *data_ptr = input_ptr->GetData();
|
||||
int64_t value_num_ = input_ptr->NumElements();
|
||||
|
||||
T *input_data_ptr = reinterpret_cast<T *>(data_ptr);
|
||||
int64_t total_copy_size = value_num_ * static_cast<int64_t>(sizeof(T));
|
||||
Eigen::Tensor<T, 1, Eigen::RowMajor> eigen_tensor(value_num_);
|
||||
int memcpy_ret = memcpy_s(&eigen_tensor(0), total_copy_size, input_data_ptr, total_copy_size);
|
||||
|
||||
if (memcpy_ret != 0) {
KERNEL_LOG_ERROR("memcpy_s error!");
return KERNEL_STATUS_INNER_ERROR;
}
|
||||
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
|
||||
std::array<Eigen::DenseIndex, RANK> dim_array;
|
||||
const int64_t input_dimnum = static_cast<int64_t>(input_shape->GetDims());
|
||||
for (int64_t i = 0; i < input_dimnum; i++) {
|
||||
dim_array.at(i) = input_dims[i];
|
||||
}
|
||||
Eigen::Tensor<T, RANK, Eigen::RowMajor> reshaped_tensor = eigen_tensor.reshape(dim_array);
|
||||
|
||||
AttrValue *dim_ptr = ctx.GetAttr("dim");
|
||||
std::vector<int64_t> dim_temp = {0, 1};
|
||||
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
|
||||
if (dim.size() == 1 && dim[0] == NoneN) {
|
||||
dim.clear();
|
||||
dim.push_back(0);
|
||||
dim.push_back(1);
|
||||
}
|
||||
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
|
||||
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
|
||||
|
||||
int64_t j = 0;
|
||||
for (int64_t i = 0; i < input_dimnum; i++) {
|
||||
if (i != dim[0] && i != dim[1]) {
|
||||
dim_array.at(j) = i;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
dim_array.at(j) = dim[0];
|
||||
dim_array.at(j + 1) = dim[1];
|
||||
Eigen::Tensor<T, RANK, Eigen::RowMajor> shuffled_tensor = reshaped_tensor.shuffle(dim_array);
|
||||
|
||||
int64_t dimsize0 = input_shape->GetDimSize(dim[0]);
|
||||
int64_t dimsize1 = input_shape->GetDimSize(dim[1]);
|
||||
int64_t iter_number = value_num_ / (dimsize0 * dimsize1);
|
||||
|
||||
std::array<Eigen::DenseIndex, DIM_SIZE3> dim_array_last;
|
||||
dim_array_last.at(DIM_INDEX0) = iter_number;
|
||||
dim_array_last.at(DIM_INDEX1) = dimsize0;
|
||||
dim_array_last.at(DIM_INDEX2) = dimsize1;
|
||||
Eigen::Tensor<T, DIM_SIZE3, Eigen::RowMajor> permuted_tensor = shuffled_tensor.reshape(dim_array_last);
|
||||
|
||||
auto output_data_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t copy_size = (dimsize0 * dimsize1) * static_cast<int64_t>(sizeof(T));
|
||||
if (iter_number <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < iter_number; i++) {
|
||||
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;  // release the per-matrix scratch buffer
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > static_cast<uint64_t>(iter_number)) {
|
||||
max_core_num = static_cast<uint64_t>(iter_number);
|
||||
}
|
||||
|
||||
auto shared_nuclear_norm = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;  // release the per-matrix scratch buffer
}
|
||||
};
|
||||
if (max_core_num != 0) {
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, static_cast<uint64_t>(iter_number),
|
||||
static_cast<uint64_t>(iter_number) / max_core_num, shared_nuclear_norm),
|
||||
"NuclearNorm Compute failed.");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> NuclearNormCpuKernel::matrix_multiply(std::vector<std::vector<T>> const arrL,
|
||||
std::vector<std::vector<T>> const arrR) {
|
||||
size_t rowL = arrL.size();
|
||||
size_t colL = arrL[0].size();
|
||||
size_t colR = arrR[0].size();
|
||||
|
||||
std::vector<std::vector<T>> res(rowL);
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
res[i].resize(colR);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < rowL; i++) {
|
||||
for (size_t j = 0; j < colR; j++) {
|
||||
for (size_t k = 0; k < colL; k++) {
|
||||
res[i][j] += arrL[i][k] * arrR[k][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> NuclearNormCpuKernel::transpose(std::vector<std::vector<T>> const arr) {
|
||||
size_t row = arr.size();
|
||||
size_t col = arr[0].size();
|
||||
|
||||
std::vector<std::vector<T>> trans(col);
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
trans[i].resize(row);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (size_t j = 0; j < row; j++) {
|
||||
trans[i][j] = arr[j][i];
|
||||
}
|
||||
}
|
||||
return trans;
|
||||
}
|
||||
|
||||
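// Returns the indices that sort `array` in descending order (largest value first).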
template <typename T>
|
||||
std::vector<size_t> NuclearNormCpuKernel::argsort(const std::vector<T> &array) {
|
||||
const size_t array_len(array.size());
|
||||
std::vector<size_t> array_index(array_len, 0);
|
||||
for (size_t i = 0; i < array_len; ++i) array_index[i] = i;
|
||||
|
||||
sort(array_index.begin(), array_index.end(),
|
||||
[&array](size_t pos1, size_t pos2) { return (array[pos1] > array[pos2]); });
|
||||
|
||||
return array_index;
|
||||
}
|
||||
|
||||
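// Finds the largest-magnitude off-diagonal entry of the symmetric matrix and reports its
// position; used to pick the pivot for each Jacobi rotation in svd().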
template <typename T>
|
||||
void NuclearNormCpuKernel::get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col) {
|
||||
size_t n = arr.size();
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
if (i != j && fabs(arr[i][j]) > *max) {
|
||||
*max = fabs(arr[i][j]);
|
||||
*row = i;
|
||||
*col = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
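// Classical Jacobi eigenvalue iteration on a symmetric matrix (here A^T * A): the largest
// off-diagonal entry is rotated away until it falls below eps or the iteration limit is hit.
// On return, e holds the eigenvalues (squared singular values of A) and E the eigenvectors,
// both sorted in descending eigenvalue order.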
template <typename T>
|
||||
void NuclearNormCpuKernel::svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e) {
|
||||
size_t n = arr.size();
|
||||
size_t row = 0;
|
||||
size_t col = 0;
|
||||
size_t iter_max_num = 10000;
|
||||
size_t iter_num = 0;
|
||||
T eps = 1e-40;
|
||||
T max = eps;
|
||||
T dot5 = 0.5;
|
||||
|
||||
E.resize(n);
|
||||
e.resize(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
E[i].resize(n, 0);
|
||||
E[i][i] = 1;
|
||||
}
|
||||
|
||||
while (iter_num < iter_max_num && max >= eps) {
|
||||
max = fabs(arr[0][1]);
|
||||
row = 0;
|
||||
col = 1;
|
||||
|
||||
get_row_col<T>(arr, &max, &row, &col);
|
||||
T theta = dot5 * atan2(-2 * arr[row][col], -(arr[row][row] - arr[col][col]));
|
||||
|
||||
T aii = arr[row][row];
|
||||
T ajj = arr[col][col];
|
||||
T aij = arr[row][col];
|
||||
T sin_theta = sin(theta);
|
||||
T cos_theta = cos(theta);
|
||||
T sin_2theta = sin(2 * theta);
|
||||
T cos_2theta = cos(2 * theta);
|
||||
arr[row][row] = aii * cos_theta * cos_theta + ajj * sin_theta * sin_theta + aij * sin_2theta;
|
||||
arr[col][col] = aii * sin_theta * sin_theta + ajj * cos_theta * cos_theta - aij * sin_2theta;
|
||||
arr[row][col] = dot5 * (ajj - aii) * sin_2theta + aij * cos_2theta;
|
||||
arr[col][row] = arr[row][col];
|
||||
for (size_t k = 0; k < n; k++) {
|
||||
if (k != row && k != col) {
|
||||
T arowk = arr[row][k];
|
||||
T acolk = arr[col][k];
|
||||
arr[row][k] = arowk * cos_theta + acolk * sin_theta;
|
||||
arr[k][row] = arr[row][k];
|
||||
arr[col][k] = acolk * cos_theta - arowk * sin_theta;
|
||||
arr[k][col] = arr[col][k];
|
||||
}
|
||||
}
|
||||
|
||||
T Eki;
|
||||
T Ekj;
|
||||
for (size_t k = 0; k < n; k++) {
|
||||
Eki = E[k][row];
|
||||
Ekj = E[k][col];
|
||||
E[k][row] = Eki * cos_theta + Ekj * sin_theta;
|
||||
E[k][col] = Ekj * cos_theta - Eki * sin_theta;
|
||||
}
|
||||
iter_num++;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
e[i] = arr[i][i];
|
||||
}
|
||||
|
||||
std::vector<size_t> sort_index;
|
||||
sort_index = argsort<T>(e);
|
||||
|
||||
std::vector<std::vector<T>> E_sorted(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
E_sorted[i].resize(n);
|
||||
}
|
||||
std::vector<T> e_sorted(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
e_sorted[i] = e[sort_index[i]];
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
E_sorted[i][j] = E[i][sort_index[j]];
|
||||
}
|
||||
}
|
||||
E = E_sorted;
|
||||
e = e_sorted;
|
||||
}
|
||||
|
||||
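// Nuclear norm of a single dim0 x dim1 matrix. A single column degenerates to its L2 norm;
// otherwise the singular values are obtained as the square roots of the positive eigenvalues
// of mat^T * mat (via the Jacobi svd() above) and summed.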
template <typename T>
|
||||
T NuclearNormCpuKernel::matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1) {
|
||||
if (dim1 == DIM_SIZE1) {
|
||||
T nuclear_norm = 0.0;
|
||||
T temp = 0.0;
|
||||
for (size_t j = 0; j < dim0; j++) {
|
||||
temp = mat[j];
|
||||
temp = temp * temp;
|
||||
nuclear_norm += temp;
|
||||
}
|
||||
nuclear_norm = sqrt(nuclear_norm);
|
||||
return nuclear_norm;
|
||||
}
|
||||
std::vector<std::vector<double>> arr(dim0);
|
||||
size_t S_dim_size = dim0 < dim1 ? dim0 : dim1;
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
arr[i].resize(dim1);
|
||||
}
|
||||
for (size_t i = 0; i < dim0; i++) {
|
||||
for (size_t j = 0; j < dim1; j++) {
|
||||
arr[i][j] = mat[i * dim1 + j];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<double>> ATA;
|
||||
std::vector<std::vector<double>> E;
|
||||
std::vector<double> e;
|
||||
|
||||
ATA = matrix_multiply<double>(transpose(arr), arr);
|
||||
svd<double>(ATA, E, e);
|
||||
|
||||
double nuclear_norm = 0.0;
|
||||
for (size_t i = DIM_INDEX0; i < S_dim_size; i++) {
|
||||
if (e[i] > 0) {
|
||||
nuclear_norm += sqrt(e[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return nuclear_norm;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kNuclearNorm, NuclearNormCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NuclearNormCpuKernel : public CpuKernel {
|
||||
public:
|
||||
NuclearNormCpuKernel() = default;
|
||||
~NuclearNormCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t NuclearNormParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NuclearNormCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T, int32_t RANK>
|
||||
uint32_t ComputeTensorNuclearNorm(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> matrix_multiply(std::vector<std::vector<T>> const arrL,
|
||||
std::vector<std::vector<T>> const arrR);
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> transpose(std::vector<std::vector<T>> const arr);
|
||||
|
||||
template <typename T>
|
||||
std::vector<size_t> argsort(const std::vector<T> &array);
|
||||
|
||||
template <typename T>
|
||||
void get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col);
|
||||
|
||||
template <typename T>
|
||||
void svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e);
|
||||
|
||||
template <typename T>
|
||||
T matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,410 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "quantile.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kQuantileInputNum = 2;
|
||||
constexpr uint32_t kQuantileOutputNum = 1;
|
||||
const int64_t parallel_data_size = 64 * 1024;
|
||||
const int64_t kQuantileAttrDefaultDim = 10000;
|
||||
const char *kQuantile = "Quantile";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
|
||||
input_ = ctx.Input(0);
|
||||
DataType input_type = input_->GetDataType();
|
||||
int64_t input_dim = input_->GetTensorShape()->GetDims();
|
||||
int64_t input_size = input_->GetTensorShape()->NumElements();
|
||||
q_ = ctx.Input(1);
|
||||
int64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
DataType q_type = q_->GetDataType();
|
||||
int64_t q_dim = q_->GetTensorShape()->GetDims();
|
||||
int64_t min = -input_dim;
|
||||
int64_t max = input_dim - 1;
|
||||
auto dim_attr = ctx.GetAttr("dim");
|
||||
dim_ = (dim_attr == nullptr) ? kQuantileAttrDefaultDim : dim_attr->GetInt();
|
||||
auto keep_dims_attr = ctx.GetAttr("keep_dims");
|
||||
keep_dims_ = (keep_dims_attr == nullptr) ? false : keep_dims_attr->GetBool();
|
||||
auto ignore_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan_ = (ignore_attr == nullptr) ? false : ignore_attr->GetBool();
|
||||
|
||||
KERNEL_CHECK_FALSE(input_size > 0, KERNEL_STATUS_PARAM_INVALID, "quantile() input tensor must be non-empty");
|
||||
KERNEL_CHECK_FALSE(q_dim <= 1, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q must be a scalar or 1D tensor,but got dimension = [%d].", q_dim);
|
||||
KERNEL_CHECK_FALSE(input_type == q_type, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q tensor must be same dtype as the input tensor");
|
||||
|
||||
for (int64_t j = 0; j < q_size; ++j) {
|
||||
KERNEL_CHECK_FALSE(q_addrs[j] <= 1 && q_addrs[j] >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q values must be in the range [0, 1]");
|
||||
}
|
||||
DataType out_type = ctx.Output(0)->GetDataType();
|
||||
output_ = ctx.Output(0);
|
||||
KERNEL_CHECK_FALSE(out_type == input_type, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() out tensor must be same dtype as the input tensor");
|
||||
if (dim_ != kQuantileAttrDefaultDim) {
|
||||
KERNEL_CHECK_FALSE(dim_ >= min && dim_ <= max, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max);
|
||||
}
|
||||
dim_ = MaybeWrapDim(dim_, input_dim);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t QuantileCpuKernel::MaybeWrapDim(int64_t dim, int64_t dim_post_expr) {
|
||||
if (dim == kQuantileAttrDefaultDim) {
|
||||
return dim;
|
||||
}
|
||||
if (dim_post_expr <= 0) {
|
||||
dim_post_expr = 1;
|
||||
}
|
||||
int64_t min = -dim_post_expr;
|
||||
int64_t max = dim_post_expr - 1;
|
||||
KERNEL_CHECK_FALSE(dim >= min && dim <= max, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max)
|
||||
if (dim < 0) {
|
||||
dim += dim_post_expr;
|
||||
}
|
||||
return dim;
|
||||
}
|
||||
|
||||
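// Moves dimension `index` of a flattened tensor to the last axis: the target axis is swapped
// with the last one and every element is copied to its offset under the permuted shape.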
template <typename T>
|
||||
std::vector<T> transpose(std::vector<T> &f, std::vector<int64_t> &shape, int index) {
|
||||
int element_count = f.size();
|
||||
int m = shape.size();
|
||||
int i;
|
||||
int *indexA = (int *)malloc(sizeof(int) * m);
|
||||
if (indexA == nullptr) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<int> pos(m);
|
||||
for (int i = 0; i < m; i++) pos[i] = i;
|
||||
if (m != 0) {
|
||||
std::swap(pos[m - 1], pos[((index + m) % m)]);
|
||||
}
|
||||
|
||||
int *indexB = (int *)malloc(sizeof(int) * m);
|
||||
if (indexB == nullptr) {
|
||||
free(indexA);
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<T> b(element_count);
|
||||
std::vector<int64_t> shapeb(shape);
|
||||
for (int i = 0; i < m; i++) {
|
||||
shapeb[i] = shape[pos[i]];
|
||||
}
|
||||
|
||||
for (int src = 0; src < element_count; src++) {
|
||||
int temp = src;
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
indexA[i] = temp % shape[i];
|
||||
temp = temp / shape[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
indexB[i] = indexA[pos[i]];
|
||||
}
|
||||
|
||||
int dst = 0;
|
||||
temp = 1;
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
dst = dst + indexB[i] * temp;
|
||||
temp = temp * shapeb[i];
|
||||
}
|
||||
b[dst] = f[src];
|
||||
}
|
||||
free(indexA);
|
||||
free(indexB);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
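// Parallel shard body: rows [start, end) of the (batch, last_shape_size) view are processed
// independently; NaN handling follows ignore_nan_, and each requested quantile is obtained by
// linear interpolation between the two neighbouring order statistics.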
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size,
|
||||
std::vector<T> &sorted) {
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
for (uint64_t i = start; i < end; i++) {
|
||||
std::vector<T> tmp;
|
||||
std::sort(sorted.begin() + i * last_shape_size, sorted.begin() + (i + 1) * last_shape_size);
|
||||
bool has_nan = false;
|
||||
bool all_nan = true;
|
||||
|
||||
for (uint64_t j = i * last_shape_size; j < (i + 1) * last_shape_size; j++) {
|
||||
if (std::isnan(sorted[j])) {
|
||||
has_nan = true;
|
||||
} else {
|
||||
all_nan = false;
|
||||
}
|
||||
}
|
||||
|
||||
if ((has_nan && !ignore_nan_) || all_nan) {
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
output_addr[i * q_size + j] = NAN;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
for (auto k = i * last_shape_size; k < (i + 1) * last_shape_size; k++) {
|
||||
auto x = sorted[k];
|
||||
if (!isnan(x)) {
|
||||
tmp.push_back(x);
|
||||
}
|
||||
}
|
||||
std::sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
T index = (tmp.size() - 1) * q_addrs[j];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i * q_size + j] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i * q_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted) {
|
||||
uint64_t n = input_->GetTensorShape()->NumElements();
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
for (uint64_t i = 0; i < n; i += last_shape_size) {
|
||||
std::vector<T> tmp;
|
||||
sort(sorted.begin() + i, sorted.begin() + i + last_shape_size);
|
||||
bool has_nan = false;
|
||||
bool all_nan = true;
|
||||
for (auto j = i; j < i + last_shape_size; j++) {
|
||||
if (!isnan(sorted[j])) {
|
||||
tmp.push_back(sorted[j]);
|
||||
all_nan = false;
|
||||
} else {
|
||||
has_nan = true;
|
||||
}
|
||||
}
|
||||
sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
if ((has_nan && !ignore_nan_) || all_nan) {
|
||||
output_addr[i * q_size / last_shape_size + j] = NAN;
|
||||
continue;
|
||||
}
|
||||
|
||||
T index = (tmp.size() - 1) * q_addrs[j];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i * q_size / last_shape_size + j] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i * q_size / last_shape_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
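// Reduction over the flattened tensor (attr "dim" not set): NaNs are dropped when ignore_nan_
// is true, otherwise any NaN makes every output NaN. Illustrative example (hypothetical input):
// input = {1, 2, 3, 4}, q = {0.5} -> output = {2.5}.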
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeDefaultFunc(std::vector<T> &sorted) {
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
std::sort(sorted.begin(), sorted.end());
|
||||
bool all_nan = true;
|
||||
std::vector<T> tmp;
|
||||
for (auto &x : sorted) {
|
||||
if (!isnan(x)) {
|
||||
tmp.push_back(x);
|
||||
all_nan = false;
|
||||
}
|
||||
}
|
||||
std::sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t i = 0; i < q_size; ++i) {
|
||||
if ((has_nan_ && !ignore_nan_) || all_nan) {
|
||||
output_addr[i] = NAN;
|
||||
continue;
|
||||
}
|
||||
T index = (tmp.size() - 1) * q_addrs[i];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
|
||||
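// Builds the output shape: reduced dimensions are dropped (or kept as size 1 when keep_dims_
// is set), and when q is a 1-D tensor its length is prepended as the leading output axis.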
std::vector<int64_t> QuantileCpuKernel::SetQuantileOutputShape() {
|
||||
std::vector<int64_t> out_shape;
|
||||
int64_t q_dim = q_->GetTensorShape()->NumElements();
|
||||
int64_t input_dim = input_->GetTensorShape()->GetDims();
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
std::vector<int64_t> input_shapesize = input_->GetTensorShape()->GetDimSizes();
|
||||
if (dim_ != kQuantileAttrDefaultDim && input_dim > 0) {
|
||||
out_shape = input_shapesize;
|
||||
if (keep_dims_) {
|
||||
out_shape[dim_] = 1;
|
||||
} else {
|
||||
out_shape.erase(out_shape.begin() + dim_);
|
||||
}
|
||||
} else if (keep_dims_) {
|
||||
out_shape = std::vector<int64_t>(input_dim, 1);
|
||||
}
|
||||
if (q_dim > 0) {
|
||||
out_shape.insert(out_shape.begin(), q_size);
|
||||
}
|
||||
return out_shape;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::QuantileCompute(CpuKernelContext &ctx) {
|
||||
T *input_addrs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
size_t data_size = input_->GetTensorShape()->NumElements() * sizeof(T);
|
||||
|
||||
std::vector<int64_t> out_shape = SetQuantileOutputShape();
|
||||
std::vector<int64_t> input_dims = input_->GetTensorShape()->GetDimSizes();
|
||||
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
|
||||
std::vector<T> sorted;
|
||||
int64_t n = input_->GetTensorShape()->NumElements();
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
sorted.push_back(input_addrs[i]);
|
||||
if (isnan(input_addrs[i])) {
|
||||
has_nan_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (data_size <= parallel_data_size) {
|
||||
if (dim_ == kQuantileAttrDefaultDim) {
|
||||
QuantileComputeDefaultFunc<T>(sorted);
|
||||
} else if (dim_ == input_shape_size - 1) {
|
||||
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
|
||||
} else {
|
||||
input_dims.push_back(1);
|
||||
sorted = transpose<T>(sorted, input_dims, dim_);
|
||||
int32_t m = input_dims.size();
|
||||
if (m != 0) {
|
||||
std::swap(input_dims[m - 1], input_dims[((dim_ + m) % m)]);
|
||||
}
|
||||
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
|
||||
}
|
||||
} else {
|
||||
DoParallelQuantile(ctx, sorted, input_dims);
|
||||
}
|
||||
SetOutput<T>(out_shape);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted,
|
||||
std::vector<int64_t> input_dims) {
|
||||
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
|
||||
std::vector<int64_t> input_shape_dims = input_->GetTensorShape()->GetDimSizes();
|
||||
int64_t n = input_->GetTensorShape()->NumElements();
|
||||
if (dim_ == kQuantileAttrDefaultDim) {
|
||||
QuantileComputeDefaultFunc<T>(sorted);
|
||||
} else if (dim_ == input_shape_size - 1) {
|
||||
int64_t last_shape_size = input_dims[input_dims.size() - 1];
|
||||
auto shard_quantile = [&](size_t start, size_t end) {
|
||||
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
|
||||
"Quantile Compute failed.");
|
||||
} else {
|
||||
input_shape_dims.push_back(1);
|
||||
sorted = transpose<T>(sorted, input_shape_dims, dim_);
|
||||
int32_t m = input_shape_dims.size();
|
||||
if (m != 0) {
|
||||
std::swap(input_shape_dims[m - 1], input_shape_dims[((dim_ + m) % m)]);
|
||||
}
|
||||
int64_t last_shape_size = input_shape_dims[input_shape_dims.size() - 1];
|
||||
auto shard_quantile = [&](size_t start, size_t end) {
|
||||
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
|
||||
"Quantile Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
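// The quantile values are computed with q as the innermost (fastest-varying) axis; when q is a
// 1-D tensor this transposes the buffer so the q axis becomes the leading output axis, then
// writes the final output shape.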
template <typename T>
|
||||
void QuantileCpuKernel::SetOutput(std::vector<int64_t> &out_shape) {
|
||||
auto output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
|
||||
int64_t l = output_->GetTensorShape()->NumElements();
|
||||
std::vector<T> out;
|
||||
int64_t q_dim = q_->GetTensorShape()->GetDims();
|
||||
std::vector<int64_t> tmp(out_shape);
|
||||
if (q_dim > 0) {
|
||||
for (int i = 0; i < l; i++) {
|
||||
out.push_back(*(output_addr + i));
|
||||
}
|
||||
|
||||
int64_t out_end_shape = out_shape[out_shape.size() - 1];
|
||||
out_shape.push_back(out_end_shape);
|
||||
std::swap(out_shape[0], out_shape[out_shape.size() - 1]);
|
||||
out_shape.erase(out_shape.begin());
|
||||
out_shape.insert(out_shape.begin(), 1);
|
||||
out = transpose<T>(out, out_shape, 0);
|
||||
for (int i = 0; i < l; i++) {
|
||||
output_addr[i] = out[i];
|
||||
}
|
||||
}
|
||||
output_->GetTensorShape()->SetDimSizes(tmp);
|
||||
}
|
||||
|
||||
uint32_t QuantileCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kQuantileInputNum, kQuantileOutputNum), "[%s] check params failed.", kQuantile);
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = GetInputAndCheck<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = GetInputAndCheck<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
break;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "GetInputAndCheck failed.");
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = QuantileCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = QuantileCompute<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
break;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kQuantile, QuantileCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_QUANTILE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_QUANTILE_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
namespace aicpu {
|
||||
class QuantileCpuKernel : public CpuKernel {
|
||||
public:
|
||||
QuantileCpuKernel() = default;
|
||||
|
||||
~QuantileCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t QuantileCompute(CpuKernelContext &ctx);
|
||||
uint32_t MaybeWrapDim(int64_t dim, int64_t dim_post_expr);
|
||||
template <typename T>
|
||||
void QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted);
|
||||
template <typename T>
|
||||
void QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size, std::vector<T> &sorted);
|
||||
|
||||
template <typename T>
|
||||
void QuantileComputeDefaultFunc(std::vector<T> &sorted);
|
||||
std::vector<int64_t> SetQuantileOutputShape();
|
||||
template <typename T>
|
||||
void SetOutput(std::vector<int64_t> &out_shape);
|
||||
template <typename T>
|
||||
uint32_t DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted, std::vector<int64_t> input_dims);
|
||||
int64_t last_shape_size_ = 0;
|
||||
bool ignore_nan_ = false;
|
||||
bool keep_dims_ = false;
|
||||
int dim_ = 0;
|
||||
int64_t input_dim_ = 0;
|
||||
Tensor *input_ = nullptr;
|
||||
Tensor *output_ = nullptr;
|
||||
Tensor *q_ = nullptr;
|
||||
bool has_nan_ = false;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,154 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_segment_sqrt_n.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace aicpu {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *SparseSegmentSqrtN = "SparseSegmentSqrtN";
|
||||
|
||||
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, CTX) \
|
||||
case (DTYPE): \
|
||||
if ((DTYPE_1) == DT_INT32) { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
return ComputeKernel<TYPE, int32_t, int32_t>(CTX);               \
|
||||
} else { \
|
||||
return ComputeKernel<TYPE, int32_t, int64_t>(CTX);               \
|
||||
} \
|
||||
} else { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
return ComputeKernel<TYPE, int64_t, int32_t>(CTX);               \
|
||||
} else { \
|
||||
return ComputeKernel<TYPE, int64_t, int64_t>(CTX);               \
|
||||
} \
|
||||
} \
|
||||
break;
|
||||
} // namespace aicpu
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSegmentSqrtNCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtN normalcheck failed.");
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *indices = ctx.Input(1);
|
||||
Tensor *segment_ids = ctx.Input(2);
|
||||
|
||||
auto x_shape = x->GetTensorShape();
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
auto segment_ids_shape = segment_ids->GetTensorShape();
|
||||
|
||||
if (x_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto x_data_type = x->GetDataType();
|
||||
auto indices_data_type = indices->GetDataType();
|
||||
auto segment_ids_data_type = segment_ids->GetDataType();
|
||||
|
||||
if (x_data_type != DT_FLOAT && x_data_type != DT_DOUBLE && x_data_type != DT_FLOAT16) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if ((indices_data_type != DT_INT32 && indices_data_type != DT_INT64) ||
|
||||
(segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64)) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(indices_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (x_data_type) {
|
||||
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
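// Gathers rows of x selected by indices, accumulates them into their segment rows, and finally
// divides each segment sum by sqrt(number of rows in that segment). Illustrative example
// (hypothetical input): x = {{1, 2}, {3, 4}}, indices = {0, 1}, segment_ids = {0, 0}
// -> y = {{(1 + 3) / sqrt(2), (2 + 4) / sqrt(2)}}.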
template <typename T1, typename T2, typename T3>
|
||||
uint32_t SparseSegmentSqrtNCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
size_t k = ctx.Output(0)->GetTensorShape()->NumElements();
|
||||
auto x_addr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto indices_addr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto segment_ids_addr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
|
||||
auto y_addr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> x_shape_list = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
x_shape_list[0] = segment_ids_addr[m - 1] + 1;
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes(x_shape_list);
|
||||
for (size_t i = 0; i < k; i++) {
|
||||
y_addr[i] = (T1)0;
|
||||
}
|
||||
if (segment_ids_addr[0] != 0) {
|
||||
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t i = 1; i < m; i++) {
|
||||
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
|
||||
KERNEL_LOG_ERROR("segment_ids should be sorted.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (segment_ids_addr[i] - segment_ids_addr[i - 1] > 1) {
|
||||
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
|
||||
KERNEL_LOG_ERROR("indices out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
int oldindex = -1;
|
||||
int countnum = 0;
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (oldindex == segment_ids_addr[i]) {
|
||||
countnum++;
|
||||
} else if (countnum != 0) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
|
||||
}
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_addr[i];
|
||||
} else {
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_addr[i];
|
||||
}
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] += x_addr[j + indices_addr[i] * n];
|
||||
}
|
||||
}
|
||||
if (countnum != 0) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(SparseSegmentSqrtN, SparseSegmentSqrtNCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSegmentSqrtNCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSegmentSqrtNCpuKernel() = default;
|
||||
~SparseSegmentSqrtNCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2, typename T3>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,171 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "unsorted_segment_prod.h"
|
||||
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kUnsortedSegmentProd = "UnsortedSegmentProd";
|
||||
const uint32_t input_num = 3;
|
||||
const uint32_t output_num = 1;
|
||||
constexpr int64_t kParallelDataNums = 64 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
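// Output is initialized to 1, then each input row is multiplied into the row selected by its
// segment id; rows whose segment id is >= num_segments are skipped. Illustrative example
// (hypothetical input): x = {{1, 2}, {3, 4}, {5, 6}}, segment_ids = {0, 0, 1}, num_segments = 3
// -> y = {{3, 8}, {5, 6}, {1, 1}}.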
template <typename input_t, typename segment_ids_t, typename num_segments_t>
|
||||
uint32_t UnsortedSegmentProdCpuKernel::UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num),
"UnsortedSegmentProd check input and output number failed.");
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu]",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t data_size = ctx.Input(0)->NumElements();
|
||||
int64_t id_size = ctx.Input(1)->NumElements();
|
||||
|
||||
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
|
||||
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
|
||||
  KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
  auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
  KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")

  if (id_size <= 0) {
    KERNEL_LOG_ERROR("segment_ids num elements should be greater than 0");
    return KERNEL_STATUS_PARAM_INVALID;
  }

  int64_t reshapesize = data_size / id_size;
  // Initialize every output element to 1 (the multiplicative identity) so that
  // segments that receive no input keep a well-defined value.
  for (int64_t k = 0; k < data_size; k++) {
    *(output_y + k) = static_cast<input_t>(1);
  }
  if (data_size <= kParallelDataNums) {
    // Small inputs: single-threaded calculation.
    for (int64_t i = 0; i < id_size; i++) {
      if (*(segmentids + i) < *numsegments) {
        for (int64_t j = 0; j < reshapesize; j++) {
          *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
        }
      }
    }
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > reshapesize) {
      max_core_num = reshapesize;
    }
    // Large inputs: shard along the inner (reshapesize) dimension so that
    // concurrent workers never multiply into the same output element.
    auto shard_unsorted_segment_prod = [&](int64_t start, int64_t end) {
      for (int64_t i = 0; i < id_size; i++) {
        if (*(segmentids + i) < *numsegments) {
          for (int64_t j = start; j < end; j++) {
            *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
          }
        }
      }
    };
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_prod),
      "CpuKernelUtils::ParallelFor failed.");
  }
  return KERNEL_STATUS_OK;
}

template <typename input_t, typename segment_ids_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
  switch (num_segments_type) {
    case DT_INT32:
      return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
    case DT_INT64:
      return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

template <typename input_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
  auto num_segments_type = ctx.Input(2)->GetDataType();
  switch (segment_ids_type) {
    case DT_INT32:
      return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
    case DT_INT64:
      return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t UnsortedSegmentProdCpuKernel::Compute(CpuKernelContext &ctx) {
  auto input_type = ctx.Input(0)->GetDataType();
  auto segment_ids_type = ctx.Input(1)->GetDataType();
  switch (input_type) {
    case DT_INT32:
      return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
    case DT_INT16:
      return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
    case DT_FLOAT:
      return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
    case DT_DOUBLE:
      return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
    case DT_FLOAT16:
      return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
    case DT_INT8:
      return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
    case DT_INT64:
      return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
    case DT_UINT8:
      return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
    case DT_UINT16:
      return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
    case DT_UINT32:
      return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
    case DT_UINT64:
      return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
    case DT_COMPLEX64:
      return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
    case DT_COMPLEX128:
      return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid input type [%s]", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kUnsortedSegmentProd, UnsortedSegmentProdCpuKernel);
} // namespace aicpu
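
For readers new to this op, the kernel above implements the standard unsorted segment product: the output is initialized to 1 and each input slice i is multiplied into the segment named by segment_ids[i], while out-of-range ids are skipped. The following is a minimal standalone sketch of that semantics on flat double data; the function and variable names are illustrative and not part of the migrated kernel, and the negative-id check follows the op's usual definition rather than this kernel's code.

#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics on flat data: every segment starts at 1 (the
// multiplicative identity) and input element i is multiplied into segment
// segment_ids[i]; ids outside [0, num_segments) are dropped.
std::vector<double> UnsortedSegmentProdRef(const std::vector<double> &input,
                                           const std::vector<int64_t> &segment_ids,
                                           int64_t num_segments) {
  std::vector<double> output(static_cast<size_t>(num_segments), 1.0);
  for (size_t i = 0; i < input.size(); ++i) {
    const int64_t id = segment_ids[i];
    if (id >= 0 && id < num_segments) {
      output[static_cast<size_t>(id)] *= input[i];
    }
  }
  return output;
}

int main() {
  // input = [2, 3, 4, 5], segment_ids = [0, 0, 1, 3], num_segments = 4
  // -> output = [6, 4, 1, 5]; segment 2 receives no input and stays at 1.
  for (double v : UnsortedSegmentProdRef({2, 3, 4, 5}, {0, 0, 1, 3}, 4)) {
    std::cout << v << " ";
  }
  std::cout << std::endl;
  return 0;
}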

@@ -0,0 +1,37 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H

#include "cpu_ops_kernel.h"

namespace aicpu {
class UnsortedSegmentProdCpuKernel : public CpuKernel {
 public:
  ~UnsortedSegmentProdCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename input_t, typename segment_ids_t, typename num_segments_t>
  uint32_t UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx);
  template <typename input_t, typename segment_ids_t>
  uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
  template <typename input_t>
  uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
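
The declaration above mirrors the staged type dispatch used in the .cc file: Compute resolves the input dtype, DoComputeWithSegmentIdsType resolves the segment_ids dtype, and DoComputeWithNumSegmentsType resolves the num_segments dtype before the fully typed template does the work. Below is a stripped-down sketch of that pattern with a toy enum and toy function names; it is not the framework's API, only an illustration of the dispatch shape.

#include <cstdint>
#include <iostream>

// Toy runtime "dtype" tags standing in for the framework's DataType enum.
enum class DType { kInt32, kInt64 };

// Innermost worker: fully typed once both runtime tags have been resolved.
template <typename InputT, typename IdT>
void DoWork() {
  std::cout << "resolved to sizeof(InputT)=" << sizeof(InputT)
            << ", sizeof(IdT)=" << sizeof(IdT) << std::endl;
}

// Second stage: resolve the id dtype, forwarding the already-resolved InputT.
template <typename InputT>
void DispatchIdType(DType id_type) {
  switch (id_type) {
    case DType::kInt32: return DoWork<InputT, int32_t>();
    case DType::kInt64: return DoWork<InputT, int64_t>();
  }
}

// First stage: resolve the input dtype.
void Dispatch(DType input_type, DType id_type) {
  switch (input_type) {
    case DType::kInt32: return DispatchIdType<int32_t>(id_type);
    case DType::kInt64: return DispatchIdType<int64_t>(id_type);
  }
}

int main() {
  Dispatch(DType::kInt32, DType::kInt64);  // resolves to DoWork<int32_t, int64_t>()
  return 0;
}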

@@ -41,7 +41,7 @@ uint32_t EqualCalculate(const CpuKernelContext &ctx, BCalcInfo &calcInfo, bool f
output_y[i] = (flag == true) ? (*x_index == *y_index) : (*x_index != *y_index);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.")
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.");
return KERNEL_STATUS_OK;
}
/**

@@ -69,7 +69,7 @@ uint32_t EqualCompute(const CpuKernelContext &ctx, bool flag) {
calcInfo.input_1->GetDataSize(), calcInfo.output->GetData(), calcInfo.output->GetDataSize());

Bcast bcast;
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.")
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.");
bcast.BCastIndexes(calcInfo.x_indexes, calcInfo.y_indexes);
bcast.GetBcastVec(calcInfo);
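
The two equal.cc hunks above only add the trailing semicolons that were missing after the KERNEL_HANDLE_ERROR(...) calls. Whether that semicolon is strictly required depends on how the macro is defined; the conventional way to make it mandatory and safe inside if/else is the do { ... } while (0) idiom, sketched below with an illustrative stand-in macro (the real KERNEL_HANDLE_ERROR in this code base may be defined differently).

#include <cstdio>

// Illustrative stand-in for a status-checking macro; not the framework's definition.
#define CHECK_STATUS(expr, msg)     \
  do {                              \
    if ((expr) != 0) {              \
      std::printf("%s\n", (msg));   \
      return 1;                     \
    }                               \
  } while (0)

int main() {
  // Because the macro expands to a single do { ... } while (0) statement, the
  // trailing ';' is always required and the call nests safely inside if/else.
  if (true)
    CHECK_STATUS(0, "never printed");
  else
    std::printf("never reached\n");
  CHECK_STATUS(1, "non-zero status reported, returning early");
  std::printf("not reached\n");
  return 0;
}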

@@ -51,8 +51,12 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kAdaptiveAvgPool2dOpName,
mindspore::kAdaptiveAvgPool2dGradOpName,
mindspore::kCacheSwapTableOpName,
mindspore::kCol2imOpName,
mindspore::kCumulativeLogsumexpOpName,
mindspore::kDataFormatVecPermuteOpName,
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,

@@ -71,6 +75,10 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kNanToNumOpName,
mindspore::kQrOpName,
mindspore::kResizeBicubicOpName};
mindspore::kNuclearNormOpName,
mindspore::kQuantileOpName,
mindspore::kSparseSegmentSqrtNOpName,
mindspore::kUnsortedSegmentProdOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
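
These two hunks register the newly migrated ops (Col2im, MatrixSolveLs, NuclearNorm, Quantile, SparseSegmentSqrtN, UnsortedSegmentProd) in kMigrateAicpuKernelOps, the set the pass consults when deciding which kernel shared object an AICPU node should use. A minimal sketch of that membership check follows; the helper name is illustrative, the op-name strings are assumed from the constant names, and the actual mapping to the two .so names is decided inside the pass itself.

#include <iostream>
#include <set>
#include <string>

// Illustrative helper only, not the pass's real code: the pass keeps a set of
// op names whose AICPU kernels have been migrated and branches on membership
// when choosing the kernel shared object to record on the node.
bool IsMigratedAicpuOp(const std::string &op_name) {
  static const std::set<std::string> kMigratedOps = {"Col2im",   "MatrixSolveLs",      "NuclearNorm",
                                                     "Quantile", "SparseSegmentSqrtN", "UnsortedSegmentProd"};
  return kMigratedOps.count(op_name) > 0;
}

int main() {
  std::cout << std::boolalpha << IsMigratedAicpuOp("UnsortedSegmentProd") << std::endl;  // true
  std::cout << std::boolalpha << IsMigratedAicpuOp("Conv2D") << std::endl;               // false
  return 0;
}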

@@ -171,3 +171,5 @@ from .median_grad import _median_grad_aicpu
from .reduce_sum import _reduce_sum_aicpu
from .adaptive_avg_pool_2d_v1 import _adaptive_avg_pool_2d_v1_aicpu
from .fill_v2 import _fill_v2_aicpu
from .data_format_vec_permute import _data_format_vec_permute_aicpu
from .quantile import _quantile_aicpu