!43926 [assistant][ops] Add MatrixSolveLs
Merge pull request !43926 from 张渝/MatrixSolveLs
commit cbc9496860

@@ -0,0 +1,498 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/cpu/kernel/eigen/matrix_solve_ls_cpu_kernel.h"
#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>

namespace mindspore {
namespace kernel {
namespace {
constexpr auto kInputNum = 3;
constexpr auto kOutputNum = 1;
constexpr int64_t kNum2 = 2;
constexpr char kFast[] = "fast";
constexpr bool kMatrixSolveLsComputeOk = true;
constexpr bool kMatrixSolveLsComputeFailed = false;

// Product of the elements in [first, last); used to count tensor elements.
template <typename InputIt, typename T>
T GetNumElements(InputIt first, InputIt last, T init) {
  for (; first != last; ++first) {
    init = std::move(init) * (*first);
  }
  return init;
}
}  // namespace

template <typename T, int Major>
using EigenMatrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Major>;

// Fast path for one real matrix: solve the l2-regularized normal equations
// (A^T A + l2 * I) x = A^T b, or the Gramian form (A A^T + l2 * I) when m < k.
template <typename T>
void MatrixSolveLsCpuKernelMod::RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k,
                                                          int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  if (m >= k) {
    a_copy = a.transpose() * a + ((T)*l2) * EigenMatrix<T, Eigen::RowMajor>::Identity(k, k);
    a_b = a.transpose() * b;
  } else {
    a_copy = a * a.transpose() + ((T)*l2) * EigenMatrix<T, Eigen::RowMajor>::Identity(m, m);
    a_b = b;
  }
  for (int64_t i = 0; i < n; i++) {
    EigenMatrix<T, Eigen::RowMajor> xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = a.transpose() * xi;
    }
    x.col(i) = xi;
  }
  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}
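
For reference, a minimal NumPy sketch of the math this routine implements (an illustration under the same conventions, not part of the kernel source):

    # Regularized normal equations; mirrors RealCholeskySingleCompute above.
    import numpy as np

    def real_cholesky_solve(a, b, l2):
        m, k = a.shape
        if m >= k:
            return np.linalg.solve(a.T @ a + l2 * np.eye(k), a.T @ b)
        # Under-determined case: solve via the Gramian, then map back with a.T.
        return a.T @ np.linalg.solve(a @ a.T + l2 * np.eye(m), b)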

// Fast path for one complex matrix: embed the complex system into a real one of
// twice the size, [[Re(A), -Im(A)], [Im(A), Re(A)]], then solve as in the real case.
template <typename T>
void MatrixSolveLsCpuKernelMod::ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr,
                                                             std::complex<T> *xptr, double *l2, int64_t m, int64_t k,
                                                             int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;
  auto l2value = std::abs(*l2);

  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  if (m >= k) {
    a_copy =
      A.transpose() * A +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * k, kNum2 * k);
    a_b = A.transpose() * b;
  } else {
    a_copy =
      A * A.transpose() +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * m, kNum2 * m);
    a_b = b;
  }

  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi;
  for (int64_t i = 0; i < n; i++) {
    xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = A.transpose() * xi;
    }
    x.col(i) = xi;
    // Reassemble the complex result from the stacked real/imaginary halves.
    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}
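
The embedding above can be checked with a short NumPy sketch (illustrative only, using the same block layout as the kernel):

    # Map a complex system A x = b onto an equivalent real system of twice the size.
    import numpy as np

    def embed_complex(a, b):
        a_real = np.block([[a.real, -a.imag], [a.imag, a.real]])
        b_real = np.vstack([b.real, b.imag])
        return a_real, b_real  # solving a_real @ [Re(x); Im(x)] = b_real recovers x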

// Robust path for one real matrix: minimum-norm least-squares solution via
// Eigen's complete orthogonal decomposition (despite the "Qr" in the name).
template <typename T>
void MatrixSolveLsCpuKernelMod::RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n) {
  EigenMatrix<T, Eigen::RowMajor> a(m, k);
  EigenMatrix<T, Eigen::RowMajor> x(k, n);
  EigenMatrix<T, Eigen::RowMajor> b(m, n);

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  Eigen::CompleteOrthogonalDecomposition<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(a);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;
  }

  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}
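
A quick NumPy cross-check of what this path returns (an assumption for illustration; the kernel itself uses Eigen):

    # Complete orthogonal decomposition yields the same minimum-norm
    # least-squares solution as numpy.linalg.lstsq.
    import numpy as np

    a = np.random.rand(5, 3)
    b = np.random.rand(5, 2)
    x_ref, *_ = np.linalg.lstsq(a, b, rcond=None)  # reference minimum-norm solution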

// Robust path for one complex matrix: the same real 2m x 2k embedding as the
// Cholesky path, solved with a complete orthogonal decomposition.
template <typename T>
void MatrixSolveLsCpuKernelMod::ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr,
                                                       std::complex<T> *xptr, int64_t m, int64_t k, int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);

  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  Eigen::CompleteOrthogonalDecomposition<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(A);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;

    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}

// Batched driver: splits the flattened inputs into `batch` independent systems
// and runs them in parallel once at least 16 matrices are present.
template <typename T>
bool MatrixSolveLsCpuKernelMod::ComplexCholesky(const std::vector<kernel::AddressPtr> &inputs,
                                                const std::vector<kernel::AddressPtr> &outputs) {
  auto x_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 0);
  auto dims = x_shape_vector.size();
  auto l2 = reinterpret_cast<double *>(inputs[2]->addr);
  auto aptr = reinterpret_cast<std::complex<T> *>(inputs[0]->addr);
  auto bptr = reinterpret_cast<std::complex<T> *>(inputs[1]->addr);
  auto xptr = reinterpret_cast<std::complex<T> *>(outputs[0]->addr);
  int64_t m = x_shape_vector[dims - kNum2];
  int64_t k = x_shape_vector[dims - 1];
  int64_t n = 1;
  auto b_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 1);
  if (b_shape_vector.size() > 1) {
    n = b_shape_vector[dims - 1];
  }
  int64_t data_num = 1;
  data_num = GetNumElements(x_shape_vector.begin(), x_shape_vector.end(), data_num);
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  if (data_num >= kParallelDataNum) {
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    ParallelLaunchAutoSearch(sharder_matrix_solve_ls, batch, this, &parallel_search_info_);
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return kMatrixSolveLsComputeOk;
}

template <typename T>
bool MatrixSolveLsCpuKernelMod::RealQr(const std::vector<kernel::AddressPtr> &inputs,
                                       const std::vector<kernel::AddressPtr> &outputs) {
  auto x_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 0);
  auto dims = x_shape_vector.size();
  auto aptr = reinterpret_cast<T *>(inputs[0]->addr);
  auto bptr = reinterpret_cast<T *>(inputs[1]->addr);
  auto xptr = reinterpret_cast<T *>(outputs[0]->addr);
  int64_t m = x_shape_vector[dims - kNum2];
  int64_t k = x_shape_vector[dims - 1];
  int64_t n = 1;
  auto b_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 1);
  if (b_shape_vector.size() > 1) {
    n = b_shape_vector[dims - 1];
  }
  int64_t data_num = 1;
  data_num = GetNumElements(x_shape_vector.begin(), x_shape_vector.end(), data_num);
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  if (data_num >= kParallelDataNum) {
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    ParallelLaunchAutoSearch(sharder_matrix_solve_ls, batch, this, &parallel_search_info_);
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return kMatrixSolveLsComputeOk;
}

template <typename T>
bool MatrixSolveLsCpuKernelMod::ComplexQr(const std::vector<kernel::AddressPtr> &inputs,
                                          const std::vector<kernel::AddressPtr> &outputs) {
  auto x_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 0);
  auto dims = x_shape_vector.size();
  int64_t m = x_shape_vector[dims - kNum2];
  int64_t k = x_shape_vector[dims - 1];
  int64_t n = 1;
  auto b_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 1);
  if (b_shape_vector.size() > 1) {
    n = b_shape_vector[dims - 1];
  }
  int64_t data_num = 1;
  data_num = GetNumElements(x_shape_vector.begin(), x_shape_vector.end(), data_num);
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  auto aptr = reinterpret_cast<std::complex<T> *>(inputs[0]->addr);
  auto bptr = reinterpret_cast<std::complex<T> *>(inputs[1]->addr);
  auto xptr = reinterpret_cast<std::complex<T> *>(outputs[0]->addr);
  if (data_num >= kParallelDataNum) {
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    ParallelLaunchAutoSearch(sharder_matrix_solve_ls, batch, this, &parallel_search_info_);
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return kMatrixSolveLsComputeOk;
}

template <typename T>
bool MatrixSolveLsCpuKernelMod::RealCholesky(const std::vector<kernel::AddressPtr> &inputs,
                                             const std::vector<kernel::AddressPtr> &outputs) {
  auto x_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 0);
  auto dims = x_shape_vector.size();
  auto aptr = reinterpret_cast<T *>(inputs[0]->addr);
  auto bptr = reinterpret_cast<T *>(inputs[1]->addr);
  auto xptr = reinterpret_cast<T *>(outputs[0]->addr);
  auto l2 = reinterpret_cast<double *>(inputs[2]->addr);
  int64_t m = x_shape_vector[dims - kNum2];
  int64_t k = x_shape_vector[dims - 1];
  int64_t n = 1;
  auto b_shape_vector = AnfAlgo::GetInputDeviceShape(node_wpt_, 1);
  if (b_shape_vector.size() > 1) {
    n = b_shape_vector[dims - 1];
  }
  int64_t data_num = 1;
  data_num = GetNumElements(x_shape_vector.begin(), x_shape_vector.end(), data_num);
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  if (data_num >= kParallelDataNum) {
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    ParallelLaunchAutoSearch(sharder_matrix_solve_ls, batch, this, &parallel_search_info_);
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return kMatrixSolveLsComputeOk;
}

void MatrixSolveLsCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
  node_wpt_ = kernel_node;
  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
  if (common::AnfAlgo::HasNodeAttr(kFast, kernel_node)) {
    qr_chole = common::AnfAlgo::GetNodeAttr<bool>(kernel_node, kFast);
  } else {
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the attribute 'fast' does not exist.";
  }
  auto kernel_attr = GetKernelAttrFromNode(kernel_node);
  auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
  if (!is_match) {
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr;
  }
  kernel_func_ = func_list_[index].second;
}

bool MatrixSolveLsCpuKernelMod::Resize(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto shapea = AnfAlgo::GetInputDeviceShape(node_wpt_, 0);
  auto shapeb = AnfAlgo::GetInputDeviceShape(node_wpt_, 1);
  auto shapel2 = AnfAlgo::GetInputDeviceShape(node_wpt_, 2);
  auto shapex = AnfAlgo::GetOutputDeviceShape(node_wpt_, 0);
  auto dims = shapea.size();

  if (shapeb.size() == 1) {
    if (shapea[dims - kNum2] != shapeb[0]) {
      MS_EXCEPTION(ValueError) << "For " << kernel_name_ << ", #Rows mismatch between A and rhs: "
                               << "#Rows of A = [" << shapea[dims - kNum2] << "], "
                               << "#Rows of rhs = [" << shapeb[0] << "].";
      return kMatrixSolveLsComputeFailed;
    }
  } else {
    if (shapea[dims - kNum2] != shapeb[dims - kNum2]) {
      MS_EXCEPTION(ValueError) << "For " << kernel_name_ << ", #Rows mismatch between A and rhs: "
                               << "#Rows of A = [" << shapea[dims - kNum2] << "], "
                               << "#Rows of rhs = [" << shapeb[dims - kNum2] << "].";
      return kMatrixSolveLsComputeFailed;
    }
  }

  if (shapel2.size() != 0) {
    MS_EXCEPTION(ValueError) << "For " << kernel_name_ << ", tensor l2 should be a scalar.";
    return kMatrixSolveLsComputeFailed;
  }
  if (shapeb.size() == 1) {
    if ((shapex.size() != shapeb.size()) || (shapea[dims - 1] != shapex[0]) || (shapex.back() != shapeb[0])) {
      MS_EXCEPTION(ValueError) << "For " << kernel_name_ << ", tensor y shape mismatch.";
      return kMatrixSolveLsComputeFailed;
    }
  } else {
    if ((shapex.size() != shapeb.size()) || (shapea[dims - 1] != shapex[shapex.size() - kNum2]) ||
        (shapex.back() != shapeb.back())) {
      MS_EXCEPTION(ValueError) << "For " << kernel_name_ << ", tensor y shape mismatch.";
      return kMatrixSolveLsComputeFailed;
    }
  }
  return kMatrixSolveLsComputeOk;
}

template <typename T>
bool MatrixSolveLsCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                                             const std::vector<AddressPtr> &outputs) {
  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputNum, kernel_name_);
  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputNum, kernel_name_);

  auto a_data_type = AnfAlgo::GetInputDeviceDataType(node_wpt_, 0);
  auto b_data_type = AnfAlgo::GetInputDeviceDataType(node_wpt_, 1);

  if (!Resize(inputs, outputs)) {
    return kMatrixSolveLsComputeFailed;
  }

  if (a_data_type != b_data_type) {
    MS_EXCEPTION(TypeError) << "For " << kernel_name_ << ", data types of matrix and rhs mismatch.";
    return kMatrixSolveLsComputeFailed;
  }
  if (a_data_type != kNumberTypeFloat32 && a_data_type != kNumberTypeFloat64 && a_data_type != kNumberTypeComplex64 &&
      a_data_type != kNumberTypeComplex128) {
    MS_EXCEPTION(TypeError) << "For " << kernel_name_ << ", unsupported data type: " << TypeIdLabel(a_data_type) << ".";
    return kMatrixSolveLsComputeFailed;
  }

  // 'fast' == true selects the Cholesky (normal equations) path; otherwise the
  // complete orthogonal decomposition path is used.
  if (qr_chole) {
    if (a_data_type == kNumberTypeComplex64) {
      return ComplexCholesky<float>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeComplex128) {
      return ComplexCholesky<double>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeFloat64) {
      return RealCholesky<double>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeFloat32) {
      return RealCholesky<float>(inputs, outputs);
    }
  } else {
    if (a_data_type == kNumberTypeComplex64) {
      return ComplexQr<float>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeComplex128) {
      return ComplexQr<double>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeFloat64) {
      return RealQr<double>(inputs, outputs);
    }
    if (a_data_type == kNumberTypeFloat32) {
      return RealQr<float>(inputs, outputs);
    }
  }
  return kMatrixSolveLsComputeOk;
}

std::vector<std::pair<KernelAttr, MatrixSolveLsCpuKernelMod::MatrixSolveLsFunc>> MatrixSolveLsCpuKernelMod::func_list_ =
  {{KernelAttr()
      .AddInputAttr(kNumberTypeFloat32)
      .AddInputAttr(kNumberTypeFloat32)
      .AddInputAttr(kNumberTypeFloat64)
      .AddOutputAttr(kNumberTypeFloat32),
    &MatrixSolveLsCpuKernelMod::LaunchKernel<float>},
   {KernelAttr()
      .AddInputAttr(kNumberTypeFloat64)
      .AddInputAttr(kNumberTypeFloat64)
      .AddInputAttr(kNumberTypeFloat64)
      .AddOutputAttr(kNumberTypeFloat64),
    &MatrixSolveLsCpuKernelMod::LaunchKernel<double>},
   {KernelAttr()
      .AddInputAttr(kNumberTypeComplex64)
      .AddInputAttr(kNumberTypeComplex64)
      .AddInputAttr(kNumberTypeFloat64)
      .AddOutputAttr(kNumberTypeComplex64),
    &MatrixSolveLsCpuKernelMod::LaunchKernel<std::complex<float>>},
   {KernelAttr()
      .AddInputAttr(kNumberTypeComplex128)
      .AddInputAttr(kNumberTypeComplex128)
      .AddInputAttr(kNumberTypeFloat64)
      .AddOutputAttr(kNumberTypeComplex128),
    &MatrixSolveLsCpuKernelMod::LaunchKernel<std::complex<double>>}};

std::vector<KernelAttr> MatrixSolveLsCpuKernelMod::GetOpSupport() {
  std::vector<KernelAttr> support_list;
  (void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
                       [](const std::pair<KernelAttr, MatrixSolveLsFunc> &pair) { return pair.first; });
  return support_list;
}

MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, MatrixSolveLs, MatrixSolveLsCpuKernelMod);
}  // namespace kernel
}  // namespace mindspore
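
The batched drivers above step raw pointers through contiguous per-matrix blocks; the equivalent NumPy view (an illustrative assumption, not kernel code):

    # Each batch element i occupies a[i*m*k:(i+1)*m*k] and b[i*m*n:(i+1)*m*n].
    import numpy as np

    def solve_batched(a_flat, b_flat, batch, m, k, n, solve_one):
        x_flat = np.empty(batch * k * n)
        for i in range(batch):
            a_i = a_flat[i * m * k:(i + 1) * m * k].reshape(m, k)
            b_i = b_flat[i * m * n:(i + 1) * m * n].reshape(m, n)
            x_flat[i * k * n:(i + 1) * k * n] = solve_one(a_i, b_i).ravel()
        return x_flat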

@@ -0,0 +1,85 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EIGEN_MATRIX_SOLVE_LS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EIGEN_MATRIX_SOLVE_LS_CPU_KERNEL_H_

#include <vector>
#include <utility>
#include <complex>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"

namespace mindspore {
namespace kernel {
class MatrixSolveLsCpuKernelMod : public DeprecatedNativeCpuKernelMod {
 public:
  MatrixSolveLsCpuKernelMod() = default;
  ~MatrixSolveLsCpuKernelMod() override = default;

  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override {
    return kernel_func_(this, inputs, workspace, outputs);
  }

 protected:
  std::vector<KernelAttr> GetOpSupport() override;

 private:
  bool Resize(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);
  template <typename T>
  bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
                    const std::vector<kernel::AddressPtr> &outputs);
  using MatrixSolveLsFunc =
    std::function<bool(MatrixSolveLsCpuKernelMod *, const std::vector<kernel::AddressPtr> &,
                       const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
  static std::vector<std::pair<KernelAttr, MatrixSolveLsFunc>> func_list_;
  MatrixSolveLsFunc kernel_func_;

  template <typename T>
  void RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, int64_t n);

  template <typename T>
  bool RealCholesky(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

  template <typename T>
  void RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n);

  template <typename T>
  bool RealQr(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

  template <typename T>
  void ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, double *l2,
                                    int64_t m, int64_t k, int64_t n);

  template <typename T>
  bool ComplexCholesky(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

  template <typename T>
  void ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, int64_t m,
                              int64_t k, int64_t n);

  template <typename T>
  bool ComplexQr(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

  // True selects the fast Cholesky path; false selects the QR/COD path.
  bool qr_chole{true};
  CNodePtr node_wpt_;
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EIGEN_MATRIX_SOLVE_LS_CPU_KERNEL_H_

@@ -141,6 +141,7 @@ constexpr auto kDiagonal = "Diagonal";
 constexpr auto kEditDistance = "EditDistance";
 constexpr auto kNextAfter = "NextAfter";
 constexpr auto kMaximumGradGrad = "MaximumGradGrad";
+constexpr auto kMatrixSolveLs = "MatrixSolveLs";
 constexpr auto kSparseSegmentMean = "SparseSegmentMean";
 constexpr auto kTridiagonalMatMul = "TridiagonalMatMul";
 constexpr auto kTridiagonalSolve = "TridiagonalSolve";
@@ -1272,6 +1273,7 @@ GVAR_DEF(PrimitivePtr, kPrimLinSpace, std::make_shared<Primitive>("LinSpace"));
 GVAR_DEF(PrimitivePtr, kPrimNonMaxSuppression, std::make_shared<Primitive>("NonMaxSuppression"));
 GVAR_DEF(PrimitivePtr, kPrimSign, std::make_shared<Primitive>("Sign"));
 GVAR_DEF(PrimitivePtr, kPrimACos, std::make_shared<Primitive>(kACos));
+GVAR_DEF(PrimitivePtr, kPrimMatrixSolveLs, std::make_shared<Primitive>(kMatrixSolveLs));
 GVAR_DEF(PrimitivePtr, kPrimAsinGrad, std::make_shared<Primitive>(kAsinGrad));
 GVAR_DEF(PrimitivePtr, kPrimACosGrad, std::make_shared<Primitive>(kACosGrad));
 GVAR_DEF(PrimitivePtr, kPrimAtanGrad, std::make_shared<Primitive>("AtanGrad"));

@@ -0,0 +1,104 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <set>
#include <map>
#include <string>
#include <vector>
#include <memory>
#include "ops/matrix_solve_ls.h"
#include "ops/op_utils.h"
#include "utils/check_convert_utils.h"
#include "mindapi/src/helper.h"

namespace mindspore {
namespace ops {
namespace {
abstract::ShapePtr MatrixSolveLsInferShape(const PrimitivePtr &primitive,
                                           const std::vector<AbstractBasePtr> &input_args) {
  MS_EXCEPTION_IF_NULL(primitive);
  auto prim_name = primitive->name();
  auto matrix_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
  auto rhs_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape())[kShape];
  auto l2_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[2]->BuildShape())[kShape];

  (void)CheckAndConvertUtils::CheckInteger("input matrix rank", SizeToLong(matrix_shape.size()), kGreaterEqual, 2L,
                                           prim_name);
  (void)CheckAndConvertUtils::CheckInteger("input rhs rank", SizeToLong(rhs_shape.size()), kGreaterEqual, 2L,
                                           prim_name);
  (void)CheckAndConvertUtils::CheckInteger("input l2 rank", SizeToLong(l2_shape.size()), kEqual, 0L, prim_name);

  // matrix: (*, M, N), rhs: (*, M, K) -> y: (*, N, K).
  constexpr size_t offset = 2;
  std::vector<int64_t> matrix_last(matrix_shape.end() - offset, matrix_shape.end());
  std::vector<int64_t> rhs_last(rhs_shape.end() - offset, rhs_shape.end());
  std::vector<int64_t> y_shape(rhs_shape.begin(), rhs_shape.end() - offset);
  int64_t matrix_row = matrix_last[0];
  int64_t matrix_col = matrix_last[1];
  int64_t rhs_row = rhs_last[0];
  int64_t rhs_col = rhs_last[1];

  for (size_t i = 0; i < matrix_shape.size() - offset; ++i) {
    if (matrix_shape[i] != rhs_shape[i]) {
      MS_EXCEPTION(ValueError) << "For " << prim_name << ", shapes in batch dimension must be the same, but dim[" << i
                               << "] is not: got matrix_dim[" << i << "]: " << matrix_shape[i] << ", rhs_dim[" << i
                               << "]: " << rhs_shape[i] << ".";
    }
  }

  if (matrix_row != rhs_row) {
    MS_EXCEPTION(ValueError) << "For MatrixSolveLs, matrix_row and rhs_row should be equal, but got matrix_row: "
                             << matrix_row << ", rhs_row: " << rhs_row << ".";
  }

  y_shape.push_back(matrix_col);
  y_shape.push_back(rhs_col);

  return std::make_shared<abstract::Shape>(y_shape);
}

TypePtr MatrixSolveLsInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
  MS_EXCEPTION_IF_NULL(primitive);
  const std::set<TypePtr> valid_types = {kFloat32, kFloat64, kComplex64, kComplex128};
  const std::set<TypePtr> l2_valid_types = {kFloat64};
  auto matrix_type = input_args[0]->BuildType();
  auto rhs_type = input_args[1]->BuildType();
  auto l2_type = input_args[2]->BuildType();
  std::map<std::string, TypePtr> types;
  (void)types.emplace("matrix", matrix_type);
  (void)types.emplace("rhs", rhs_type);

  (void)CheckAndConvertUtils::CheckTypeValid("matrix", matrix_type, valid_types, primitive->name());
  (void)CheckAndConvertUtils::CheckTypeValid("rhs", rhs_type, valid_types, primitive->name());
  (void)CheckAndConvertUtils::CheckTypeValid("l2_regularizer", l2_type, l2_valid_types, primitive->name());
  CheckAndConvertUtils::CheckTensorTypeSame(types, valid_types, primitive->name());

  return matrix_type;
}
}  // namespace

AbstractBasePtr MatrixSolveLsInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                   const std::vector<AbstractBasePtr> &input_args) {
  MS_EXCEPTION_IF_NULL(primitive);
  const int64_t input_num = 3;
  CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, input_num, primitive->name());
  auto infer_type = MatrixSolveLsInferType(primitive, input_args);
  auto infer_shape = MatrixSolveLsInferShape(primitive, input_args);
  return abstract::MakeAbstract(infer_shape, infer_type);
}
REGISTER_PRIMITIVE_EVAL_IMPL(MatrixSolveLs, prim::kPrimMatrixSolveLs, MatrixSolveLsInfer, nullptr, true);
}  // namespace ops
}  // namespace mindspore
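
The shape rule implemented by the infer function above is simple enough to state as a sketch (plain Python, illustrative only):

    # matrix: (*, M, N), rhs: (*, M, K) -> y: (*, N, K); batch dims and M must match.
    def matrix_solve_ls_shape(matrix_shape, rhs_shape):
        assert matrix_shape[:-2] == rhs_shape[:-2], "batch dimensions must match"
        assert matrix_shape[-2] == rhs_shape[-2], "row counts must match"
        return list(rhs_shape[:-2]) + [matrix_shape[-1], rhs_shape[-1]]

    assert matrix_solve_ls_shape((5, 4, 3), (5, 4, 2)) == [5, 3, 2]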

@@ -0,0 +1,44 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CORE_OPS_MATRIX_SOLVE_LS_H
#define MINDSPORE_CORE_OPS_MATRIX_SOLVE_LS_H
#include <vector>
#include <memory>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"

namespace mindspore {
namespace ops {
constexpr auto kNameMatrixSolveLs = "MatrixSolveLs";

class MIND_API MatrixSolveLs : public BaseOperator {
 public:
  MIND_API_BASE_MEMBER(MatrixSolveLs);

  /// \brief Constructor.
  MatrixSolveLs() : BaseOperator(kNameMatrixSolveLs) { InitIOName({"matrix", "rhs", "l2"}, {"y"}); }
  /// \brief Init. Refer to the parameters of Python API @ref mindspore.ops.MatrixSolveLs for the inputs.
  void Init() const {}
};

abstract::AbstractBasePtr MatrixSolveLsInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                             const std::vector<abstract::AbstractBasePtr> &input_args);

using PrimMatrixSolveLsPtr = std::shared_ptr<MatrixSolveLs>;
}  // namespace ops
}  // namespace mindspore
#endif  // MINDSPORE_CORE_OPS_MATRIX_SOLVE_LS_H

@@ -51,6 +51,7 @@ from mindspore.ops.operations.math_ops import LuUnpack
 from mindspore.ops.operations.math_ops import MatrixExp
 from mindspore.ops.operations.math_ops import CumulativeLogsumexp
 from mindspore.ops.operations.math_ops import MatrixSolve
+from mindspore.ops.operations.math_ops import MatrixSolveLs
 from mindspore.ops.operations.math_ops import MatrixPower
 from mindspore.ops.operations.math_ops import Median
 from mindspore.ops.operations.math_ops import MatrixTriangularSolve

@@ -74,6 +75,7 @@ from mindspore.ops._grad.grad_base import bprop_getters, create_tensor_by_element
 from mindspore.ops._grad.grad_base import dyn_ones, dyn_fill, sum_grad_reduce_axis
 from mindspore.ops._grad.grad_math_ops import binop_grad_common
 from mindspore.ops.operations.array_ops import MatrixBandPart
+from mindspore.ops.operations.array_ops import ConjugateTranspose

 transpose = P.Transpose()
 dyn_shape_op = P.TensorShape()
@@ -673,6 +675,155 @@ def get_bprop_matrix_solve(self):
     return bprop


+@constexpr
+def _generate_perm_matrix_solve_ls(x_dim):
+    perm = tuple(range(x_dim - 2))
+    perm = perm + (x_dim - 1, x_dim - 2)
+    return perm
+
+
+@bprop_getters.register(MatrixSolveLs)
+def get_bprop_matrix_solve_ls(self):
+    """Grad definition for 'MatrixSolveLs' operation"""
+    fast = self.fast
+    cast = P.Cast()
+    neg = P.Neg()
+    rank = P.Rank()
+    cholesky = Cholesky()
+    eye = P.Eye()
+    add = P.Add()
+    mul = P.Mul()
+    matmul = P.MatMul()
+    batch_matmul = P.BatchMatMul()
+    cholesky_solve = CholeskySolve()
+    _transpose = Transpose()
+    conjugate_transpose = ConjugateTranspose()
+    shape = P.Shape()
+    _complex = P.Complex()
+
+    def regularized_gramian_cholesky(matrix, l2, first_kind):
+        matrix_dim = rank(matrix)
+        perm = _generate_perm_matrix_solve_ls(matrix_dim)
+        if matrix.dtype in (mstype.complex64, mstype.complex128):
+            matrix_temp = conjugate_transpose(matrix, perm)
+        else:
+            matrix_temp = _transpose(matrix, perm)
+        if first_kind:
+            if matrix_dim > 2:
+                gramian = batch_matmul(matrix_temp, matrix)
+            else:
+                gramian = matmul(matrix_temp, matrix)
+        else:
+            if matrix_dim > 2:
+                gramian = batch_matmul(matrix, matrix_temp)
+            else:
+                gramian = matmul(matrix, matrix_temp)
+        if isinstance(l2, Tensor) or l2 != 0:
+            matrix_shape = shape(matrix)
+            if first_kind:
+                small_dim = matrix_shape[-1]
+            else:
+                small_dim = matrix_shape[-2]
+            identity = eye(small_dim, small_dim, matrix.dtype)
+            gramian = add(gramian, mul(l2, identity))
+
+        # Cholesky does not support complex dtypes for now.
+        return cholesky(gramian)
+
+    def bprop(matrix, rhs, l2, out, dout):
+        # Supported dtype: float32; supported dimensions: 2D and 3D.
+        def over_determined(matrix, rhs, out, l2, dout):
+            if matrix.dtype == mstype.complex64:
+                l2_regularizer = _complex(cast(l2, mstype.float32), Tensor(0, mstype.float32))
+            elif matrix.dtype == mstype.complex128:
+                l2_regularizer = _complex(cast(l2, mstype.float64), Tensor(0, mstype.float64))
+            else:
+                l2_regularizer = cast(l2, matrix.dtype)
+            chol = cast(regularized_gramian_cholesky(matrix, l2_regularizer, first_kind=True), matrix.dtype)
+            # CholeskySolve does not support complex dtypes and only supports 2D or 3D matrices for now.
+            z = cholesky_solve(dout, chol)
+
+            matrix_dim = rank(matrix)
+            perm = _generate_perm_matrix_solve_ls(matrix_dim)
+            if matrix.dtype in (mstype.complex64, mstype.complex128):
+                z_temp = conjugate_transpose(z, perm)
+            else:
+                z_temp = _transpose(z, perm)
+            if matrix_dim > 2:
+                xzt = batch_matmul(out, z_temp)
+            else:
+                xzt = matmul(out, z_temp)
+            zx_sym = add(xzt, _transpose(xzt, perm))
+
+            if matrix_dim > 2:
+                grad_a = add(neg(batch_matmul(matrix, zx_sym)), batch_matmul(rhs, z_temp))
+                grad_b = batch_matmul(matrix, z)
+            else:
+                grad_a = add(neg(matmul(matrix, zx_sym)), matmul(rhs, z_temp))
+                grad_b = matmul(matrix, z)
+
+            return (grad_a, grad_b, None)
+
+        def under_determined(matrix, rhs, l2, dout):
+            if matrix.dtype == mstype.complex64:
+                l2_regularizer = _complex(cast(l2, mstype.float32), Tensor(0, mstype.float32))
+            elif matrix.dtype == mstype.complex128:
+                l2_regularizer = _complex(cast(l2, mstype.float64), Tensor(0, mstype.float64))
+            else:
+                l2_regularizer = cast(l2, matrix.dtype)
+            chol = cast(regularized_gramian_cholesky(matrix, l2_regularizer, first_kind=False), matrix.dtype)
+
+            matrix_dim = rank(matrix)
+            perm = _generate_perm_matrix_solve_ls(matrix_dim)
+            if matrix_dim > 2:
+                gramian = batch_matmul(matrix, dout)
+            else:
+                gramian = matmul(matrix, dout)
+            # CholeskySolve does not support complex dtypes and only supports 2D or 3D matrices for now.
+            grad_b = cholesky_solve(gramian, chol)
+            tmp = cholesky_solve(rhs, chol)
+
+            if matrix.dtype in (mstype.complex64, mstype.complex128):
+                tmp_temp = conjugate_transpose(tmp, perm)
+                matrix_temp = conjugate_transpose(matrix, perm)
+            else:
+                tmp_temp = _transpose(tmp, perm)
+                matrix_temp = _transpose(matrix, perm)
+            if matrix_dim > 2:
+                a1 = batch_matmul(tmp_temp, matrix)
+                a1 = neg(batch_matmul(grad_b, a1))
+                a2 = dout - batch_matmul(matrix_temp, grad_b)
+                if matrix.dtype in (mstype.complex64, mstype.complex128):
+                    a2_temp = conjugate_transpose(a2, perm)
+                else:
+                    a2_temp = _transpose(a2, perm)
+                a2 = batch_matmul(tmp, a2_temp)
+            else:
+                a1 = matmul(tmp_temp, matrix)
+                a1 = neg(matmul(grad_b, a1))
+                a2 = dout - matmul(matrix_temp, grad_b)
+                if matrix.dtype in (mstype.complex64, mstype.complex128):
+                    a2_temp = conjugate_transpose(a2, perm)
+                else:
+                    a2_temp = _transpose(a2, perm)
+                a2 = matmul(tmp, a2_temp)
+
+            grad_a = add(a1, a2)
+            return (grad_a, grad_b, None)
+
+        if fast is False:
+            raise ValueError("For MatrixSolveLs, gradient not defined for fast=False")
+        matrix_shape = shape(matrix)[-2:]
+
+        if matrix_shape[-2] >= matrix_shape[-1]:
+            return over_determined(matrix, rhs, out, l2, dout)
+
+        return under_determined(matrix, rhs, l2, dout)
+
+    return bprop
+
+
 @bprop_getters.register(P.MatrixDeterminant)
 def get_bprop_matrix_determinant(self):
     """Generate bprop for MatrixDeterminant"""
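
A usage sketch for exercising the bprop above (hedged: assumes a MindSpore build with the op registered, PyNative mode on CPU):

    # Differentiate MatrixSolveLs w.r.t. its inputs via GradOperation.
    import numpy as np
    import mindspore as ms
    from mindspore import Tensor, ops

    matrix_solve_ls = ops.MatrixSolveLs(fast=True)
    grad_all = ops.GradOperation(get_all=True)

    def forward(matrix, rhs, l2):
        return matrix_solve_ls(matrix, rhs, l2)

    matrix = Tensor(np.random.rand(3, 2).astype(np.float32))
    rhs = Tensor(np.random.rand(3, 1).astype(np.float32))
    l2 = Tensor(0.1, ms.float64)
    grads = grad_all(forward)(matrix, rhs, l2)  # gradients w.r.t. matrix and rhs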
@@ -0,0 +1,36 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""MatrixSolveLs op"""
from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType

matrix_solve_ls_op_info = AiCPURegOp("MatrixSolveLs") \
    .fusion_type("OPAQUE") \
    .attr("fast", "bool") \
    .input(0, "matrix", "required") \
    .input(1, "rhs", "required") \
    .input(2, "l2", "required") \
    .output(0, "y", "required") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F64_Default, DataType.F32_Default) \
    .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.F64_Default, DataType.F64_Default) \
    .dtype_format(DataType.C64_Default, DataType.C64_Default, DataType.F64_Default, DataType.C64_Default) \
    .dtype_format(DataType.C128_Default, DataType.C128_Default, DataType.F64_Default, DataType.C128_Default) \
    .get_op_info()


@op_info_register(matrix_solve_ls_op_info)
def _matrix_solve_ls_aicpu():
    """MatrixSolveLs aicpu register"""
    return
@@ -1518,9 +1518,8 @@ class MatMul(PrimitiveWithCheck):

     def check_dtype(self, x1, x2):
         args = {"x1": x1, "x2": x2}
-        validator.check_tensors_dtypes_same_and_valid(args,
-                                                      mstype.float_type + mstype.int_type + (mstype.complex64,),
-                                                      self.name)
+        validator.check_tensors_dtypes_same_and_valid(args, mstype.float_type + mstype.int_type
+                                                      + (mstype.complex64, mstype.complex128), self.name)


 class BatchMatMul(Primitive):
@@ -6264,6 +6263,57 @@ class MatrixSolve(Primitive):
         self.adjoint = validator.check_value_type("adjoint", adjoint, [bool], self.name)


+class MatrixSolveLs(Primitive):
+    r"""
+    Solves one or more linear least-squares problems.
+
+    If `fast` is `True`, then the solution is computed by solving the normal equations using Cholesky decomposition.
+    If `fast` is `False`, an algorithm based on the numerically robust complete orthogonal decomposition is used. This
+    path is typically 6-7 times slower than the fast path. If `fast` is `False`, then `l2_regularizer` is ignored.
+
+    Args:
+        fast (bool): An optional bool. Defaults to True.
+
+    Inputs:
+        - **matrix** (Tensor) - A Tensor. Must be one of the following data types: float64, float32, complex64,
+          complex128. Shape is :math:`(*, M, N)`.
+        - **rhs** (Tensor) - A Tensor. Must have the same data type as `matrix`. Shape is :math:`(*, M, K)`.
+          `matrix` and `rhs` should have the same dimensions except the last one.
+        - **l2_regularizer** (Tensor) - A scalar Tensor of type float64.
+
+    Outputs:
+        Tensor of shape :math:`(*, N, K)` with the same data type as `matrix`.
+
+    Raises:
+        TypeError: If `matrix`, `rhs` or `l2_regularizer` is not a tensor.
+        TypeError: If the data type of `matrix` or `rhs` is not one of float32, float64, complex64 and complex128.
+        TypeError: If `l2_regularizer` is not float64.
+        TypeError: If `fast` is not bool.
+        ValueError: If the dimension of `matrix` or `rhs` is less than 2.
+        ValueError: If the shape of `matrix` does not match the shape of `rhs`.
+
+    Supported Platforms:
+        ``CPU``
+
+    Examples:
+        >>> matrix_solve_ls = ops.MatrixSolveLs(fast=True)
+        >>> matrix = Tensor([[3, 0, 0, 0], [2, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]], mstype.float32)
+        >>> rhs = Tensor(np.array([[4], [2], [4], [2]]), mstype.float32)
+        >>> l2 = Tensor(0.0, mstype.float64)
+        >>> output = matrix_solve_ls(matrix, rhs, l2)
+        >>> print(output)
+        [[ 1.3333334]
+         [-0.6666667]
+         [ 2.6666665]
+         [-1.3333333]]
+    """
+
+    @prim_attr_register
+    def __init__(self, fast=True):
+        """Initialize MatrixSolveLs"""
+        validator.check_value_type('fast', fast, [bool], self.name)
+
+
 class LuSolve(Primitive):
     """
     Return the solution of the linear equation Ax = b.
@@ -51,6 +51,7 @@ from mindspore.ops.operations.math_ops import MatrixExp
 from mindspore.ops.operations.math_ops import FFTWithSize
 from mindspore.ops.operations.math_ops import MatrixPower
 from mindspore.ops.operations.math_ops import MatrixSolve
+from mindspore.ops.operations.math_ops import MatrixSolveLs
 from mindspore.ops.operations.math_ops import MatrixLogarithm
 from mindspore.ops.operations.math_ops import CholeskySolve
 from mindspore.ops.operations.math_ops import InplaceIndexAdd

@@ -2484,6 +2485,12 @@
         'desc_inputs': [Tensor(np.array([[[1., 4.], [2., 7.]], [[1., 4.], [2., 7.]]]).astype(np.float32)),
                         Tensor(np.array([[[1.], [3.]], [[1.], [3.]]]).astype(np.float32))],
         'desc_bprop': [Tensor(np.array([[[1.], [1.]], [[1.], [1.]]]).astype(np.float32))]}),
+    ('MatrixSolveLs', {
+        'block': MatrixSolveLs(fast=True),
+        'desc_inputs': [Tensor(np.array([[[1., 4.], [2., 7.]], [[1., 4.], [2., 7.]]]).astype(np.float32)),
+                        Tensor(np.array([[[1.], [3.]], [[1.], [3.]]]).astype(np.float32)),
+                        Tensor(np.random.uniform(0.0, 5.0), mstype.float64)],
+        'desc_bprop': [Tensor(np.array([[[1.], [1.]], [[1.], [1.]]]).astype(np.float32))]}),
     ('MatrixDeterminant', {
         'block': P.MatrixDeterminant(),
         'desc_inputs': [Tensor(np.array([[[-1, -2], [-3, -4]], [[5, 6], [7, 8]]]).astype(np.float32))],