!40215 [assistant][ops] Add new gpu operator Lu

Merge pull request !40215 from LiuMingwu/Lu
i-robot 2022-11-29 01:37:41 +00:00 committed by Gitee
commit ad88c0e1f6
11 changed files with 913 additions and 345 deletions
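
For reference, the new operator computes an LU factorization with partial pivoting on GPU and returns the packed LU factors together with a row-permutation vector p. A minimal usage sketch, mirroring the Lu docstring example added later in this PR (the matrix values below are taken from that example):

import numpy as np
import mindspore
import mindspore.context as context
from mindspore import Tensor, ops

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
a = Tensor(np.array([[2.5, 3.1, 3.5],
                     [4.7, 1.9, 0.2],
                     [1.1, 3.6, 2.0]]), mindspore.float32)
# output_idx_type selects the dtype of the returned permutation indices (int32 or int64).
lu, p = ops.Lu(output_idx_type=mindspore.int32)(a)
print(lu)  # packed factors: strictly lower part is L (unit diagonal implied), upper part is U
print(p)   # row permutation indices, [1 2 0] for this input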

View File

@@ -15,22 +15,315 @@
*/
#include "plugin/device/gpu/kernel/math/lu_gpu_kernel.h"
#include <iostream>
#include <functional>
#include <utility>
#include <string>
#include <algorithm>
#include "abstract/utils.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_transpose_impl.cuh"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(LU,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
LUGpuKernelMod, float)
bool LuGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->name();
if (inputs.empty() || outputs.empty()) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', it got empty inputs or outputs, which is invalid.";
return false;
}
if (!MatchKernelFunc(base_operator, inputs, outputs)) {
return false;
}
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
cublas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
return true;
}
MS_REG_GPU_KERNEL_ONE(LU,
KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
LUGpuKernelMod, double)
int LuGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) {
if (int ret = KernelMod::Resize(base_operator, inputs, outputs); ret != KRET_OK) {
return ret;
}
unit_size_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
auto in_shape = inputs.at(kIndex0)->GetShapeVector();
(void)std::transform(in_shape.begin(), in_shape.end(), std::back_inserter(in_shape_), LongToSize);
if (!CheckLuShape()) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', input shape init failed.";
return KRET_RESIZE_FAILED;
}
// a device addr to place lu factor return code
workspace_size_list_.push_back(sizeof(int));
// transpose workspace
workspace_size_list_.push_back(batch_size_ * m_ * n_ * unit_size_);
// pivot indices for each matrix in the batch
workspace_size_list_.push_back(batch_size_ * n_ * sizeof(int));
// array of device pointers to each batch matrix, used by the batched cuBLAS path
workspace_size_list_.push_back(batch_size_ * sizeof(void *));
// per-batch factorization return codes (info) for the batched cuBLAS path
workspace_size_list_.push_back(batch_size_ * sizeof(int));
return KRET_OK;
}
void LuGpuKernelMod::ResetResource() noexcept {
is_null_input_ = false;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}
template <typename T>
void LuGpuKernelMod::BufferSize(T *batch_output_addr, int *lwork) {
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnSgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, lwork),
"cusolver query lu work size fail");
} else if constexpr (std::is_same_v<T, double>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnDgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, lwork),
"cusolver query lu work size fail");
} else if constexpr (std::is_same_v<T, utils::Complex<float>>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnCgetrf_bufferSize(handle_, m_, n_, reinterpret_cast<cuComplex *>(batch_output_addr), lda_, lwork),
"cusolver query lu work size fail");
} else {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnZgetrf_bufferSize(handle_, m_, n_, reinterpret_cast<cuDoubleComplex *>(batch_output_addr), lda_, lwork),
"cusolver query lu work size fail");
}
}
template <typename T, typename S>
void LuGpuKernelMod::LaunchKernel_CuSolve(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(cuda_stream_)),
"cusolverDnSetStream failed");
T *batch_input_addr = GetDeviceAddress<T>(inputs, kDim0);
T *batch_output_addr = GetDeviceAddress<T>(outputs, kDim0);
T *d_work_ = nullptr;
S *batch_piv_output_addr = nullptr;
if (pivot_on_) {
batch_piv_output_addr = GetDeviceAddress<S>(outputs, kDim1);
}
int *info_output_addr = GetDeviceAddress<int>(workspace, kDim0);
T *dev_work = GetDeviceAddress<T>(workspace, kDim1);
int *dev_batch_piv = GetDeviceAddress<int>(workspace, kDim2);
// query working space of getrf
BufferSize(batch_output_addr, &lwork_);
// Transpose input data from rowMajor to colMajor.
MatrixTranspose(batch_input_addr, SizeToInt(input_elements_), m_, m_, dev_work, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
// malloc device working space of getrf
d_work_ = reinterpret_cast<T *>(device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(unit_size_ * lwork_));
for (size_t batch = 0; batch < batch_size_; ++batch) {
S *piv_output_addr = batch_piv_output_addr + batch * k_;
int *dev_piv = dev_batch_piv + batch * k_;
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSgetrf(handle_, m_, n_, dev_work + batch * m_ * n_, lda_, d_work_, dev_piv, info_output_addr),
"cusolver lu fail");
} else if constexpr (std::is_same_v<T, double>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnDgetrf(handle_, m_, n_, dev_work + batch * m_ * n_, lda_, d_work_, dev_piv, info_output_addr),
"cusolver lu fail");
} else if constexpr (std::is_same_v<T, utils::Complex<float>>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnCgetrf(handle_, m_, n_, reinterpret_cast<cuComplex *>(dev_work + batch * m_ * n_), lda_,
reinterpret_cast<cuComplex *>(d_work_), dev_piv, info_output_addr),
"cusolver lu fail");
} else {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnZgetrf(handle_, m_, n_, reinterpret_cast<cuDoubleComplex *>(dev_work + batch * m_ * n_), lda_,
reinterpret_cast<cuDoubleComplex *>(d_work_), dev_piv, info_output_addr),
"cusolver lu fail");
}
std::vector<int> host_permuted(k_, 0);
std::vector<int> host_pivots(k_, 0);
std::vector<S> host_p(k_, 0);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(host_pivots.data(), dev_piv, sizeof(int) * k_, cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
"cudaMemcpyAsync failed in LuGpuKernelMod::Launch copy pivots to host.");
// Convert the 1-based LAPACK pivots into a 0-based row permutation.
for (size_t i = 0; i < k_; ++i) {
host_pivots[i] -= 1;
host_permuted[i] = i;
}
for (size_t i = 0; i < k_; ++i) {
int tmp_value = host_permuted[i];
host_permuted[i] = host_permuted[host_pivots[i]];
host_permuted[host_pivots[i]] = tmp_value;
}
for (size_t i = 0; i < k_; ++i) {
host_p[i] = host_permuted[i];
}
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(piv_output_addr, host_p.data(), sizeof(S) * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
"cudaMemcpyAsync failed in LuGpuKernelMod::Launch copy pivots array.");
}
MatrixTranspose(dev_work, SizeToInt(input_elements_), m_, m_, batch_output_addr, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work_);
}
template <typename T, typename S>
void LuGpuKernelMod::LaunchKernel_Cublas(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
T *batch_input_addr = GetDeviceAddress<T>(inputs, kDim0);
T *batch_output_addr = GetDeviceAddress<T>(outputs, kDim0);
S *batch_piv_output_addr = nullptr;
if (pivot_on_) {
batch_piv_output_addr = GetDeviceAddress<S>(outputs, kDim1);
}
T *dev_transpose_work = GetDeviceAddress<T>(workspace, kDim1);
auto dev_batch_piv = GetDeviceAddress<int>(workspace, kDim2);
auto batch_lu_device_address = GetDeviceAddress<T *>(workspace, kDim3);
auto info = GetDeviceAddress<int>(workspace, kDim4);
std::vector<T *> batch_lu_address_data;
for (size_t i = 0; i < batch_size_; i++) {
batch_lu_address_data.emplace_back(dev_transpose_work + i * m_ * m_);
}
CHECK_CUDA_RET_WITH_ERROR_NOTRACE(
cudaMemcpyAsync(batch_lu_device_address, batch_lu_address_data.data(), sizeof(T *) * batch_size_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(cuda_stream_)),
"LuGpuKernelMod cudaMemcpyAsync Fail");
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(cuda_stream_)),
"For LuGpuKernelMod cublasSetStream Fail");
// Transpose input data from rowMajor to colMajor.
MatrixTranspose(batch_input_addr, SizeToInt(input_elements_), m_, m_, dev_transpose_work, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
if constexpr (std::is_same_v<T, float>) {
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
cublasSgetrfBatched(cublas_handle_, m_, reinterpret_cast<float **>(batch_lu_device_address), m_, dev_batch_piv,
info, SizeToInt(batch_size_)),
"LuGpuKernelMod cublasSgetrfBatched Fail");
} else if constexpr (std::is_same_v<T, double>) {
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
cublasDgetrfBatched(cublas_handle_, m_, reinterpret_cast<double **>(batch_lu_device_address), m_, dev_batch_piv,
info, SizeToInt(batch_size_)),
"LuGpuKernelMod cublasDgetrfBatched Fail");
} else if constexpr (std::is_same_v<T, utils::Complex<float>>) {
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
cublasCgetrfBatched(cublas_handle_, m_, reinterpret_cast<cuComplex **>(batch_lu_device_address), m_,
dev_batch_piv, info, SizeToInt(batch_size_)),
"LuGpuKernelMod cublasCgetrfBatched Fail");
} else if constexpr (std::is_same_v<T, utils::Complex<double>>) {
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
cublasZgetrfBatched(cublas_handle_, m_, reinterpret_cast<cuDoubleComplex **>(batch_lu_device_address), m_,
dev_batch_piv, info, SizeToInt(batch_size_)),
"LuGpuKernelMod cublasZgetrfBatched Fail");
} else {
MS_LOG(ERROR) << "For '" << kernel_name_
<< "', it's the input data type must be float32, float64, complex64 or complex128.";
}
MatrixTranspose(dev_transpose_work, SizeToInt(input_elements_), m_, m_, batch_output_addr, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
std::vector<int> host_permuted(batch_size_ * k_, 0);
std::vector<int> host_pivots(batch_size_ * k_, 0);
std::vector<S> host_p(batch_size_ * k_, 0);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(host_pivots.data(), dev_batch_piv, sizeof(int) * batch_size_ * k_, cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
"cudaMemcpyAsync failed in LuGpuKernelMod::Launch copy pivots to host.");
for (size_t i = 0; i < batch_size_; ++i) {
for (size_t j = 0; j < k_; ++j) {
host_permuted[i * k_ + j] = j;
host_pivots[i * k_ + j] -= 1;
}
for (size_t j = 0; j < k_; ++j) {
int tmp_value = host_permuted[i * k_ + j];
host_permuted[i * k_ + j] = host_permuted[i * k_ + host_pivots[i * k_ + j]];
host_permuted[i * k_ + host_pivots[i * k_ + j]] = tmp_value;
}
}
for (size_t i = 0; i < batch_size_; ++i) {
for (size_t j = 0; j < k_; ++j) {
host_p[i * k_ + j] = host_permuted[i * k_ + j];
}
}
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(batch_piv_output_addr, host_p.data(), sizeof(S) * batch_size_ * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
"cudaMemcpyAsync failed in LuGpuKernelMod::Launch copy pivots array.");
}
template <typename T, typename S>
bool LuGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
// Empirically, the batched cuBLAS api is faster for small matrices or large batches (m_ / batch_size_ <= 128);
// otherwise, the non-batched cuSOLVER api is faster.
const size_t kNumber128 = 128;
if (m_ / batch_size_ <= kNumber128) {
LaunchKernel_Cublas<T, S>(inputs, workspace, outputs);
} else {
LaunchKernel_CuSolve<T, S>(inputs, workspace, outputs);
}
return true;
}
bool LuGpuKernelMod::CheckLuShape() {
constexpr size_t lu_min_dim = 1;
if (in_shape_.size() <= lu_min_dim) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the rank of the input is " << in_shape_.size()
<< ", which is invalid; it must be at least 2.";
return false;
}
constexpr size_t lu_reverse_row_dim = 2;
lu_row_ = in_shape_.at(in_shape_.size() - lu_reverse_row_dim);
lu_col_ = in_shape_.at(in_shape_.size() - 1);
input_elements_ = std::accumulate(in_shape_.begin(), in_shape_.end(), size_t(1), std::multiplies<size_t>());
batch_size_ = lu_min_dim;
for (int batch = 0; batch < static_cast<int>(in_shape_.size() - lu_reverse_row_dim); ++batch) {
batch_size_ *= in_shape_.at(batch);
}
// set matrix row or col to be lead dimension
m_ = SizeToInt(lu_row_);
n_ = SizeToInt(lu_col_);
k_ = std::min(lu_row_, lu_col_);
lda_ = m_;
ldb_ = n_;
return true;
}
const std::vector<std::pair<KernelAttr, LuGpuKernelMod::KernelRunFunc>> &LuGpuKernelMod::GetFuncList() const {
static const std::vector<std::pair<KernelAttr, LuGpuKernelMod::KernelRunFunc>> func_list = {
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
&LuGpuKernelMod::LaunchKernel<float, int>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeInt32),
&LuGpuKernelMod::LaunchKernel<double, int>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeInt32),
&LuGpuKernelMod::LaunchKernel<utils::Complex<float>, int>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeInt32),
&LuGpuKernelMod::LaunchKernel<utils::Complex<double>, int>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt64),
&LuGpuKernelMod::LaunchKernel<float, int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeInt64),
&LuGpuKernelMod::LaunchKernel<double, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeInt64),
&LuGpuKernelMod::LaunchKernel<utils::Complex<float>, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeInt64),
&LuGpuKernelMod::LaunchKernel<utils::Complex<double>, int64_t>},
};
return func_list;
}
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, Lu, LuGpuKernelMod);
} // namespace kernel
} // namespace mindspore
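
The least obvious step in both launch paths above is the host-side loop that turns the 1-based LAPACK swap indices (ipiv) returned by getrf into the 0-based row permutation the operator outputs. The same conversion in standalone Python, as an illustration only (the function name and the example ipiv are not from the kernel):

def pivots_to_permutation(pivots):
    # getrf records, for each row i, the (1-based) row it was swapped with.
    # Replaying those swaps on the identity permutation yields p such that
    # row i of the factorization came from row p[i] of the original matrix.
    k = len(pivots)
    piv = [x - 1 for x in pivots]      # 1-based -> 0-based, as in the kernel loop
    perm = list(range(k))              # start from the identity permutation
    for i in range(k):                 # replay the recorded swaps in order
        perm[i], perm[piv[i]] = perm[piv[i]], perm[i]
    return perm

# For the 3x3 matrix in the Lu docstring example further down, getrf reports
# ipiv = [2, 3, 3], which this conversion maps to p = [1, 2, 0].
print(pivots_to_permutation([2, 3, 3]))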

View File

@@ -14,244 +14,68 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_MATH_LU_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_MATH_LU_GPU_KERNEL_H_
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_GPU_KERNEL_H_
#include <vector>
#include <memory>
#include <utility>
#include <map>
#include <string>
#include <algorithm>
#include <type_traits>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "include/common/utils/convert_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
template <typename T>
class LUGpuKernelMod : public NativeGpuKernelMod {
class LuGpuKernelMod : public NativeGpuKernelMod, public MatchKernelHelper<LuGpuKernelMod> {
public:
LUGpuKernelMod() : is_null_input_(false) {}
~LUGpuKernelMod() = default;
LuGpuKernelMod() { ResetResource(); }
~LuGpuKernelMod() override = default;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
const std::vector<AddressPtr> &outputs, void *cuda_stream) override {
if (is_null_input_) {
return true;
}
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cusolverDnSetStream failed");
T *batch_input_addr = GetDeviceAddress<T>(inputs, kDim0);
T *batch_output_addr = GetDeviceAddress<T>(outputs, kDim0);
int *batch_piv_output_addr = nullptr;
if (pivot_on_) {
batch_piv_output_addr = GetDeviceAddress<int>(outputs, kDim1);
}
int *batch_permutation_addr = GetDeviceAddress<int>(outputs, kDim2);
int *info_output_addr = GetDeviceAddress<int>(workspace, kDim0);
size_t *dev_transpose_shape = GetDeviceAddress<size_t>(workspace, kDim1);
size_t *dev_transpose_axis = GetDeviceAddress<size_t>(workspace, kDim2);
constexpr size_t shape_2d = 2;
size_t host_transpose_shape[shape_2d] = {m_, n_};
size_t host_transpose_axis[shape_2d] = {1, 0};
T *dev_transpose_work = GetDeviceAddress<T>(workspace, kDim3);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(dev_transpose_axis, host_transpose_axis, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"malloc input shape workspace failed");
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(batch_output_addr, batch_input_addr, batch_size_ * m_ * n_ * unit_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch.");
// 4. query working space of getrf
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, &lwork_),
"cusolver query lu work size fail");
} else if constexpr (std::is_same_v<T, double>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnDgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, &lwork_),
"cusolver query lu work size fail");
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
}
// 5. malloc device working space of getrf
d_work_ = reinterpret_cast<T *>(device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(unit_size_ * lwork_));
for (size_t batch = 0; batch < batch_size_; ++batch) {
T *output_addr = batch_output_addr + batch * m_ * n_;
int *permutation_addr = batch_permutation_addr + batch * k_ * k_;
int *piv_output_addr = batch_piv_output_addr + batch * k_;
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(dev_transpose_shape, host_transpose_shape, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"malloc input shape workspace failed");
CalTranspose(m_ * n_, output_addr, dev_transpose_shape, dev_transpose_axis, shape_2d, dev_transpose_work,
reinterpret_cast<cudaStream_t>(stream_ptr));
// 6.lu factorization according to cuSolver api, outputs have been written to input's matrix.
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSgetrf(handle_, m_, n_, dev_transpose_work, lda_, d_work_, piv_output_addr, info_output_addr),
"cusolver lu fail");
} else if constexpr (std::is_same_v<T, double>) {
// 6.lu factorization according to cuSolver api, outputs have been written to input's matrix.
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnDgetrf(handle_, m_, n_, dev_transpose_work, lda_, d_work_, piv_output_addr, info_output_addr),
"cusolver lu fail");
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
}
size_t host_wk_transpose_shape[shape_2d] = {n_, m_};
cudaMemcpyAsync(dev_transpose_shape, host_wk_transpose_shape, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(m_ * n_, dev_transpose_work, dev_transpose_shape, dev_transpose_axis, shape_2d, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
std::vector<int> host_permuted(k_, 0);
std::vector<int> host_pivots(k_, 0);
std::vector<int> host_permutation(k_ * k_, 0);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(host_pivots.data(), piv_output_addr, sizeof(int) * k_, cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy pivots to host.");
// cal pivots && permutation major by row.
for (size_t i = 0; i < k_; ++i) {
host_pivots[i] -= 1;
host_permuted[i] = i;
}
for (size_t i = 0; i < k_; ++i) {
int tmp_value = host_permuted[i];
host_permuted[i] = host_permuted[host_pivots[i]];
host_permuted[host_pivots[i]] = tmp_value;
}
// gpu default is P.A = LU, so here is col swap.
for (size_t i = 0; i < k_; ++i) {
host_permutation[host_permuted[i] * k_ + i] = 1;
}
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(permutation_addr, host_permutation.data(), sizeof(int) * k_ * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy permutation matrix.");
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(piv_output_addr, host_pivots.data(), sizeof(int) * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy pivots array.");
}
device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work_);
return true;
cuda_stream_ = cuda_stream;
return kernel_func_(this, inputs, workspace, outputs);
}
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override {
MS_EXCEPTION_IF_NULL(base_operator);
kernel_name_ = base_operator->name();
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
return true;
}
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) override {
if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
return ret;
}
batch_size_ = 1;
auto shape_signed = inputs[kIndex0]->GetShapeVector();
auto in_shape = Convert2SizeT(shape_signed);
// 2. check input shape not null
is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
if (is_null_input_) {
InitSizeLists();
return KRET_OK;
}
// 3. calculate input size
if (!InitInputSize(in_shape)) {
MS_LOG(ERROR) << "For 'PureCholeskyGpuKernel', input shape init failed.";
return KRET_RESIZE_FAILED;
}
return KRET_OK;
}
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
std::vector<KernelAttr> GetOpSupport() override {
static std::vector<KernelAttr> support_list = {
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
};
return support_list;
}
const std::vector<std::pair<KernelAttr, KernelRunFunc>> &GetFuncList() const override;
protected:
std::vector<KernelAttr> GetOpSupport() override { return OpSupport(); }
private:
bool InitInputSize(const std::vector<size_t> &in_shape) {
constexpr size_t lu_min_dim = 1;
if (in_shape.size() <= lu_min_dim) {
MS_LOG_EXCEPTION << kernel_name_ << " input shape is " << in_shape.size() << " which is invalid.";
}
constexpr size_t lu_reverse_row_dim = 2;
lu_row_ = in_shape.at(in_shape.size() - lu_reverse_row_dim);
lu_col_ = in_shape.at(in_shape.size() - 1);
batch_size_ = lu_min_dim;
for (int batch = 0; batch < static_cast<int>(in_shape.size() - lu_reverse_row_dim); ++batch) {
batch_size_ *= in_shape.at(batch);
}
// set matrix row or col to be lead dimension
m_ = SizeToInt(lu_row_);
n_ = SizeToInt(lu_col_);
k_ = std::min(lu_row_, lu_col_);
lda_ = m_;
ldb_ = n_;
InitSizeLists();
return true;
}
void ResetResource() noexcept;
void InitSizeLists() {
size_t input_size = batch_size_ * lu_row_ * lu_col_ * unit_size_;
input_size_list_.push_back(input_size);
bool CheckLuShape();
size_t output_size = batch_size_ * lu_row_ * lu_col_ * unit_size_;
template <typename T, typename S>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
size_t output_piv_size = 0;
if (pivot_on_) {
output_piv_size = batch_size_ * k_ * sizeof(int);
}
size_t output_permutation_size = batch_size_ * k_ * k_ * sizeof(int);
output_size_list_.resize(kDim3);
output_size_list_[kDim0] = output_size;
output_size_list_[kDim1] = output_piv_size;
output_size_list_[kDim2] = output_permutation_size;
template <typename T, typename S>
void LaunchKernel_CuSolve(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
// a device addr to place lu factor return code
workspace_size_list_.push_back(sizeof(int));
template <typename T, typename S>
void LaunchKernel_Cublas(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
// transpose 2d matrix scalar args workspace
constexpr size_t shape_2d = 2;
workspace_size_list_.push_back(shape_2d * sizeof(size_t));
workspace_size_list_.push_back(shape_2d * sizeof(size_t));
template <typename T>
void BufferSize(T *batch_output_addr, int *lwork);
// transpose workspace
workspace_size_list_.push_back(m_ * n_ * unit_size_);
}
size_t unit_size_{sizeof(T)};
bool is_null_input_{false};
bool pivot_on_{true};
std::vector<size_t> in_shape_;
size_t unit_size_{1};
size_t batch_size_{1};
size_t input_elements_{};
size_t lu_row_{0};
size_t lu_col_{0};
size_t k_{0};
@@ -260,12 +84,11 @@ class LUGpuKernelMod : public NativeGpuKernelMod {
size_t lda_{0};
size_t ldb_{0};
int lwork_{0};
bool pivot_on_{true};
T *d_work_{nullptr};
void *cuda_stream_{nullptr};
cusolverDnHandle_t handle_{nullptr};
bool is_null_input_;
cublasHandle_t cublas_handle_{nullptr};
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_MATH_LU_GPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_GPU_KERNEL_H_

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/math/lu_scipy_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(LU,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
LUGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(LU,
KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
LUGpuKernelMod, double)
} // namespace kernel
} // namespace mindspore

View File

@@ -0,0 +1,269 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_SCIPY_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_SCIPY_GPU_KERNEL_H_
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <map>
#include <string>
#include <algorithm>
#include <type_traits>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "include/common/utils/convert_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class LUGpuKernelMod : public NativeGpuKernelMod {
public:
LUGpuKernelMod() : is_null_input_(false) {}
~LUGpuKernelMod() = default;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (is_null_input_) {
return true;
}
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cusolverDnSetStream failed");
T *batch_input_addr = GetDeviceAddress<T>(inputs, kDim0);
T *batch_output_addr = GetDeviceAddress<T>(outputs, kDim0);
int *batch_piv_output_addr = nullptr;
if (pivot_on_) {
batch_piv_output_addr = GetDeviceAddress<int>(outputs, kDim1);
}
int *batch_permutation_addr = GetDeviceAddress<int>(outputs, kDim2);
int *info_output_addr = GetDeviceAddress<int>(workspace, kDim0);
size_t *dev_transpose_shape = GetDeviceAddress<size_t>(workspace, kDim1);
size_t *dev_transpose_axis = GetDeviceAddress<size_t>(workspace, kDim2);
constexpr size_t shape_2d = 2;
size_t host_transpose_shape[shape_2d] = {m_, n_};
size_t host_transpose_axis[shape_2d] = {1, 0};
T *dev_transpose_work = GetDeviceAddress<T>(workspace, kDim3);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(dev_transpose_axis, host_transpose_axis, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"malloc input shape workspace failed");
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(batch_output_addr, batch_input_addr, batch_size_ * m_ * n_ * unit_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch.");
// 4. query working space of getrf
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, &lwork_),
"cusolver query lu work size fail");
} else if constexpr (std::is_same_v<T, double>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnDgetrf_bufferSize(handle_, m_, n_, batch_output_addr, lda_, &lwork_),
"cusolver query lu work size fail");
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
}
// 5. malloc device working space of getrf
d_work_ = reinterpret_cast<T *>(device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(unit_size_ * lwork_));
for (size_t batch = 0; batch < batch_size_; ++batch) {
T *output_addr = batch_output_addr + batch * m_ * n_;
int *permutation_addr = batch_permutation_addr + batch * k_ * k_;
int *piv_output_addr = batch_piv_output_addr + batch * k_;
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(dev_transpose_shape, host_transpose_shape, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"malloc input shape workspace failed");
CalTranspose(m_ * n_, output_addr, dev_transpose_shape, dev_transpose_axis, shape_2d, dev_transpose_work,
reinterpret_cast<cudaStream_t>(stream_ptr));
// 6.lu factorization according to cuSolver api, outputs have been written to input's matrix.
if constexpr (std::is_same_v<T, float>) {
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSgetrf(handle_, m_, n_, dev_transpose_work, lda_, d_work_, piv_output_addr, info_output_addr),
"cusolver lu fail");
} else if constexpr (std::is_same_v<T, double>) {
// 6.lu factorization according to cuSolver api, outputs have been written to input's matrix.
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnDgetrf(handle_, m_, n_, dev_transpose_work, lda_, d_work_, piv_output_addr, info_output_addr),
"cusolver lu fail");
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
}
size_t host_wk_transpose_shape[shape_2d] = {n_, m_};
cudaMemcpyAsync(dev_transpose_shape, host_wk_transpose_shape, shape_2d * sizeof(size_t), cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(m_ * n_, dev_transpose_work, dev_transpose_shape, dev_transpose_axis, shape_2d, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
std::vector<int> host_permuted(k_, 0);
std::vector<int> host_pivots(k_, 0);
std::vector<int> host_permutation(k_ * k_, 0);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(host_pivots.data(), piv_output_addr, sizeof(int) * k_, cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy pivots to host.");
// cal pivots && permutation major by row.
for (size_t i = 0; i < k_; ++i) {
host_pivots[i] -= 1;
host_permuted[i] = i;
}
for (size_t i = 0; i < k_; ++i) {
int tmp_value = host_permuted[i];
host_permuted[i] = host_permuted[host_pivots[i]];
host_permuted[host_pivots[i]] = tmp_value;
}
// gpu default is P.A = LU, so here is col swap.
for (size_t i = 0; i < k_; ++i) {
host_permutation[host_permuted[i] * k_ + i] = 1;
}
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(permutation_addr, host_permutation.data(), sizeof(int) * k_ * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy permutation matrix.");
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(piv_output_addr, host_pivots.data(), sizeof(int) * k_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in LUGpuKernelMod::Launch copy pivots array.");
}
device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work_);
return true;
}
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override {
MS_EXCEPTION_IF_NULL(base_operator);
kernel_name_ = base_operator->name();
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
return true;
}
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) override {
if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
return ret;
}
batch_size_ = 1;
auto shape_signed = inputs[kIndex0]->GetShapeVector();
auto in_shape = Convert2SizeT(shape_signed);
// 2. check input shape not null
is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
if (is_null_input_) {
InitSizeLists();
return KRET_OK;
}
// 3. calculate input size
if (!InitInputSize(in_shape)) {
MS_LOG(ERROR) << "For 'PureCholeskyGpuKernel', input shape init failed.";
return KRET_RESIZE_FAILED;
}
return KRET_OK;
}
std::vector<KernelAttr> GetOpSupport() override {
static std::vector<KernelAttr> support_list = {
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
};
return support_list;
}
private:
bool InitInputSize(const std::vector<size_t> &in_shape) {
constexpr size_t lu_min_dim = 1;
if (in_shape.size() <= lu_min_dim) {
MS_LOG_EXCEPTION << kernel_name_ << " input shape is " << in_shape.size() << " which is invalid.";
}
constexpr size_t lu_reverse_row_dim = 2;
lu_row_ = in_shape.at(in_shape.size() - lu_reverse_row_dim);
lu_col_ = in_shape.at(in_shape.size() - 1);
batch_size_ = lu_min_dim;
for (int batch = 0; batch < static_cast<int>(in_shape.size() - lu_reverse_row_dim); ++batch) {
batch_size_ *= in_shape.at(batch);
}
// set matrix row or col to be lead dimension
m_ = SizeToInt(lu_row_);
n_ = SizeToInt(lu_col_);
k_ = std::min(lu_row_, lu_col_);
lda_ = m_;
ldb_ = n_;
InitSizeLists();
return true;
}
void InitSizeLists() {
size_t input_size = batch_size_ * lu_row_ * lu_col_ * unit_size_;
input_size_list_.push_back(input_size);
size_t output_size = batch_size_ * lu_row_ * lu_col_ * unit_size_;
size_t output_piv_size = 0;
if (pivot_on_) {
output_piv_size = batch_size_ * k_ * sizeof(int);
}
size_t output_permutation_size = batch_size_ * k_ * k_ * sizeof(int);
output_size_list_.resize(kDim3);
output_size_list_[kDim0] = output_size;
output_size_list_[kDim1] = output_piv_size;
output_size_list_[kDim2] = output_permutation_size;
// a device addr to place lu factor return code
workspace_size_list_.push_back(sizeof(int));
// transpose 2d matrix scalar args workspace
constexpr size_t shape_2d = 2;
workspace_size_list_.push_back(shape_2d * sizeof(size_t));
workspace_size_list_.push_back(shape_2d * sizeof(size_t));
// transpose workspace
workspace_size_list_.push_back(m_ * n_ * unit_size_);
}
size_t unit_size_{sizeof(T)};
size_t batch_size_{1};
size_t lu_row_{0};
size_t lu_col_{0};
size_t k_{0};
size_t m_{0};
size_t n_{0};
size_t lda_{0};
size_t ldb_{0};
int lwork_{0};
bool pivot_on_{true};
T *d_work_{nullptr};
cusolverDnHandle_t handle_{nullptr};
bool is_null_input_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_SCIPY_GPU_KERNEL_H_

View File

@@ -1245,6 +1245,7 @@ GVAR_DEF(PrimitivePtr, kPrimGer, std::make_shared<Primitive>("Ger"));
GVAR_DEF(PrimitivePtr, kPrimCeil, std::make_shared<Primitive>("Ceil"));
GVAR_DEF(PrimitivePtr, kPrimDiagonal, std::make_shared<Primitive>(kDiagonal));
GVAR_DEF(PrimitivePtr, kPrimTrunc, std::make_shared<Primitive>("Trunc"));
GVAR_DEF(PrimitivePtr, kPrimLu, std::make_shared<Primitive>("Lu"));
GVAR_DEF(PrimitivePtr, kPrimLuSolve, std::make_shared<Primitive>("LuSolve"));
GVAR_DEF(PrimitivePtr, kPrimMatrixSolve, std::make_shared<Primitive>("MatrixSolve"));
GVAR_DEF(PrimitivePtr, kPrimTridiagonalSolve, std::make_shared<Primitive>(kTridiagonalSolve));

View File

@@ -15,68 +15,67 @@
*/
#include "ops/lu.h"
#include <algorithm>
#include "ops/op_utils.h"
#include "mindapi/ir/type.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
#include "common/graph_kernel/core/graph_kernel_utils.h"
namespace mindspore {
namespace ops {
namespace {
constexpr size_t kLUInputsNum = 1;
constexpr size_t kXDim = 2;
constexpr size_t kLastDim = 1;
constexpr size_t kPenultimateDim = 2;
abstract::TupleShapePtr LUInferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
abstract::TupleShapePtr LuInferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
auto x_shape_map = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape());
auto x_shape = x_shape_map[kShape];
auto x_output = std::make_shared<abstract::Shape>(x_shape);
if (IsDynamicRank(x_shape)) {
return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{x_output, x_output, x_output});
constexpr int64_t number1 = 1;
constexpr int64_t number2 = 2;
const int64_t input_num = 1;
const int64_t rank = 2;
(void)CheckAndConvertUtils::CheckInteger("input numbers", SizeToLong(input_args.size()), kGreaterEqual, input_num,
prim_name);
auto input_shape_ptr = CheckAndConvertUtils::GetTensorInputShape(prim_name, input_args, kInputIndex0);
auto input_shape = input_shape_ptr->shape();
if (IsDynamicRank(input_shape)) {
abstract::ShapePtr rank_shape = std::make_shared<abstract::Shape>(ShapeVector({-2}));
return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{rank_shape, rank_shape});
}
size_t x_shape_size = x_shape.size();
if (x_shape_size < kXDim) {
MS_EXCEPTION(ValueError) << "For '" << prim_name << "',"
<< " the dimension of hashmap must be greater than or equal to 2, but got: "
<< x_shape_size << ".";
std::vector<int64_t> p_shape(input_shape.begin(), (input_shape.end() - number1));
abstract::ShapePtr p_shape_ptr = std::make_shared<abstract::Shape>(p_shape);
auto input_rank = SizeToLong(input_shape.size());
CheckAndConvertUtils::CheckInteger("input rank", input_rank, kGreaterEqual, rank, prim_name);
int64_t size1 = input_shape[input_shape.size() - number1];
int64_t size2 = input_shape[input_shape.size() - number2];
if (size1 != size2) {
MS_EXCEPTION(ValueError) << "For '" << primitive->name()
<< "', input_shape[-1] and input_shape[-2] must be same, but got " << size1 << " vs "
<< size2;
}
auto k_shape = std::min(x_shape[x_shape_size - kLastDim], x_shape[x_shape_size - kPenultimateDim]);
ShapeVector top_k_shape(x_shape.begin(), x_shape.end() - kPenultimateDim);
ShapeVector pivots_shape = top_k_shape;
pivots_shape.push_back(k_shape);
ShapeVector permutation_shape = pivots_shape;
permutation_shape.push_back(k_shape);
auto pivots_output = std::make_shared<abstract::Shape>(pivots_shape);
auto permutation_output = std::make_shared<abstract::Shape>(permutation_shape);
return std::make_shared<abstract::TupleShape>(
std::vector<abstract::BaseShapePtr>{x_output, pivots_output, permutation_output});
return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{input_shape_ptr, p_shape_ptr});
}
TuplePtr LUInferType(const PrimitivePtr &prim, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(prim);
auto x_type = input_args[0]->BuildType();
return std::make_shared<Tuple>(std::vector<TypePtr>{x_type, kInt32, kInt32});
TypePtr LuInferType(const PrimitivePtr &prim, const std::vector<AbstractBasePtr> &input_args) {
for (const auto &item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
const std::set<TypePtr> lu_types = {kFloat32, kFloat64, kComplex64, kComplex128};
auto input_type = input_args[kInputIndex0]->BuildType();
(void)CheckAndConvertUtils::CheckTensorTypeValid("input type", input_type, lu_types, prim->name());
const std::set<TypePtr> out_valid_types = {kInt32, kInt64};
ValuePtr out_type_value = prim->GetAttr("output_idx_type");
TypePtr type = dyn_cast<Type>(out_type_value);
(void)CheckAndConvertUtils::CheckTypeValid("p type", type, out_valid_types, prim->name());
return std::make_shared<Tuple>(std::vector<TypePtr>{input_type, type});
}
} // namespace
AbstractBasePtr LUInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
MIND_API_OPERATOR_IMPL(Lu, BaseOperator);
AbstractBasePtr LuInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, kLUInputsNum, primitive->name());
auto infer_type = LUInferType(primitive, input_args);
auto infer_shape = LUInferShape(primitive, input_args);
auto infer_type = LuInferType(primitive, input_args);
auto infer_shape = LuInferShape(primitive, input_args);
return abstract::MakeAbstract(infer_shape, infer_type);
}
MIND_API_OPERATOR_IMPL(LU, BaseOperator);
REGISTER_PRIMITIVE_EVAL_IMPL(LU, prim::kPrimLU, LUInfer, nullptr, true);
REGISTER_PRIMITIVE_EVAL_IMPL(Lu, prim::kPrimLu, LuInfer, nullptr, true);
} // namespace ops
} // namespace mindspore
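
The shape rule encoded by LuInferShape above is compact enough to restate directly; a Python sketch of just that rule (not the C++ API), under the assumption that a dynamic rank is represented as the shape [-2]:

def lu_infer_shape(input_shape):
    # lu keeps the input shape [..., M, M]; p drops the last axis -> [..., M].
    if input_shape == [-2]:              # dynamic rank: both outputs stay dynamic-rank
        return [-2], [-2]
    if len(input_shape) < 2:
        raise ValueError("Lu expects an input of rank >= 2, got rank %d" % len(input_shape))
    if input_shape[-1] != input_shape[-2]:
        raise ValueError("Lu expects square matrices, got shape %s" % (input_shape,))
    return list(input_shape), list(input_shape[:-1])

print(lu_infer_shape([4, 3, 3]))  # ([4, 3, 3], [4, 3])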

View File

@@ -18,25 +18,24 @@
#define MINDSPORE_CORE_OPS_LU_H_
#include <map>
#include <vector>
#include <set>
#include <string>
#include <memory>
#include <vector>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"
namespace mindspore {
namespace ops {
constexpr auto kNameLU = "LU";
class MIND_API LU : public BaseOperator {
constexpr auto kNameLu = "Lu";
class MIND_API Lu : public BaseOperator {
public:
MIND_API_BASE_MEMBER(LU);
LU() : BaseOperator(kNameLU) { InitIOName({"x"}, {"lu", "pivots", "permutation"}); }
MIND_API_BASE_MEMBER(Lu);
Lu() : BaseOperator(kNameLu) { InitIOName({"input"}, {"lu", "p"}); }
};
abstract::AbstractBasePtr LUInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
abstract::AbstractBasePtr LuInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<abstract::AbstractBasePtr> &input_args);
using PrimLUPtr = std::shared_ptr<LU>;
using PrimLuPtr = std::shared_ptr<Lu>;
} // namespace ops
} // namespace mindspore
#endif // MINDSPORE_CORE_OPS_LU_H_

View File

@@ -0,0 +1,82 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ops/lu_scipy.h"
#include <algorithm>
#include "ops/op_utils.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
#include "common/graph_kernel/core/graph_kernel_utils.h"
namespace mindspore {
namespace ops {
namespace {
constexpr size_t kLUInputsNum = 1;
constexpr size_t kXDim = 2;
constexpr size_t kLastDim = 1;
constexpr size_t kPenultimateDim = 2;
abstract::TupleShapePtr LUInferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
auto x_shape_map = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape());
auto x_shape = x_shape_map[kShape];
auto x_output = std::make_shared<abstract::Shape>(x_shape);
if (IsDynamicRank(x_shape)) {
return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{x_output, x_output, x_output});
}
size_t x_shape_size = x_shape.size();
if (x_shape_size < kXDim) {
MS_EXCEPTION(ValueError) << "For '" << prim_name << "',"
<< " the dimension of hashmap must be greater than or equal to 2, but got: "
<< x_shape_size << ".";
}
auto k_shape = std::min(x_shape[x_shape_size - kLastDim], x_shape[x_shape_size - kPenultimateDim]);
ShapeVector top_k_shape(x_shape.begin(), x_shape.end() - kPenultimateDim);
ShapeVector pivots_shape = top_k_shape;
pivots_shape.push_back(k_shape);
ShapeVector permutation_shape = pivots_shape;
permutation_shape.push_back(k_shape);
auto pivots_output = std::make_shared<abstract::Shape>(pivots_shape);
auto permutation_output = std::make_shared<abstract::Shape>(permutation_shape);
return std::make_shared<abstract::TupleShape>(
std::vector<abstract::BaseShapePtr>{x_output, pivots_output, permutation_output});
}
TuplePtr LUInferType(const PrimitivePtr &prim, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(prim);
auto x_type = input_args[0]->BuildType();
return std::make_shared<Tuple>(std::vector<TypePtr>{x_type, kInt32, kInt32});
}
} // namespace
AbstractBasePtr LUInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, kLUInputsNum, primitive->name());
auto infer_type = LUInferType(primitive, input_args);
auto infer_shape = LUInferShape(primitive, input_args);
return abstract::MakeAbstract(infer_shape, infer_type);
}
MIND_API_OPERATOR_IMPL(LU, BaseOperator);
REGISTER_PRIMITIVE_EVAL_IMPL(LU, prim::kPrimLU, LUInfer, nullptr, true);
} // namespace ops
} // namespace mindspore

View File

@@ -0,0 +1,42 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CORE_OPS_LU_SCIPY_H_
#define MINDSPORE_CORE_OPS_LU_SCIPY_H_
#include <map>
#include <vector>
#include <set>
#include <string>
#include <memory>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"
namespace mindspore {
namespace ops {
constexpr auto kNameLU = "LU";
class MIND_API LU : public BaseOperator {
public:
MIND_API_BASE_MEMBER(LU);
LU() : BaseOperator(kNameLU) { InitIOName({"x"}, {"lu", "pivots", "permutation"}); }
};
abstract::AbstractBasePtr LUInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<abstract::AbstractBasePtr> &input_args);
using PrimLUPtr = std::shared_ptr<LU>;
} // namespace ops
} // namespace mindspore
#endif // MINDSPORE_CORE_OPS_LU_SCIPY_H_

View File

@@ -6286,6 +6286,52 @@ class MatrixSolveLs(Primitive):
validator.check_value_type('fast', fast, [bool], self.name)
class Lu(Primitive):
"""
Computes the LU decomposition of one or more square matrices.
Args:
output_idx_type (:class:`mindspore.dtype`): An optional data type of `mindspore.dtype.int32` or
`mindspore.dtype.int64`. Default: `mindspore.dtype.int32`.
Inputs:
- **input** (Tensor) - A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
matrices of size `[M, M]`, with data type float32, float64, complex64, complex128.
Outputs:
- **lu** (Tensor) - A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the lower
triangular factor `L` with unit diagonal. Upper triangular part denotes the upper triangular factor `U`.
- **p** (Tensor) - Permutation of the rows encoded as a list of indices in `0..M-1`, shape is `[..., M]`.
Raises:
TypeError: If the dtype of `input` is not one of the following dtype:
float32, float64, complex64, complex128.
TypeError: If `output_idx_type` is neither int32 nor int64.
ValueError: If `input` rank is less than 2.
ValueError: If `input.shape[-1]` is not equal to `input.shape[-2]`.
Supported Platforms:
``GPU``
Examples:
>>> input = Tensor(np.array([[2.5,3.1,3.5], [4.7,1.9,0.2], [1.1,3.6,2.0]]), mindspore.float32)
>>> lu, p = ops.Lu(output_idx_type=mindspore.int32)(input)
>>> print(lu)
[[4.7 1.9 0.2 ]
[0.23404257 3.155319 1.9531915 ]
[0.5319149 0.6621713 2.1002696 ]]
>>> print(p)
[1 2 0]
"""
@prim_attr_register
def __init__(self, output_idx_type):
super().__init__(name="Lu")
self.init_prim_io_names(inputs=['input'], outputs=['lu', 'p'])
validator.check_type_name("output_idx_type", output_idx_type, [mstype.int32, mstype.int64], self.name)
self.add_prim_attr('output_idx_type', output_idx_type)
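
Reading the two outputs together: row i of the packed factorization corresponds to row p[i] of the original matrix, so the result can be checked numerically as below. This is a sketch based on the docstring above (it assumes a GPU context is already set; unpacking into L and U uses plain NumPy, not a MindSpore helper):

import numpy as np
import mindspore
from mindspore import Tensor, ops

a_np = np.array([[2.5, 3.1, 3.5], [4.7, 1.9, 0.2], [1.1, 3.6, 2.0]], dtype=np.float32)
lu, p = ops.Lu(output_idx_type=mindspore.int32)(Tensor(a_np))
lu_np, p_np = lu.asnumpy(), p.asnumpy()
l = np.tril(lu_np, -1) + np.eye(3, dtype=np.float32)  # unit lower-triangular factor
u = np.triu(lu_np)                                     # upper-triangular factor
# Row i of L @ U equals row p[i] of the original matrix.
assert np.allclose(l @ u, a_np[p_np], atol=1e-4)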
class LuSolve(Primitive):
r"""
Return the solution of the linear equation :math:`Ax = b` .

View File

@@ -1,4 +1,4 @@
# Copyright 2021 Huawei Technologies Co., Ltd
# Copyright 2021-2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,83 +12,61 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
from typing import Generic
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
import mindspore.numpy as mnp
import mindspore.common.dtype as mstype
from mindspore.ops import PrimitiveWithInfer
from mindspore.ops import prim_attr_register
import scipy as scp
import numpy as np
import pytest
context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
class LU(PrimitiveWithInfer):
"""
LU decomposition with partial pivoting
P.A = L.U
"""
@prim_attr_register
def __init__(self):
super().__init__(name="LU")
self.init_prim_io_names(inputs=['x'], outputs=['lu', 'pivots', 'permutation'])
def __infer__(self, x):
x_shape = list(x['shape'])
x_dtype = x['dtype']
pivots_shape = []
permutation_shape = []
ndim = len(x_shape)
if ndim == 0:
pivots_shape = x_shape
permutation_shape = x_shape
elif ndim == 1:
pivots_shape = x_shape[:-1]
# permutation_shape = x_shape[:-1]
else:
pivots_shape = x_shape[-2:-1]
# permutation_shape = x_shape[-2:-1]
output = {
'shape': (x_shape, pivots_shape, permutation_shape),
'dtype': (x_dtype, mstype.int32, mstype.int32),
'value': None
}
return output
import scipy as scp
import mindspore.nn as nn
import mindspore.context as context
import mindspore.common.dtype as mstype
from mindspore import Tensor
from mindspore.ops.operations import math_ops as P
class LuNet(nn.Cell):
def __init__(self):
def __init__(self, output_idx_type=mstype.int32):
super(LuNet, self).__init__()
self.lu = LU()
self.lu = P.Lu(output_idx_type=output_idx_type)
def construct(self, a):
return self.lu(a)
@pytest.mark.platform_x86_gpu
@pytest.mark.parametrize('n', [10, 20])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_lu_net(n: int, dtype: Generic):
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_dtype_float32():
"""
Feature: ALL To ALL
Description: test cases for lu decomposition test cases for A[N,N]x = b[N,1]
Expectation: the result match to scipy
Feature: Lu gpu TEST.
Description: float32 test case for Lu.
Expectation: the result matches scipy.
"""
a = (np.random.random((n, n)) + np.eye(n)).astype(dtype)
expect, _ = scp.linalg.lu_factor(a)
mscp_lu_net = LuNet()
# mindspore tensor is row major but gpu cusolver is col major, so we should transpose it.
tensor_a = Tensor(a)
tensor_a = mnp.transpose(tensor_a)
output, _, _ = mscp_lu_net(tensor_a)
# mindspore tensor is row major but gpu cusolver is col major, so we should transpose it.
output = mnp.transpose(output)
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.array([[2.5, 3.1, 3.5], [4.7, 1.9, 0.2], [1.1, 3.6, 2.0]])
expect, _ = scp.linalg.lu_factor(x_np)
input_x = Tensor(x_np.astype(np.float32))
net = LuNet(mstype.int32)
lu, _ = net(input_x)
rtol = 1.e-4
atol = 1.e-4
assert np.allclose(expect, lu.asnumpy(), rtol=rtol, atol=atol)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_dtype_float64():
"""
Feature: Lu gpu TEST.
Description: float64 test case for Lu.
Expectation: the result matches scipy.
"""
context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
x_np = np.array([[3.5, 6.5, 3.1], [4.7, 1.9, 6.2], [1.5, 4.8, 2.3]])
expect, _ = scp.linalg.lu_factor(x_np)
input_x = Tensor(x_np.astype(np.float64))
net = LuNet(mstype.int64)
lu, _ = net(input_x)
rtol = 1.e-5
atol = 1.e-5
assert np.allclose(expect, output.asnumpy(), rtol=rtol, atol=atol)
assert np.allclose(expect, lu.asnumpy(), rtol=rtol, atol=atol)
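
The kernel and the infer functions above also register complex64/complex128 variants that these tests do not exercise; a test along the same lines might look like the sketch below (not part of this PR, and the fixed complex matrix is only illustrative):

@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_dtype_complex64():
    """
    Feature: Lu gpu TEST.
    Description: complex64 test case for Lu (sketch).
    Expectation: the result matches scipy.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    x_np = np.array([[2.5 + 1.0j, 3.1, 3.5], [4.7, 1.9 - 0.5j, 0.2], [1.1, 3.6, 2.0 + 2.0j]]).astype(np.complex64)
    expect, _ = scp.linalg.lu_factor(x_np)
    net = LuNet(mstype.int32)
    lu, _ = net(Tensor(x_np))
    assert np.allclose(expect, lu.asnumpy(), rtol=1e-4, atol=1e-4)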