[feat][assistant][I5EWLH] add new gpu operator LuUnpackGrad

2022-12-01 21:46:11 +08:00 · 2022-12-01 21:46:11 +08:00 · 1caf3bbbfc
parent 5d292df0bb
commit 1caf3bbbfc
13 changed files with 1822 additions and 7 deletions
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cu
@ -0,0 +1,284 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "lu_unpack_grad_impl.cuh"
+
+template <typename T>
+__global__ void TrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
+                                const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
+                                const int64_t lu_data_width) {
+  for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t matrix_size = lu_data_height * lu_data_width;
+    int64_t idx_height = pos % matrix_size / lu_data_width;
+    int64_t idx_width = pos % matrix_size % lu_data_width;
+    if (idx_width >= idx_height) {
+      l_grad_output[pos] = 0;
+    } else {
+      int64_t in_pos = ((pos / matrix_size) * matrix_L_height + idx_height) * matrix_L_width + idx_width;
+      l_grad_output[pos] = l_grad_input[in_pos];
+    }
+  }
+  return;
+}
+
+template <typename T>
+__global__ void TrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
+                          const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
+                          const int64_t lu_data_width) {
+  for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t matrix_size = lu_data_height * lu_data_width;
+    int64_t idx_height = pos % matrix_size / lu_data_width;
+    int64_t idx_width = pos % matrix_size % lu_data_width;
+    if (idx_width >= idx_height) {
+      l_grad_output[pos] = 0;
+    } else {
+      l_grad_output[pos] = l_grad_input[pos];
+    }
+  }
+  return;
+}
+
+template <typename T>
+__global__ void TriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
+                                 const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
+                                 const int64_t lu_data_width) {
+  for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t matrix_size = lu_data_height * lu_data_width;
+    int64_t idx_height = pos % matrix_size / lu_data_width;
+    int64_t idx_width = pos % matrix_size % lu_data_width;
+    if (idx_width >= idx_height) {
+      int64_t in_pos = ((pos / matrix_size) * matrix_U_height + idx_height) * matrix_U_width + idx_width;
+      u_grad_output[pos] = u_grad_input[in_pos];
+    } else {
+      u_grad_output[pos] = 0;
+    }
+  }
+  return;
+}
+
+template <typename T>
+__global__ void TriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
+                          const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
+                          const int64_t lu_data_width) {
+  for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t matrix_size = lu_data_height * lu_data_width;
+    int64_t idx_height = pos % matrix_size / lu_data_width;
+    int64_t idx_width = pos % matrix_size % lu_data_width;
+    if (idx_height <= idx_width) {
+      u_grad_output[pos] = u_grad_input[pos];
+    } else {
+      u_grad_output[pos] = 0;
+    }
+  }
+  return;
+}
+
+template <typename T>
+void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
+                        const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
+                        const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TrilExpendWidth<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
+  return;
+}
+
+template <typename T>
+void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height, const int64_t matrix_L_width,
+                  T *l_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
+                  const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TrilLower<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
+  return;
+}
+
+template <typename T>
+void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
+                         const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
+                         const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TriuExpendHeight<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
+  return;
+}
+
+template <typename T>
+void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height, const int64_t matrix_U_width,
+                  T *u_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
+                  const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TriuUpper<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
+  return;
+}
+
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<double>(const int64_t size, double *l_grad_input,
+                                                         const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                         double *l_grad_output, const int64_t lu_data_height,
+                                                         const int64_t lu_data_width, const uint32_t &device_id,
+                                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<float>(const int64_t size, float *l_grad_input,
+                                                        const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                        float *l_grad_output, const int64_t lu_data_height,
+                                                        const int64_t lu_data_width, const uint32_t &device_id,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<half>(const int64_t size, half *l_grad_input,
+                                                       const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                       half *l_grad_output, const int64_t lu_data_height,
+                                                       const int64_t lu_data_width, const uint32_t &device_id,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<int64_t>(const int64_t size, int64_t *l_grad_input,
+                                                          const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                          int64_t *l_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<int32_t>(const int64_t size, int32_t *l_grad_input,
+                                                          const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                          int32_t *l_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<int16_t>(const int64_t size, int16_t *l_grad_input,
+                                                          const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                          int16_t *l_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<int8_t>(const int64_t size, int8_t *l_grad_input,
+                                                         const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                         int8_t *l_grad_output, const int64_t lu_data_height,
+                                                         const int64_t lu_data_width, const uint32_t &device_id,
+                                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilExpendWidth<uint8_t>(const int64_t size, uint8_t *l_grad_input,
+                                                          const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                          uint8_t *l_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalTrilLower<double>(const int64_t size, double *l_grad_input,
+                                                   const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                   double *l_grad_output, const int64_t lu_data_height,
+                                                   const int64_t lu_data_width, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<float>(const int64_t size, float *l_grad_input,
+                                                  const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                  float *l_grad_output, const int64_t lu_data_height,
+                                                  const int64_t lu_data_width, const uint32_t &device_id,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<half>(const int64_t size, half *l_grad_input, const int64_t matrix_L_height,
+                                                 const int64_t matrix_L_width, half *l_grad_output,
+                                                 const int64_t lu_data_height, const int64_t lu_data_width,
+                                                 const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<int64_t>(const int64_t size, int64_t *l_grad_input,
+                                                    const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                    int64_t *l_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<int32_t>(const int64_t size, int32_t *l_grad_input,
+                                                    const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                    int32_t *l_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<int16_t>(const int64_t size, int16_t *l_grad_input,
+                                                    const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                    int16_t *l_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<int8_t>(const int64_t size, int8_t *l_grad_input,
+                                                   const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                   int8_t *l_grad_output, const int64_t lu_data_height,
+                                                   const int64_t lu_data_width, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilLower<uint8_t>(const int64_t size, uint8_t *l_grad_input,
+                                                    const int64_t matrix_L_height, const int64_t matrix_L_width,
+                                                    uint8_t *l_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<double>(const int64_t size, double *u_grad_input,
+                                                          const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                          double *u_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<float>(const int64_t size, float *u_grad_input,
+                                                         const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                         float *u_grad_output, const int64_t lu_data_height,
+                                                         const int64_t lu_data_width, const uint32_t &device_id,
+                                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<half>(const int64_t size, half *u_grad_input,
+                                                        const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                        half *u_grad_output, const int64_t lu_data_height,
+                                                        const int64_t lu_data_width, const uint32_t &device_id,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<int64_t>(const int64_t size, int64_t *u_grad_input,
+                                                           const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                           int64_t *u_grad_output, const int64_t lu_data_height,
+                                                           const int64_t lu_data_width, const uint32_t &device_id,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<int32_t>(const int64_t size, int32_t *u_grad_input,
+                                                           const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                           int32_t *u_grad_output, const int64_t lu_data_height,
+                                                           const int64_t lu_data_width, const uint32_t &device_id,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<int16_t>(const int64_t size, int16_t *u_grad_input,
+                                                           const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                           int16_t *u_grad_output, const int64_t lu_data_height,
+                                                           const int64_t lu_data_width, const uint32_t &device_id,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<int8_t>(const int64_t size, int8_t *u_grad_input,
+                                                          const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                          int8_t *u_grad_output, const int64_t lu_data_height,
+                                                          const int64_t lu_data_width, const uint32_t &device_id,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuExpendHeight<uint8_t>(const int64_t size, uint8_t *u_grad_input,
+                                                           const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                           uint8_t *u_grad_output, const int64_t lu_data_height,
+                                                           const int64_t lu_data_width, const uint32_t &device_id,
+                                                           cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalTriuUpper<double>(const int64_t size, double *u_grad_input,
+                                                   const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                   double *u_grad_output, const int64_t lu_data_height,
+                                                   const int64_t lu_data_width, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<float>(const int64_t size, float *u_grad_input,
+                                                  const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                  float *u_grad_output, const int64_t lu_data_height,
+                                                  const int64_t lu_data_width, const uint32_t &device_id,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<half>(const int64_t size, half *u_grad_input, const int64_t matrix_U_height,
+                                                 const int64_t matrix_U_width, half *u_grad_output,
+                                                 const int64_t lu_data_height, const int64_t lu_data_width,
+                                                 const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<int64_t>(const int64_t size, int64_t *u_grad_input,
+                                                    const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                    int64_t *u_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<int32_t>(const int64_t size, int32_t *u_grad_input,
+                                                    const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                    int32_t *u_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<int16_t>(const int64_t size, int16_t *u_grad_input,
+                                                    const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                    int16_t *u_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<int8_t>(const int64_t size, int8_t *u_grad_input,
+                                                   const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                   int8_t *u_grad_output, const int64_t lu_data_height,
+                                                   const int64_t lu_data_width, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuUpper<uint8_t>(const int64_t size, uint8_t *u_grad_input,
+                                                    const int64_t matrix_U_height, const int64_t matrix_U_width,
+                                                    uint8_t *u_grad_output, const int64_t lu_data_height,
+                                                    const int64_t lu_data_width, const uint32_t &device_id,
+                                                    cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cuh
@ -0,0 +1,42 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
+                                        const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
+                                        const int64_t lu_data_width, const uint32_t &device_id,
+                                        cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
+                                  const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
+                                  const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
+                                         const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
+                                         const int64_t lu_data_width, const uint32_t &device_id,
+                                         cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
+                                  const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
+                                  const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cu
@ -0,0 +1,369 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
+
+template <typename S>
+__global__ void InitOrder(const size_t p_size, const int64_t lu_data_dim1, S *final_order) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
+    final_order[pos] = pos % lu_data_dim1;
+  }
+}
+
+template <typename S>
+__global__ void SwapOrder(const size_t p_size, const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                          S *lu_pivots_ptr, S *final_order) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size; pos += gridDim.x * blockDim.x) {
+    S *lu_pivots_working_ptr = lu_pivots_ptr + pos * lu_pivots_dim;
+    for (S idx = 0; idx < lu_pivots_dim; ++idx) {
+      S perm_pivots_id = lu_pivots_working_ptr[idx] - 1;
+      S tmp = final_order[pos * lu_data_dim1 + idx];
+      final_order[pos * lu_data_dim1 + idx] = final_order[pos * lu_data_dim1 + perm_pivots_id];
+      final_order[pos * lu_data_dim1 + perm_pivots_id] = tmp;
+    }
+  }
+}
+
+template <typename T, typename S>
+__global__ void AssignEyeValue(const size_t p_size, const size_t lu_data_dim1, S *final_order, T *pivots_ptr) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
+    size_t row_start = (pos / lu_data_dim1 * lu_data_dim1 + final_order[pos]) * lu_data_dim1;
+    size_t eye_idx = pos % lu_data_dim1;
+    *(pivots_ptr + row_start + eye_idx) = static_cast<T>(1);
+  }
+  return;
+}
+
+template <typename T, typename S>
+void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
+                        const int64_t lu_data_dim1, const int64_t lu_pivots_dim, const uint32_t &device_id,
+                        cudaStream_t cuda_stream) {
+  cudaMemset(pivots_ptr, 0, batch_num * lu_data_dim1 * lu_data_dim1 * sizeof(T));
+  InitOrder<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    batch_num, lu_data_dim1, final_order);
+  SwapOrder<<<CUDA_BLOCKS(device_id, batch_num), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    batch_num, lu_data_dim1, lu_pivots_dim, lu_pivots_ptr, final_order);
+  AssignEyeValue<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+    batch_num, lu_data_dim1, final_order, pivots_ptr);
+  return;
+}
+
+template <typename T>
+__global__ void TriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
+                        const int64_t lu_row, const int64_t lu_col, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t out_matrix_size = out_row * out_col;
+    int64_t in_matrix_size = lu_row * lu_col;
+    int64_t idx_row = pos % out_matrix_size / out_col;
+    int64_t idx_col = pos % out_matrix_size % out_col;
+
+    if (idx_row <= idx_col) {
+      int64_t batch_size = pos / out_matrix_size;
+      int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
+      output[pos] = input[in_pos];
+    } else {
+      output[pos] = static_cast<T>(0.0);
+    }
+  }
+  return;
+}
+
+template <typename T>
+void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
+                const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TriuAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
+                                                                                     lu_row, lu_col, output);
+  return;
+}
+
+template <typename T>
+__global__ void TrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
+                        const int64_t lu_row, const int64_t lu_col, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int64_t out_matrix_size = out_row * out_col;
+    int64_t in_matrix_size = lu_row * lu_col;
+    int64_t idx_row = pos % out_matrix_size / out_col;
+    int64_t idx_col = pos % out_matrix_size % out_col;
+
+    if (idx_row == idx_col) {
+      output[pos] = static_cast<T>(1.0);
+    } else if (idx_row > idx_col) {
+      int64_t batch_size = pos / out_matrix_size;
+      int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
+      output[pos] = input[in_pos];
+    } else {
+      output[pos] = static_cast<T>(0.0);
+    }
+  }
+  return;
+}
+
+template <typename T>
+void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
+                const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
+  TrilAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
+                                                                                     lu_row, lu_col, output);
+  return;
+}
+
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int64_t>(int64_t *lu_pivots_ptr, double *pivots_ptr,
+                                                                  int64_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int32_t>(int32_t *lu_pivots_ptr, double *pivots_ptr,
+                                                                  int32_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int16_t>(int16_t *lu_pivots_ptr, double *pivots_ptr,
+                                                                  int16_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int8_t>(int8_t *lu_pivots_ptr, double *pivots_ptr,
+                                                                 int8_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, uint8_t>(uint8_t *lu_pivots_ptr, double *pivots_ptr,
+                                                                  uint8_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int64_t>(int64_t *lu_pivots_ptr, float *pivots_ptr,
+                                                                 int64_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int32_t>(int32_t *lu_pivots_ptr, float *pivots_ptr,
+                                                                 int32_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int16_t>(int16_t *lu_pivots_ptr, float *pivots_ptr,
+                                                                 int16_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int8_t>(int8_t *lu_pivots_ptr, float *pivots_ptr,
+                                                                int8_t *final_order, const int64_t batch_num,
+                                                                const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                                const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, uint8_t>(uint8_t *lu_pivots_ptr, float *pivots_ptr,
+                                                                 uint8_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int64_t>(int64_t *lu_pivots_ptr, half *pivots_ptr,
+                                                                int64_t *final_order, const int64_t batch_num,
+                                                                const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                                const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int32_t>(int32_t *lu_pivots_ptr, half *pivots_ptr,
+                                                                int32_t *final_order, const int64_t batch_num,
+                                                                const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                                const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int16_t>(int16_t *lu_pivots_ptr, half *pivots_ptr,
+                                                                int16_t *final_order, const int64_t batch_num,
+                                                                const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                                const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int8_t>(int8_t *lu_pivots_ptr, half *pivots_ptr,
+                                                               int8_t *final_order, const int64_t batch_num,
+                                                               const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                               const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, uint8_t>(uint8_t *lu_pivots_ptr, half *pivots_ptr,
+                                                                uint8_t *final_order, const int64_t batch_num,
+                                                                const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                                                const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int64_t>(int64_t *lu_pivots_ptr, int64_t *pivots_ptr,
+                                                                   int64_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int32_t>(int32_t *lu_pivots_ptr, int64_t *pivots_ptr,
+                                                                   int32_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int16_t>(int16_t *lu_pivots_ptr, int64_t *pivots_ptr,
+                                                                   int16_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int8_t>(int8_t *lu_pivots_ptr, int64_t *pivots_ptr,
+                                                                  int8_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, uint8_t>(uint8_t *lu_pivots_ptr, int64_t *pivots_ptr,
+                                                                   uint8_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int64_t>(int64_t *lu_pivots_ptr, int32_t *pivots_ptr,
+                                                                   int64_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int32_t>(int32_t *lu_pivots_ptr, int32_t *pivots_ptr,
+                                                                   int32_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int16_t>(int16_t *lu_pivots_ptr, int32_t *pivots_ptr,
+                                                                   int16_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int8_t>(int8_t *lu_pivots_ptr, int32_t *pivots_ptr,
+                                                                  int8_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, uint8_t>(uint8_t *lu_pivots_ptr, int32_t *pivots_ptr,
+                                                                   uint8_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int64_t>(int64_t *lu_pivots_ptr, int16_t *pivots_ptr,
+                                                                   int64_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int32_t>(int32_t *lu_pivots_ptr, int16_t *pivots_ptr,
+                                                                   int32_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int16_t>(int16_t *lu_pivots_ptr, int16_t *pivots_ptr,
+                                                                   int16_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int8_t>(int8_t *lu_pivots_ptr, int16_t *pivots_ptr,
+                                                                  int8_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, uint8_t>(uint8_t *lu_pivots_ptr, int16_t *pivots_ptr,
+                                                                   uint8_t *final_order, const int64_t batch_num,
+                                                                   const int64_t lu_data_dim1,
+                                                                   const int64_t lu_pivots_dim,
+                                                                   const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int64_t>(int64_t *lu_pivots_ptr, int8_t *pivots_ptr,
+                                                                  int64_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int32_t>(int32_t *lu_pivots_ptr, int8_t *pivots_ptr,
+                                                                  int32_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int16_t>(int16_t *lu_pivots_ptr, int8_t *pivots_ptr,
+                                                                  int16_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int8_t>(int8_t *lu_pivots_ptr, int8_t *pivots_ptr,
+                                                                 int8_t *final_order, const int64_t batch_num,
+                                                                 const int64_t lu_data_dim1,
+                                                                 const int64_t lu_pivots_dim, const uint32_t &device_id,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, uint8_t>(uint8_t *lu_pivots_ptr, int8_t *pivots_ptr,
+                                                                  uint8_t *final_order, const int64_t batch_num,
+                                                                  const int64_t lu_data_dim1,
+                                                                  const int64_t lu_pivots_dim,
+                                                                  const uint32_t &device_id, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalTriuAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint16_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint32_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint64_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
+                                                 const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                 int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<int>(const size_t size, const int *input, const int64_t out_row,
+                                              const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                              int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<half>(const size_t size, const half *input, const int64_t out_row,
+                                               const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                               half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<float>(const size_t size, const float *input, const int64_t out_row,
+                                                const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<double>(const size_t size, const double *input, const int64_t out_row,
+                                                 const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                 double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTriuAux<bool>(const size_t size, const bool *input, const int64_t out_row,
+                                               const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                               bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalTrilAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint16_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint32_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
+                                                   const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                   uint64_t *output, const uint32_t &device_id,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
+                                                 const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                 int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<int>(const size_t size, const int *input, const int64_t out_row,
+                                              const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                              int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
+                                                  const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                  int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<half>(const size_t size, const half *input, const int64_t out_row,
+                                               const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                               half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<float>(const size_t size, const float *input, const int64_t out_row,
+                                                const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<double>(const size_t size, const double *input, const int64_t out_row,
+                                                 const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                                 double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTrilAux<bool>(const size_t size, const bool *input, const int64_t out_row,
+                                               const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
+                                               bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh
@ -0,0 +1,35 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+
+template <typename T, typename S>
+CUDA_LIB_EXPORT void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
+                                        const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
+                                        const uint32_t &device_id, cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
+                                const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
+                                cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
+                                const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
+                                cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.cc
@ -0,0 +1,455 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.h"
+namespace mindspore {
+namespace kernel {
+namespace {
+constexpr size_t kFirstDim = 1;
+constexpr size_t kSecondDim = 2;
+}  // namespace
+
+void LuUnpackGpuKernelMod::ResetResource() noexcept {
+  input_size_list_.clear();
+  output_size_list_.clear();
+  workspace_size_list_.clear();
+}
+
+template <typename T, typename S>
+bool LuUnpackGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                                        const std::vector<AddressPtr> &outputs) {
+  T *lu_data_ptr = GetDeviceAddress<T>(inputs, kIndex0);
+  MS_EXCEPTION_IF_NULL(lu_data_ptr);
+  S *lu_pivots_ptr = GetDeviceAddress<S>(inputs, kIndex1);
+  MS_EXCEPTION_IF_NULL(lu_pivots_ptr);
+
+  T *pivots_ptr = GetDeviceAddress<T>(outputs, kIndex0);
+  T *l_ptr = GetDeviceAddress<T>(outputs, kIndex1);
+  T *u_ptr = GetDeviceAddress<T>(outputs, kIndex2);
+  S *final_order = GetDeviceAddress<S>(workspace, kIndex0);
+
+  // triu and tril
+  if (lu_data_dim1_ > lu_data_dim2_) {
+    CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim2_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
+               device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
+    CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, l_ptr,
+               device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
+  } else {
+    CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
+               device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
+    CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim1_, lu_data_dim1_, lu_data_dim2_, l_ptr,
+               device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
+  }
+
+  // swap and index_select
+  TransposeEyeMatrix(lu_pivots_ptr, pivots_ptr, final_order, batch_num_, lu_data_dim1_, lu_pivots_dim_, device_id_,
+                     reinterpret_cast<cudaStream_t>(cuda_stream_));
+  return true;
+}
+
+bool LuUnpackGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                const std::vector<KernelTensorPtr> &outputs) {
+  kernel_name_ = base_operator->name();
+
+  if (inputs.empty() || outputs.empty()) {
+    MS_LOG(ERROR) << "For '" << kernel_name_ << "' got empty inputs or outputs, which is invalid.";
+    return false;
+  }
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
+  auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
+  if (!is_match) {
+    MS_LOG(EXCEPTION) << "LuUnpack does not support this kernel data type: " << kernel_attr << ".";
+    return false;
+  }
+
+  kernel_func_ = func_list_[index].second;
+  unit_size1_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
+  unit_size2_ = abstract::TypeIdSize(inputs.at(kIndex1)->GetDtype());
+
+  return true;
+}
+
+int LuUnpackGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                 const std::vector<KernelTensorPtr> &outputs,
+                                 const std::map<uint32_t, tensor::TensorPtr> &) {
+  int ret = KernelMod::Resize(base_operator, inputs, outputs);
+  if (ret != KRET_OK) {
+    return ret;
+  }
+
+  std::vector<int64_t> lu_data_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
+                                                            inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
+  std::vector<int64_t> lu_pivots_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
+                                                              inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
+  lu_data_size_ = size_t(std::accumulate(lu_data_shape.begin(), lu_data_shape.end(), 1, std::multiplies<int64_t>()));
+  lu_pivots_size_ =
+    size_t(std::accumulate(lu_pivots_shape.begin(), lu_pivots_shape.end(), 1, std::multiplies<int64_t>()));
+
+  auto lu_data_shape_vec = inputs[kIndex0]->GetShapeVector();
+  auto lu_pivots_shape_vec = inputs[kIndex1]->GetShapeVector();
+  size_t lu_data_dims = lu_data_shape_vec.size();
+  size_t lu_pivots_dims = lu_pivots_shape_vec.size();
+  std::vector<int64_t> lu_data_dims_vector = lu_data_shape_vec;
+  std::vector<int64_t> lu_pivots_dims_vector = lu_pivots_shape_vec;
+
+  lu_data_dim1_ = lu_data_dims_vector[lu_data_dims - kSecondDim];
+  lu_data_dim2_ = lu_data_dims_vector[lu_data_dims - kFirstDim];
+  if (lu_data_dim1_ > lu_data_dim2_) {
+    l_stride_ = lu_data_dim1_ * lu_data_dim2_;
+    u_stride_ = lu_data_dim2_ * lu_data_dim2_;
+  } else {
+    l_stride_ = lu_data_dim1_ * lu_data_dim1_;
+    u_stride_ = lu_data_dim1_ * lu_data_dim2_;
+  }
+  lu_pivots_dim_ = lu_pivots_dims_vector[lu_pivots_dims - kFirstDim];
+  batch_num_ = lu_data_size_ / (lu_data_dim1_ * lu_data_dim2_);
+  workspace_size_list_.push_back(batch_num_ * lu_data_dim1_ * unit_size2_);
+  return KRET_OK;
+}
+
+bool LuUnpackGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                                  const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  cuda_stream_ = stream_ptr;
+  return kernel_func_(this, inputs, workspace, outputs);
+}
+
+std::vector<KernelAttr> LuUnpackGpuKernelMod::GetOpSupport() {
+  std::vector<KernelAttr> support_list;
+  (void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
+                       [](const std::pair<KernelAttr, LuUnpackFunc> &pair) { return pair.first; });
+  return support_list;
+}
+
+std::vector<std::pair<KernelAttr, LuUnpackGpuKernelMod::LuUnpackFunc>> LuUnpackGpuKernelMod::func_list_ = {
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGpuKernelMod::LaunchKernel<double, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGpuKernelMod::LaunchKernel<double, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGpuKernelMod::LaunchKernel<double, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGpuKernelMod::LaunchKernel<double, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGpuKernelMod::LaunchKernel<double, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGpuKernelMod::LaunchKernel<float, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGpuKernelMod::LaunchKernel<float, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGpuKernelMod::LaunchKernel<float, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGpuKernelMod::LaunchKernel<float, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGpuKernelMod::LaunchKernel<float, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGpuKernelMod::LaunchKernel<half, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGpuKernelMod::LaunchKernel<half, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGpuKernelMod::LaunchKernel<half, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGpuKernelMod::LaunchKernel<half, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGpuKernelMod::LaunchKernel<half, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGpuKernelMod::LaunchKernel<int64_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGpuKernelMod::LaunchKernel<int64_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGpuKernelMod::LaunchKernel<int64_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGpuKernelMod::LaunchKernel<int64_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGpuKernelMod::LaunchKernel<int64_t, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGpuKernelMod::LaunchKernel<int16_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGpuKernelMod::LaunchKernel<int16_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGpuKernelMod::LaunchKernel<int16_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGpuKernelMod::LaunchKernel<int16_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGpuKernelMod::LaunchKernel<int16_t, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>}};
+
+MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpack, LuUnpackGpuKernelMod);
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.h
@ -0,0 +1,78 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
+#include <map>
+#include <vector>
+#include <string>
+#include <memory>
+#include <utility>
+#include <algorithm>
+#include <functional>
+#include "mindspore/core/ops/lu_unpack.h"
+#include "plugin/device/gpu/kernel/gpu_kernel.h"
+#include "plugin/factory/ms_factory.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
+#include "plugin/device/gpu/hal/device/gpu_device_address.h"
+#include "utils/check_convert_utils.h"
+#include "abstract/ops/primitive_infer_map.h"
+
+namespace mindspore {
+constexpr size_t kInputNum = 2;
+constexpr size_t kOutputNum = 3;
+namespace kernel {
+class LuUnpackGpuKernelMod : public NativeGpuKernelMod {
+ public:
+  LuUnpackGpuKernelMod() { ResetResource(); }
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+            const std::vector<KernelTensorPtr> &outputs) override;
+  int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+             const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
+
+ protected:
+  std::vector<KernelAttr> GetOpSupport() override;
+
+ private:
+  using LuUnpackFunc =
+    std::function<bool(LuUnpackGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
+                       const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
+  LuUnpackFunc kernel_func_;
+  void *cuda_stream_{nullptr};
+  static std::vector<std::pair<KernelAttr, LuUnpackFunc>> func_list_;
+
+  size_t lu_data_size_;
+  size_t lu_pivots_size_;
+  int64_t lu_data_dim1_;
+  int64_t lu_data_dim2_;
+  int64_t l_stride_;
+  int64_t u_stride_;
+  int64_t lu_pivots_dim_;
+  int64_t batch_num_;
+  size_t unit_size1_;
+  size_t unit_size2_;
+
+  void ResetResource() noexcept;
+  template <typename T_data, typename T_pivots>
+  bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                    const std::vector<AddressPtr> &outputs);
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.cc
@ -0,0 +1,213 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+namespace {
+constexpr const size_t kLuUnpackGradInputNum = 3;
+constexpr const size_t kLuUnpackGradOutputNum = 2;
+constexpr const int64_t Min_Dim = 2;
+}  // namespace
+
+bool LuUnpackGradGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                    const std::vector<KernelTensorPtr> &outputs) {
+  auto kernel_ptr = std::dynamic_pointer_cast<ops::LuUnpackGrad>(base_operator);
+  MS_ERROR_IF_NULL_W_RET_VAL(kernel_ptr, false);
+
+  kernel_name_ = kernel_ptr->name();
+  if (inputs.size() != kLuUnpackGradInputNum || outputs.size() != kLuUnpackGradOutputNum) {
+    MS_LOG(ERROR) << "For '" << kernel_name_ << "', input and output size must be " << kLuUnpackGradInputNum << " and "
+                  << kLuUnpackGradOutputNum << ", but got " << inputs.size() << " and " << outputs.size();
+    return false;
+  }
+
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
+  auto pair = MatchKernelAttr(kernel_attr, GetOpSupport());
+  if (!pair.first) {
+    MS_LOG(ERROR) << "'" << kernel_name_ << "' does not support this kernel data type: " << kernel_attr;
+    return false;
+  }
+
+  kernel_func_ = func_list_[pair.second].second;
+  unit_size_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
+  input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
+                                       inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
+  input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
+                                       inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
+  input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
+                                        inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
+  return true;
+}
+
+int LuUnpackGradGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                     const std::vector<KernelTensorPtr> &outputs,
+                                     const std::map<uint32_t, tensor::TensorPtr> &others) {
+  int ret = KernelMod::Resize(base_operator, inputs, outputs);
+  if (ret != KRET_OK) {
+    return ret;
+  }
+
+  ResetResource();
+  input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
+                                       inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
+  input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
+                                       inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
+  input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
+                                        inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
+  int64_t input_L_elements_ =
+    std::accumulate(input_L_shape.begin(), input_L_shape.end(), int64_t(1), std::multiplies<int64_t>());
+  int64_t input_U_elements_ =
+    std::accumulate(input_U_shape.begin(), input_U_shape.end(), int64_t(1), std::multiplies<int64_t>());
+  int64_t input_LU_elements_ =
+    std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
+  input_elements_ = input_L_elements_ + input_U_elements_ + input_LU_elements_;
+  int64_t input_L_size = input_L_elements_ * unit_size_;
+  int64_t input_U_size = input_U_elements_ * unit_size_;
+  int64_t input_LU_size = input_LU_elements_ * unit_size_;
+  input_size_list_.emplace_back(input_L_size);
+  input_size_list_.emplace_back(input_U_size);
+  input_size_list_.emplace_back(input_LU_size);
+  output_size_list_.emplace_back(input_LU_size);
+  output_size_list_.emplace_back(input_LU_size);
+  workspace_size_list_.emplace_back(input_L_size);
+  workspace_size_list_.emplace_back(input_U_size);
+
+  return KRET_OK;
+}
+
+void LuUnpackGradGpuKernelMod::ResetResource() noexcept {
+  input_elements_ = 0;
+  is_null_input_ = false;
+  input_size_list_.clear();
+  output_size_list_.clear();
+  workspace_size_list_.clear();
+}
+
+template <typename T>
+bool LuUnpackGradGpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
+                                            const std::vector<AddressPtr> &workspace,
+                                            const std::vector<kernel::AddressPtr> &outputs) {
+  T *l_grad_input = GetDeviceAddress<T>(inputs, kIndex0);
+  T *u_grad_input = GetDeviceAddress<T>(inputs, kIndex1);
+  T *l_grad_output = GetDeviceAddress<T>(outputs, kIndex0);
+  T *u_grad_output = GetDeviceAddress<T>(outputs, kIndex1);
+
+  int64_t lu_data_dims = input_LU_shape.size();
+  int64_t lu_data_elem_num =
+    std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
+  int64_t lu_data_height = input_LU_shape[lu_data_dims - 2];
+  int64_t lu_data_width = input_LU_shape[lu_data_dims - 1];
+  int64_t lu_data_stride = lu_data_height * lu_data_width;
+  int64_t matrix_num = lu_data_elem_num / lu_data_stride;
+  int64_t output_stride = lu_data_stride;
+
+  int64_t input_L_dims = input_L_shape.size();
+  int64_t input_U_dims = input_U_shape.size();
+  int64_t matrix_L_width = input_L_shape[input_L_dims - 2];
+  int64_t matrix_L_height = input_L_shape[input_L_dims - 1];
+  int64_t matrix_U_width = input_U_shape[input_U_dims - 2];
+  int64_t matrix_U_height = input_U_shape[input_U_dims - 1];
+
+  if (lu_data_width > lu_data_height) {
+    CalTrilExpendWidth(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
+                       lu_data_height, lu_data_width, device_id_, stream_ptr_);
+    CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
+                 lu_data_height, lu_data_width, device_id_, stream_ptr_);
+  } else {
+    if (lu_data_height > lu_data_width) {
+      CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
+                   lu_data_height, lu_data_width, device_id_, stream_ptr_);
+      CalTriuExpendHeight(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
+                          lu_data_height, lu_data_width, device_id_, stream_ptr_);
+    } else {
+      CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
+                   lu_data_height, lu_data_width, device_id_, stream_ptr_);
+      CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
+                   lu_data_height, lu_data_width, device_id_, stream_ptr_);
+    }
+  }
+  return true;
+}
+
+std::vector<std::pair<KernelAttr, LuUnpackGradGpuKernelMod::LuUnpackGradFunc>> LuUnpackGradGpuKernelMod::func_list_ = {
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<double>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<float>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<half>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddInputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64)
+     .AddOutputAttr(kNumberTypeInt64),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<int64_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<int32_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddInputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16)
+     .AddOutputAttr(kNumberTypeInt16),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<int16_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddInputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8)
+     .AddOutputAttr(kNumberTypeInt8),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<int8_t>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddInputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8)
+     .AddOutputAttr(kNumberTypeUInt8),
+   &LuUnpackGradGpuKernelMod::LaunchKernel<uint8_t>}};
+
+std::vector<KernelAttr> LuUnpackGradGpuKernelMod::GetOpSupport() {
+  std::vector<KernelAttr> support_list;
+  (void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
+                       [](const std::pair<KernelAttr, LuUnpackGradFunc> &pair) { return pair.first; });
+  return support_list;
+}
+
+MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpackGrad, LuUnpackGradGpuKernelMod);
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.h
@ -0,0 +1,81 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
+
+#include <vector>
+#include <map>
+#include <functional>
+#include <algorithm>
+#include <utility>
+#include "mindspore/core/ops/grad/lu_unpack_grad.h"
+#include "plugin/device/gpu/kernel/gpu_kernel.h"
+#include "plugin/factory/ms_factory.h"
+#include "plugin/device/gpu/hal/device/gpu_device_address.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+class LuUnpackGradGpuKernelMod : public NativeGpuKernelMod {
+ public:
+  LuUnpackGradGpuKernelMod() { ResetResource(); }
+  ~LuUnpackGradGpuKernelMod() override = default;
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    if (is_null_input_) {
+      return true;
+    }
+    VARIABLE_NOT_USED(workspace);
+    stream_ptr_ = reinterpret_cast<cudaStream_t>(stream_ptr);
+    return kernel_func_(this, inputs, workspace, outputs);
+  }
+
+  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+            const std::vector<KernelTensorPtr> &outputs) override;
+
+  int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+             const std::vector<KernelTensorPtr> &outputs,
+             const std::map<uint32_t, tensor::TensorPtr> &others = std::map<uint32_t, tensor::TensorPtr>()) override;
+
+  std::vector<KernelAttr> GetOpSupport() override;
+
+  void ResetResource() noexcept;
+
+ private:
+  template <typename T>
+  bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                    const std::vector<kernel::AddressPtr> &outputs);
+
+  using LuUnpackGradFunc =
+    std::function<bool(LuUnpackGradGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
+                       const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
+  static std::vector<std::pair<KernelAttr, LuUnpackGradFunc>> func_list_;
+  LuUnpackGradFunc kernel_func_;
+
+  bool is_null_input_{false};
+  int64_t unit_size_{1};
+  int64_t input_elements_;
+  std::vector<int64_t> input_L_shape;
+  std::vector<int64_t> input_U_shape;
+  std::vector<int64_t> input_LU_shape;
+  cudaStream_t stream_ptr_{nullptr};
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
--- a/mindspore/core/ops/grad/lu_unpack_grad.cc
+++ b/mindspore/core/ops/grad/lu_unpack_grad.cc
@ -45,6 +45,7 @@ abstract::TupleShapePtr LuUnpackGradInferShape(const PrimitivePtr &primitive,
  auto LU_data_dim1 = LU_data_shape[LU_data_shape.size() - 2];
  auto LU_data_dim2 = LU_data_shape[LU_data_shape.size() - 1];
  int64_t LU_data_min = std::min(LU_data_dim1, LU_data_dim2);
+
  if (LU_data_dim1 != L_data_dim1) {
    MS_EXCEPTION(ValueError) << "For '" << primitive->name()
                             << "', L_grad's data dim[-2] and LU_data's dim[-2] should be same.";
--- a/mindspore/core/ops/lu_unpack.cc
+++ b/mindspore/core/ops/lu_unpack.cc
@ -42,6 +42,7 @@ abstract::TupleShapePtr LuUnpackInferShape(const PrimitivePtr &primitive,
  auto LU_data_shape = LU_data_shape_map[kShape];
  auto LU_pivots_shape_map = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape());
  auto LU_pivots_shape = LU_pivots_shape_map[kShape];
+
  if (IsDynamicRank(LU_data_shape)) {
    auto output_shpe = std::make_shared<abstract::Shape>(LU_data_shape);
    return std::make_shared<abstract::TupleShape>(
--- a/mindspore/python/mindspore/ops/operations/math_ops.py
+++ b/mindspore/python/mindspore/ops/operations/math_ops.py
@ -6388,7 +6388,7 @@ class LuUnpack(Primitive):
        RuntimeError: On the Ascend platform, if the value of `LU_pivots` are out of range[1, LU_data.shape[-2]).

    Supported Platforms:
-        ``Ascend`` ``CPU``
+        ``Ascend`` ``GPU`` ``CPU``

    Examples:
        >>> LU_data = Tensor(np.array([[[-0.3806, -0.4872,  0.5536],
@ -6399,11 +6399,32 @@ class LuUnpack(Primitive):
        ...                             [ 0.1015, -0.5363,  0.6165]]]), mstype.float32)
        >>> LU_pivots = Tensor(np.array([[1, 3, 3],
        ...                              [2, 3, 3]]), mstype.int32)
-        >>> lu_unpack = ops.LuUnpack()
-        >>> pivots, L, U = lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots)
+        >>> lu_unpack = LuUnpack()
+        >>> pivots, L, U = lu_unpack(LU_data, LU_pivots)
        >>> print(pivots)
+        [[[1. 0. 0.]
+          [0. 0. 1.]
+          [0. 1. 0.]]
+        <BLANKLINE>
+         [[0. 0. 1.]
+          [1. 0. 0.]
+          [0. 1. 0.]]]
        >>> print(L)
+        [[[ 1.      0.      0.    ]
+          [-0.1287  1.      0.    ]
+          [ 0.2583  0.5239  1.    ]]
+        <BLANKLINE>
+         [[ 1.      0.      0.    ]
+          [-0.6401  1.      0.    ]
+          [ 0.1015 -0.5363  1.    ]]]
        >>> print(U)
+        [[[-0.3806 -0.4872  0.5536]
+          [ 0.      0.6508 -0.2396]
+          [ 0.      0.      0.6902]]
+        <BLANKLINE>
+         [[ 0.6706 -1.1782  0.4574]
+          [ 0.     -0.4779  0.6701]
+          [ 0.      0.      0.6165]]]
    """

    @prim_attr_register
@ -6491,10 +6512,6 @@ class Polygamma(Primitive):
    .. math::
    \psi^{(a)}(x) = \frac{d^{(a)}}{dx^{(a)}} \psi(x)

-    Args:
-        :math:`a \geq 0`: the order of the polygamma function.
-        input (Tensor): the tensor to compute the polygamma function.
-
    Inputs:
        - **a** (Tensor) - The order of the polygamma function, types: int32, int64.
        - **x** (Tensor) - The input tensor, types: float16, float32, float64.
--- a/tests/st/ops/gpu/test_lu_unpack_grad_op.py
+++ b/tests/st/ops/gpu/test_lu_unpack_grad_op.py
@ -0,0 +1,126 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+import torch
+
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops.operations import _grad_ops as G
+import mindspore.common.dtype as mstype
+
+
+class NetLuUnpackGrad(nn.Cell):
+    def __init__(self, l_grad_flag=True, u_grad_flag=True):
+        super().__init__()
+        self.lu_unpack_grad = G.LuUnpackGrad(L_grad_flag=l_grad_flag, U_grad_flag=u_grad_flag)
+
+    def construct(self, l_grad, u_grad, lu_data):
+        return self.lu_unpack_grad(l_grad, u_grad, lu_data)
+
+
+def getl(a_lu, pivots):
+    a_lu_float = a_lu.float()
+    a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
+    out_torch = torch.lu_unpack(a_lu_torch, pivots)
+    out_torch[1].backward(gradient=out_torch[1])
+    return a_lu_torch.grad
+
+
+def getu(a_lu, pivots):
+    a_lu_float = a_lu.float()
+    a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
+    out_torch = torch.lu_unpack(a_lu_torch, pivots)
+    out_torch[2].backward(gradient=out_torch[2])
+    return a_lu_torch.grad
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_grad_graph_float():
+    """
+    Feature: LuUnpackGrad gpu TEST.
+    Description: 4d - float32 test case for LuUnpackGrad
+    Expectation: the result match to numpy
+    """
+    a = torch.randn(3, 3, 3, 3)
+    loss = 1e-5
+    a_lu, pivots = a.lu()
+    out = torch.lu_unpack(a_lu, pivots)
+
+    a_l_torch = getl(a_lu, pivots)
+    a_u_torch = getu(a_lu, pivots)
+
+    a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
+    a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
+    a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
+
+    net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
+    l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
+
+    assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
+    assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_grad_pynative_float():
+    """
+    Feature: LuUnpackGrad gpu TEST.
+    Description: 4d - float32 test case for LuUnpackGrad
+    Expectation: the result match to numpy
+    """
+    a = torch.randn(5, 4, 3, 2)
+    loss = 1e-5
+    a_lu, pivots = a.lu()
+    out = torch.lu_unpack(a_lu, pivots)
+
+    a_l_torch = getl(a_lu, pivots)
+    a_u_torch = getu(a_lu, pivots)
+
+    a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
+    a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
+    a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
+
+    net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
+    l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
+
+    assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
+    assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_grad_pynative_float_error():
+    """
+    Feature: LuUnpackGrad gpu TEST.
+    Description: 4d - uint16 test the unsupported type for LuUnpackGrad
+    Expectation: the result match to numpy
+    """
+    a = torch.randn(8, 7, 4, 2)
+    a_lu, pivots = a.lu()
+    out = torch.lu_unpack(a_lu, pivots)
+
+    a_l_mindspore = Tensor(out[1].numpy(), mstype.uint16)
+    a_u_mindspore = Tensor(out[2].numpy(), mstype.uint16)
+    a_lu_mindspore = Tensor(a_lu.numpy(), mstype.uint16)
+
+    with pytest.raises(TypeError):
+        net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
+        net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
--- a/tests/st/ops/gpu/test_lu_unpack_op
+++ b/tests/st/ops/gpu/test_lu_unpack_op
@ -0,0 +1,113 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops.operations.math_ops import LuUnpack
+import mindspore.common.dtype as mstype
+import torch
+import pytest
+
+
+class LuUnpackNet(nn.Cell):
+    def __init__(self, unpack_data=True, unpack_pivots=True):
+        super(LuUnpackNet, self).__init__()
+        self.lu_unpack = LuUnpack(unpack_data=unpack_data, unpack_pivots=unpack_pivots)
+
+    def construct(self, LU_data, LU_pivots):
+        return self.lu_unpack(LU_data, LU_pivots)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_float32_int64():
+    """
+    Feature: ALL To ALL
+    Description: test cases for LuUnpack
+    Expectation: the result match to torch
+    """
+    my_atol = 1e-5
+    data = torch.randn(10, 7, 8, 9)
+    a_lu, pivots = data.lu()
+    p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
+    my_a_lu = Tensor(a_lu.numpy(), mstype.float32)
+    my_pivots = Tensor(pivots.numpy(), mstype.int64)
+    net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
+    my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
+    assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
+    assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
+    assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_float64_int32():
+    """
+    Feature: ALL To ALL
+    Description: test cases for LuUnpack
+    Expectation: the result match to torch
+    """
+    my_atol = 1e-5
+    data = torch.randn(7, 8, 9, 10)
+    a_lu, pivots = data.lu()
+    p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
+    my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
+    my_pivots = Tensor(pivots.numpy(), mstype.int32)
+    net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
+    my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
+    assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
+    assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
+    assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_float64_int16():
+    """
+    Feature: ALL To ALL
+    Description: test cases for LuUnpack
+    Expectation: the result match to torch
+    """
+    my_atol = 1e-5
+    data = torch.randn(4, 6, 8, 10)
+    a_lu, pivots = data.lu()
+    p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
+    my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
+    my_pivots = Tensor(pivots.numpy(), mstype.int16)
+    net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
+    my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
+    assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
+    assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
+    assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_lu_unpack_input_error():
+    """
+    Feature: ALL To ALL
+    Description: test cases for LuUnpack
+    Expectation: raise ValueError
+    """
+    my_a_lu = Tensor(np.random.randn(3, 3, 2, 2).astype(np.float32))
+    my_pivots = Tensor(np.random.randn(3, 3, 5, 3).astype(np.int32))
+    with pytest.raises(ValueError):
+        net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
+        net(my_a_lu, my_pivots)