diff --git a/cmake/package.cmake b/cmake/package.cmake
index da75fd07177..8ed2e7d448c 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -212,6 +212,11 @@ if(ENABLE_GPU)
         DESTINATION ${INSTALL_LIB_DIR}
         COMPONENT mindspore
     )
+    install(
+        TARGETS cuda_ops
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
 endif()
 
 if(ENABLE_D)
diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 6bca1693595..46114fd7c41 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -114,6 +114,8 @@ if(ENABLE_GPU)
             "plugin/device/gpu/kernel/*.cu"
             )
 
+    # cuda_ops sources build as their own target; REMOVE_ITEM cannot glob, so filter them out by regex.
+    list(FILTER GPU_SRC_LIST EXCLUDE REGEX "plugin/device/gpu/kernel/cuda_impl/cuda_ops/.*\\.cu$")
     list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
     list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/blocking_queue.cc"
             "plugin/device/gpu/hal/device/gpu_buffer_mgr.cc")
@@ -145,6 +147,8 @@ if(ENABLE_GPU)
     cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
     set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
     add_compile_definitions(ENABLE_GPU)
+
+    add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops)
 endif()
 
 
@@ -430,7 +434,7 @@ endif()
 
 if(ENABLE_GPU)
     message("add gpu lib to c_expression")
-    target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas
+    target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops
                           ${CUDA_PATH}/lib64/libcurand.so
                           ${CUDNN_LIBRARY_PATH}
                           ${CUDA_PATH}/lib64/libcudart.so
diff --git a/mindspore/ccsrc/cxx_api/CMakeLists.txt b/mindspore/ccsrc/cxx_api/CMakeLists.txt
index 4502114e38b..9db16b44d7e 100644
--- a/mindspore/ccsrc/cxx_api/CMakeLists.txt
+++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt
@@ -140,7 +140,7 @@ if(ENABLE_D)
 endif()
 
 if(ENABLE_GPU)
-    target_link_libraries(mindspore_shared_lib PRIVATE  gpu_cuda_lib gpu_queue cublas
+    target_link_libraries(mindspore_shared_lib PRIVATE  gpu_cuda_lib gpu_queue cublas cuda_ops
                           ${CUDA_PATH}/lib64/libcurand.so
                           ${CUDNN_LIBRARY_PATH}
                           ${CUDA_PATH}/lib64/libcudart.so
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
index 1b4f2f5f277..77e6aa1cb12 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
@@ -26,7 +26,7 @@
 #include "kernel/oplib/oplib.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "plugin/device/gpu/kernel/custom/custom_aot_gpu_kernel.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "utils/ms_context.h"
 #include "utils/ms_utils.h"
 #include "utils/utils.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
index 98d00fdd667..40d77f8d8b4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
@@ -28,7 +28,7 @@
 #include "plugin/device/gpu/hal/device/gpu_buffer_mgr.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/hal/hardware/optimizer.h"
 #include "utils/ms_device_shape_transfer.h"
 #include "utils/context/graph_kernel_flags.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h
index 1dc20713dcf..cd09eb7013e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h
index 8644187a16e..72cbed431f1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
index 56ab101e245..f6a7b8b3a6e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
index ca70b194841..b687014f980 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h
index 73ed5971427..664529e9ae9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h
index b79dd918910..c81ac71a433 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h
index 5e00ba80955..5a9c18ee4e5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h
index 02e33795e73..8b3458dd759 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h
index 7ef4a9889a8..a4efb38bae0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h
@@ -21,7 +21,7 @@
 
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h
index f8102103578..52579606904 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h
index 7355325d270..52d091066d2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h
@@ -23,8 +23,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h
index 4de4011a5d3..5beb63dcb3b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gather.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h
index 045d0ee49ce..a106a12fb3b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h
index 32022f027a0..70ea77f3805 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh"
 #include "backend/common/session/anf_runtime_algorithm.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h
index 04a786f5459..c4a63212d4f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh"
 #include "backend/common/session/anf_runtime_algorithm.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h
index d460e34db07..d75de64118e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h
@@ -22,9 +22,9 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h
index b4ddfcefb48..a8538d0fcda 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h
@@ -23,9 +23,9 @@
 #include <cuda_runtime.h>
 #include <vector>
 #include <algorithm>
-#include "utils/complex.h"
-#include "plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h
index 48f9a2776fe..8527e876bba 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h
@@ -25,9 +25,9 @@
 #include <string>
 #include <utility>
 #include <algorithm>
-#include "utils/complex.h"
-#include "plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h
index 658eb666738..b0bc14f5744 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h
@@ -26,7 +26,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h
index 52ba3e02d32..3271c11e2f9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <utility>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h
index b1f2b2b4fe0..cf2893f063b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h
index a12bc6ddc76..2e54f687f5e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h
@@ -19,7 +19,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h
index d945d1f92aa..1e9dc564d18 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/pack.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h
index 13da8525029..7acade9fcde 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h
@@ -19,7 +19,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/range_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr float kStartDefault = 0.;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h
index fe3c9617f54..99cdf519e85 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h
index 06921785fad..b85f75b2347 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h
index bc46f767c85..d073c531019 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <iostream>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h
index dbcbaef9101..251e856fc78 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h
@@ -22,7 +22,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h
index 0acacb8208b..b960bee68fb 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h
index c52a96b63ba..7d512f1e826 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h
index 9b042838429..79b1a9f7559 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h
index b3da2563db2..12bb75f4245 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h
index 192cf9af970..36b8f57396d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h
index e2b3f1ba47d..1291ac7204c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <utility>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h
index 3b9ea35551a..0cb93e68e98 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h
@@ -24,9 +24,9 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h
index e37838dfe29..81ffb8ee054 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h
index f090ed396b2..7edb4c39205 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h
index 3e92763552d..c3016b7ad73 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h
index f73e80dd794..86a44e7c3db 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h
index 31a1a5b963c..0354b2cac75 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h
index 4b79e3873b0..1afea2fd7c6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h
@@ -25,7 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "kernel/common_utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h
index 34923826749..d4a3d268ea5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <string>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h
index 57f15c53a38..ba7fdb4b693 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <algorithm>
 
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h
index d16e3c31e1c..60af63b88b0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h
index 9cc4e01841b..6d1a44c9864 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <algorithm>
 
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h
index afc25e08221..6f110d216e7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h
index b27e0747d20..57eec01d03b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h
index d0a1ed551e8..9bef68dfbaf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h
@@ -21,8 +21,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h
index 7ed3f305b07..9990dc0b04c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h
@@ -21,8 +21,8 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t kDimSize4 = 4;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
index 1970d2ff4dd..a27a62d0585 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h
index 563a27c9f98..e8265dcbe7a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h
index 70d03c13279..d94c242202c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <limits>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h
index 6c7ca40378a..915e098f585 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <limits>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h
index c4b04629717..5e8c9e83a52 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh
deleted file mode 100644
index 8f76c51e01a..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S, typename G>
-void ApplyAdagrad(const size_t size,
-                  const bool update_slots,
-                  const S *learning_rate,
-                  const G *gradient,
-                  T *variable,
-                  T *accumulation,
-                  cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh
deleted file mode 100644
index 65a388b1fb2..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate,
-               const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream);
-template <typename T>
-void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate, const float *beta1,
-                       const float *beta2, const float *epsilon, const float *decay, T *variable, T *m, T *v,
-                       cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh
deleted file mode 100644
index 2addffbf002..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
-template <typename T>
-void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, const float *one_sub_beta1,
-                     const float *beta2, const float *one_sub_beta2, const float *epsilon, const float *lr,
-                     const float *weight_decay, T *m, T *v, T *param, T *gradient, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh
deleted file mode 100644
index b8c12da9774..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width,
-                                const uint output_height, const uint output_width, T *input_data,
-                                T *output_data, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh
deleted file mode 100644
index caa0418ed38..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, const uint output_height,
-                            const uint output_width, T *input_data, T *output_data, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh
deleted file mode 100644
index 81e10d1d49e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);
-
-template <typename T>
-void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh
deleted file mode 100755
index 5b80eb85b48..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
-template <typename T, typename S>
-void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output,
-               cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh
deleted file mode 100644
index f60b061b3df..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std, const T *batch_mean,
-                           const T *running_std, const T *running_mean, const int *global_step, T *y, int freeze_bn,
-                           size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
-template <typename T>
-void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
-                                    const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
-                                    T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
-template <typename T>
-void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
-                                 const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
-                                 T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
-template <typename T>
-void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N,
-                              size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H,
-                                         size_t W, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh
deleted file mode 100755
index d7ad76c5adc..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_
-
-template <typename T>
-void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,
-                          const T* batch_std, int batch_size, int channel_size, int height, int width, T* dx,
-                          cudaStream_t cuda_stream);
-template <typename T>
-void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BATCHNORM_FOLD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu
deleted file mode 100644
index 4ff5c230f42..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cuda_runtime.h>
-#include "batchtospace_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void BatchToSpace(const size_t size, const T *input, const size_t in,
-                             const size_t ih, const size_t iw, const size_t ic,
-                             const size_t on, const size_t oh, const size_t ow,
-                             const size_t oc, const size_t crop_up, const size_t crop_dn,
-                             const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                             T *output) {
-  size_t temp_stride = 0;
-  size_t temp_pos = 0;
-  size_t idx_on = 0;
-  size_t idx_oc = 0;
-  size_t idx_oh = 0;
-  size_t idx_ow = 0;
-  size_t idx_in = 0;
-  size_t input_pos = 0;
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
-       pos += blockDim.x * gridDim.x) {
-    temp_stride = oc * oh * ow;
-    idx_on = pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= oc;
-    idx_oc = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= oh;
-    idx_oh = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ow;
-    idx_ow = temp_pos / temp_stride;
-
-    idx_in = (((idx_oh + crop_up) % block_num) * block_num + ((idx_ow + crop_lft) % block_num)) * on + idx_on;
-    input_pos = idx_in * ic;
-    input_pos = (input_pos + idx_oc) * ih;
-    input_pos = (input_pos + ((idx_oh + crop_up) - (idx_in / (on * block_num))) / block_num) * iw;
-    input_pos = (input_pos + ((idx_ow + crop_lft) - ((idx_in / on) % block_num)) / block_num);
-    output[pos] = input[input_pos];
-  }
-  return;
-}
-
-template <typename T>
-void CalBatchToSpace(const size_t size, const T *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t crop_up, const size_t crop_dn,
-  const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-  T *output, cudaStream_t cuda_stream) {
-  BatchToSpace<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    size, input, in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output);
-  return;
-}
-
-template void CalBatchToSpace<float>(const size_t size, const float *input, const size_t in,
-                                     const size_t ih, const size_t iw, const size_t ic,
-                                     const size_t on, const size_t oh, const size_t ow,
-                                     const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                     float *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<half>(const size_t size, const half *input, const size_t in,
-                                    const size_t ih, const size_t iw, const size_t ic,
-                                    const size_t on, const size_t oh, const size_t ow,
-                                    const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                    const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                    half *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<int>(const size_t size, const int *input, const size_t in,
-                                   const size_t ih, const size_t iw, const size_t ic,
-                                   const size_t on, const size_t oh, const size_t ow,
-                                   const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                   const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                   int *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<int64_t>(const size_t size, const int64_t *input, const size_t in,
-                                       const size_t ih, const size_t iw, const size_t ic,
-                                       const size_t on, const size_t oh, const size_t ow,
-                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                       int64_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<int16_t>(const size_t size, const int16_t *input, const size_t in,
-                                       const size_t ih, const size_t iw, const size_t ic,
-                                       const size_t on, const size_t oh, const size_t ow,
-                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                       int16_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<int8_t>(const size_t size, const int8_t *input, const size_t in,
-                                      const size_t ih, const size_t iw, const size_t ic,
-                                      const size_t on, const size_t oh, const size_t ow,
-                                      const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                      const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                      int8_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
-                                       const size_t ih, const size_t iw, const size_t ic,
-                                       const size_t on, const size_t oh, const size_t ow,
-                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                       uint8_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
-                                        const size_t ih, const size_t iw, const size_t ic,
-                                        const size_t on, const size_t oh, const size_t ow,
-                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                        uint16_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
-                                        const size_t ih, const size_t iw, const size_t ic,
-                                        const size_t on, const size_t oh, const size_t ow,
-                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                        uint32_t *output, cudaStream_t cuda_stream);
-template void CalBatchToSpace<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
-                                        const size_t ih, const size_t iw, const size_t ic,
-                                        const size_t on, const size_t oh, const size_t ow,
-                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
-                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                                        uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh
deleted file mode 100644
index cbf6a3976a6..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
-template <typename T>
-void CalBatchToSpace(const size_t size, const T *input, const size_t in,
-                     const size_t ih, const size_t iw, const size_t ic,
-                     const size_t on, const size_t oh, const size_t ow,
-                     const size_t oc, const size_t crop_up, const size_t crop_dn,
-                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
-                     T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh
deleted file mode 100644
index 7654f111033..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
-
-#define MAX_LOGITS_DIMENSION 8
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target, const size_t *input_shape,
-                          const size_t shape_size, const T *weight, const size_t *weight_shape,
-                          const bool weight_need_broadcast, const T *pos_weight, const size_t *pos_weight_shape,
-                          const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output,
-                          cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh
deleted file mode 100644
index 40dd5099640..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
-                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
-template <typename T>
-void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
-                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh
deleted file mode 100644
index ed459c57f15..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2,
-                       const float &m3, const float &m4, const float &s1, const float &s2, const float &s3,
-                       const float &s4, const int &max_height, const int &max_width, const float &ratio_clip,
-                       cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh
deleted file mode 100644
index f3345090b95..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1,
-                       const float &m2, const float &m3, const float &m4, const float &s1, const float &s2,
-                       const float &s3, const float &s4, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh
deleted file mode 100644
index 7fbc486ace7..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-enum BroadcastGradOpType {
-  BROADCAST_GRAD_TYPE_MAXIMUM = 0,
-  BROADCAST_GRAD_TYPE_MINIMUM = 1,
-  BROADCAST_GRAD_TYPE_INVALID = 0xffffffff,
-};
-
-template <typename T>
-void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                   const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                   const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const T *x1, const T *x2,
-                   const T *dy, T *dx1, T *dx2, cudaStream_t stream);
-
-template <typename T>
-void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                     const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh
deleted file mode 100644
index 6d17c2a5c84..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
-
-#include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "utils/complex.h"
-
-const float kFloatEplison = 1e-37;
-
-enum BroadcastOpType {
-  BROADCAST_TYPE_GREATER = 0,
-  BROADCAST_TYPE_LESS = 1,
-  BROADCAST_TYPE_MAXIMUM = 2,
-  BROADCAST_TYPE_MINIMUM = 3,
-  BROADCAST_TYPE_POWER = 4,
-  BROADCAST_TYPE_REALDIV = 5,
-  BROADCAST_TYPE_MUL = 6,
-  BROADCAST_TYPE_SUB = 7,
-  BROADCAST_TYPE_ADD = 8,
-  BROADCAST_TYPE_FLOORDIV = 9,
-  BROADCAST_TYPE_ABSGRAD = 10,
-  BROADCAST_TYPE_DIV = 11,
-  BROADCAST_TYPE_DIVNONAN = 12,
-  BROADCAST_TYPE_EQUAL = 13,
-  BROADCAST_TYPE_SQUARED_DIFFERENCE = 14,
-  BROADCAST_TYPE_MOD = 15,
-  BROADCAST_TYPE_FLOORMOD = 16,
-  BROADCAST_TYPE_ATAN2 = 17,
-  BROADCAST_TYPE_GREATER_EQUAL = 18,
-  BROADCAST_TYPE_LESS_EQUAL = 19,
-  BROADCAST_TYPE_NOT_EQUAL = 20,
-  BROADCAST_TYPE_LOGICAL_AND = 21,
-  BROADCAST_TYPE_LOGICAL_OR = 22,
-  BROADCAST_TYPE_TRUNCATEDIV = 23,
-  BROADCAST_TYPE_TRUNCATEMOD = 24,
-  BROADCAST_TYPE_COMPLEX = 25,
-  BROADCAST_TYPE_INVALID = 0xffffffff,
-};
-
-template <typename T>
-void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);
-
-template <typename T>
-void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);
-
-template <typename T1, typename T2, typename T3>
-void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
-                         Complex<T3> *y, cudaStream_t stream);
-
-template <typename T>
-void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                  const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, bool *y,
-                  cudaStream_t stream);
-
-template <typename T>
-void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, T *y,
-                    cudaStream_t stream);
-
-template <typename T1, typename T2, typename T3>
-void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
-                           Complex<T3> *y, cudaStream_t stream);
-template <typename T>
-void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1,
-                           Complex<T> *y, cudaStream_t stream);
-
-template <typename T>
-void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                 const size_t &o1, const size_t &o2, const size_t &o3, const T *input_addr, T *output_addr,
-                 cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu
deleted file mode 100644
index 63e48c0fc08..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu
+++ /dev/null
@@ -1,318 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-#include <iostream>
-
-#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-// Generic cast
-template <typename S, typename T>
-__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
-  *output_addr = static_cast<T>((*input_addr));
-}
-
-// half --> integer
-__device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) {
-  *output_addr = __half2ull_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) {
-  *output_addr = __half2ll_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) {
-  *output_addr = __half2uint_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) {
-  *output_addr = __half2int_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) {
-  *output_addr = __half2ushort_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) {
-  *output_addr = __half2short_rz((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) {
-  *output_addr = static_cast<uint8_t>(__half2ushort_rz((*input_addr)));
-}
-
-__device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) {
-  *output_addr = static_cast<int8_t>(__half2short_rz((*input_addr)));
-}
-
-// integer --> half
-__device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) {
-  *output_addr = __ull2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const int64_t *input_addr, half *output_addr) {
-  *output_addr = __ll2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const uint32_t *input_addr, half *output_addr) {
-  *output_addr = __uint2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const int32_t *input_addr, half *output_addr) {
-  *output_addr = __int2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const uint16_t *input_addr, half *output_addr) {
-  *output_addr = __ushort2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const int16_t *input_addr, half *output_addr) {
-  *output_addr = __short2half_rn((*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const uint8_t *input_addr, half *output_addr) {
-  *output_addr = __ushort2half_rn(static_cast<uint16_t>(*input_addr));
-}
-
-__device__ __forceinline__ void CastBase(const int8_t *input_addr, half *output_addr) {
-  *output_addr = __short2half_rn(static_cast<int16_t>(*input_addr));
-}
-
-// Cast
-template <typename S, typename T>
-__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) {
-    CastBase(input_addr + pos, output_addr + pos);
-  }
-}
-
-template <typename S, typename T>
-void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) {
-  CastKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_addr, output_addr);
-}
-
-template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int8_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const int16_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int16_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-
-template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int32_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-
-template void Cast(const int input_size, const int64_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const int64_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const uint8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint8_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const uint16_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint16_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const uint32_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint32_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const uint64_t *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const uint64_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const half *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const half *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const float *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const double *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const double *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const bool *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, Complex<float> *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const bool *input_addr, Complex<double> *output_addr, cudaStream_t stream);
-
-template void Cast(const int input_size, const Complex<float> *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<float> *input_addr, Complex<double> *output_addr,
-                   cudaStream_t stream);
-
-template void Cast(const int input_size, const Complex<double> *input_addr, int8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, int16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, int32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, int64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, uint8_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, uint16_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, uint32_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, uint64_t *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, float *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, double *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, half *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, bool *output_addr, cudaStream_t stream);
-template void Cast(const int input_size, const Complex<double> *input_addr, Complex<float> *output_addr,
-                   cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh
deleted file mode 100644
index c7eab4b0a81..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, float *scaling_out_addr,
-                   cudaStream_t cuda_stream);
-
-template <typename T>
-void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, const float *reduce_sum_value,
-                    float *output_addr, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu
deleted file mode 100755
index ca409ec126d..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh"
-template <typename T>
-__global__ void Concat(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
-                       int *len_axis, T **inputs, T *output) {
-  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int num = pos % all_size_before_axis / all_size_axis;
-    int block = -1;
-    int axis_inc = 0;
-    int block_len = 0;
-    for (int i = 0; i < input_num; i++) {
-      if (axis_inc <= num) {
-        block++;
-        axis_inc += len_axis[i];
-      } else {
-        break;
-      }
-    }
-    block_len = len_axis[block];
-    axis_inc -= len_axis[block];
-    int block_pos =
-      pos / all_size_before_axis * block_len * all_size_axis + (num - axis_inc) * all_size_axis + pos % all_size_axis;
-    output[pos] = inputs[block][block_pos];
-  }
-  return;
-}
-
-template <typename T>
-void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
-                  int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream) {
-  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, all_size_before_axis, all_size_axis,
-                                                            len_axis, inputs, output);
-  return;
-}
-
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, double **inputs, double *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, float **inputs, float *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, half **inputs, half *output,
-                           cudaStream_t cuda_stream);
-
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, int64_t **inputs, int64_t *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, int **inputs, int *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, short **inputs, short *output,  // NOLINT
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, char **inputs, char *output,
-                           cudaStream_t cuda_stream);
-
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, uint64_t **inputs, uint64_t *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, uint32_t **inputs, uint32_t *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, uint16_t **inputs, uint16_t *output,
-                           cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, unsigned char **inputs, unsigned char *output,
-                           cudaStream_t cuda_stream);
-
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
-                           const int all_size_axis, int *len_axis, bool **inputs, bool *output,
-                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh
deleted file mode 100644
index 17d8ba82723..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
-                     const size_t width, T *input_addr, T *outt_addr, cudaStream_t cuda_stream);
-
-template <typename T>
-void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
-                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);
-
-template <typename T>
-void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
-                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr, T *output_addr,
-                         cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh
deleted file mode 100644
index 176c063dc8e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_
-
-template <typename T>
-void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int batch_size, int channel_size,
-                      int height, int width, T* output, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int batch_size, int channel_size,
-                          int height, int width, T* d_gamma, T* tmp, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CORRECTIONMUL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh
deleted file mode 100644
index c3f31f29940..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalCropAndResize(const size_t size, const T *input_image, float *input_boxes, int *input_box_index, int batch,
-                      int input_height, int input_width, int final_height, int final_width, int channel,
-                      int method, float extrapol_val, float *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh
deleted file mode 100644
index 2440135fe42..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-// The batch size limit to judge whether to use multiple threads.
-constexpr int kLargeBatchLowLimit = 32768;
-
-template <typename T, typename S>
-void CrossEntropyWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *loss,
-                            cudaStream_t cuda_stream);
-
-template <typename T, typename S>
-void CrossEntropyGradWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num,
-                                T *grad, cudaStream_t cuda_stream);
-
-template <typename T, typename S>
-void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *losses,
-                  T *dlogits, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh
deleted file mode 100644
index 7e155ced56e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
-
-template <typename T>
-void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
-                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length,
-                     int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
-
-template <typename T>
-void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
-                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length,
-                     int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
-
-template <typename T>
-void InnerSoftMax(const T *probs, T *softmax_cost, const int *sequence_length, int max_time, int batch, int numclass,
-                  cudaStream_t stream);
-
-void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, int *cum_labels_length,
-                      int *max_labels_length, int batch, cudaStream_t stream);
-
-void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length,
-                       int *precum_labels_length, int *cum_labels_length, int batch, int blank, cudaStream_t stream);
-
-void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values,
-                   int *label_squence_length, int *cum_labels_length, int *max_labels_length, int size, int blank,
-                   int batch, cudaStream_t stream);
-
-void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
-                        int *max_labels_length, const int64_t *label_indices, int batch, int size, cudaStream_t stream);
-void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, cudaStream_t stream);
-template <typename T>
-void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, int SOffSet,
-             int maxtime, int numclass, const int *sequence_length, int *label_squence_length, int *cum_labels_length,
-             T *cost, T *grads, T *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
new file mode 100644
index 00000000000..d81c38e431b
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Builds the cuda_ops shared library from every .cu file found under this directory
+# (recursively) plus cuda_common.cc. The glob runs at configure time only, so CMake
+# has to be re-run after a .cu file is added or removed.
+file(GLOB_RECURSE CUDA_OPS_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cu")
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-delete-non-abstract-non-virtual-dtor -Wno-overloaded-virtual")
+endif()
+
+# Replace -std=c++17 in CMAKE_CXX_FLAGS: c++11 for CUDA toolkits older than 11.0,
+# c++14 otherwise. CUDA_VERSION is quoted so that an empty value cannot turn the
+# condition into a malformed if() expression.
+if("${CUDA_VERSION}" VERSION_LESS 11.0)
+    string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+else()
+    string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
+set_property(SOURCE ${CUDA_OPS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
+if(ENABLE_GPU)
+    # cuda_common.cc is built as a separate object library so it can be compiled with
+    # -std=c++17 while the rest of this directory uses the standard chosen above.
+    add_library(cuda_common_obj OBJECT cuda_common.cc)
+    target_compile_options(cuda_common_obj PRIVATE "-std=c++17")
+    # These objects are linked into a shared library, so request position-independent code.
+    set_target_properties(cuda_common_obj PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    cuda_add_library(cuda_ops SHARED ${CUDA_OPS_SRC_LIST} $<TARGET_OBJECTS:cuda_common_obj>)
+    message("add gpu lib to cuda_ops")
+    # Keep the keyword-less signature: cuda_add_library() links the CUDA runtime with the
+    # plain signature unless CUDA_LINK_LIBRARIES_KEYWORD is set, and CMake rejects mixing
+    # plain and PRIVATE/PUBLIC signatures on one target.
+    target_link_libraries(cuda_ops mindspore_core
+            ${CUDA_PATH}/lib64/libcurand.so
+            ${CUDNN_LIBRARY_PATH}
+            ${CUDA_PATH}/lib64/libcudart.so
+            ${CUDA_PATH}/lib64/stubs/libcuda.so
+            ${CUDA_PATH}/lib64/libcusolver.so
+            ${CUDA_PATH}/lib64/libcufft.so
+            ${CUDA_PATH}/lib64/libcublas.so)
+endif()
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu
similarity index 55%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu
index 0680867b722..765b73ac229 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __device__ __forceinline__ T SqrtFunc(T input) {
@@ -113,50 +114,50 @@ void ApplyAdagrad(const size_t size,
           size, update_slots, learning_rate, gradient, variable, accumulation);
 }
 
-template void ApplyAdagrad<float, float, float>(const size_t size,
-                                  const bool update_slots,
-                                  const float *learning_rate,
-                                  const float *gradient,
-                                  float *variable,
-                                  float *accumulation,
-                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, float>(const size_t size,
+                                                                const bool update_slots,
+                                                                const float *learning_rate,
+                                                                const float *gradient,
+                                                                float *variable,
+                                                                float *accumulation,
+                                                                cudaStream_t cuda_stream);
 
-template void ApplyAdagrad<half, half, half>(const size_t size,
-                                 const bool update_slots,
-                                 const half *learning_rate,
-                                 const half *gradient,
-                                 half *variable,
-                                 half *accumulation,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<half, half, half>(const size_t size,
+                                                             const bool update_slots,
+                                                             const half *learning_rate,
+                                                             const half *gradient,
+                                                             half *variable,
+                                                             half *accumulation,
+                                                             cudaStream_t cuda_stream);
 
-template void ApplyAdagrad<half, float, half>(const size_t size,
-                                 const bool update_slots,
-                                 const float *learning_rate,
-                                 const half *gradient,
-                                 half *variable,
-                                 half *accumulation,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, half>(const size_t size,
+                                                              const bool update_slots,
+                                                              const float *learning_rate,
+                                                              const half *gradient,
+                                                              half *variable,
+                                                              half *accumulation,
+                                                              cudaStream_t cuda_stream);
 
-template void ApplyAdagrad<float, float, half>(const size_t size,
-                                 const bool update_slots,
-                                 const float *learning_rate,
-                                 const half *gradient,
-                                 float *variable,
-                                 float *accumulation,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, half>(const size_t size,
+                                                               const bool update_slots,
+                                                               const float *learning_rate,
+                                                               const half *gradient,
+                                                               float *variable,
+                                                               float *accumulation,
+                                                               cudaStream_t cuda_stream);
 
-template void ApplyAdagrad<float, half, float>(const size_t size,
-                                 const bool update_slots,
-                                 const half *learning_rate,
-                                 const float *gradient,
-                                 float *variable,
-                                 float *accumulation,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<float, half, float>(const size_t size,
+                                                               const bool update_slots,
+                                                               const half *learning_rate,
+                                                               const float *gradient,
+                                                               float *variable,
+                                                               float *accumulation,
+                                                               cudaStream_t cuda_stream);
 
-template void ApplyAdagrad<half, float, float>(const size_t size,
-                                 const bool update_slots,
-                                 const float *learning_rate,
-                                 const float *gradient,
-                                 half *variable,
-                                 half *accumulation,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, float>(const size_t size,
+                                                               const bool update_slots,
+                                                               const float *learning_rate,
+                                                               const float *gradient,
+                                                               half *variable,
+                                                               half *accumulation,
+                                                               cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh
new file mode 100644
index 00000000000..7f73af3ef57
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S, typename G>  // T: variable/accumulation, S: learning_rate, G: gradient element type
+CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size,
+                                  const bool update_slots,
+                                  const S *learning_rate,
+                                  const G *gradient,
+                                  T *variable,
+                                  T *accumulation,
+                                  cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu
similarity index 69%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu
index 8c3859dbefd..e2bd92c3264 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __device__ __forceinline__ T SqrtFunc(T input) {
@@ -82,16 +83,19 @@ void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learni
                                                                            epsilon, decay, variable, m, v);
 }
 
-template void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
-                               const float *beta2_power, const float *learning_rate, const float *beta1,
-                               const float *beta2, const float *epsilon, float *variable, float *m, float *v,
-                               cudaStream_t cuda_stream);
-template void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power, const half *beta2_power,
-                              const half *learning_rate, const half *beta1, const half *beta2, const half *epsilon,
-                              half *variable, half *m, half *v, cudaStream_t cuda_stream);
-template void AdamWeightDecayOp<float>(const size_t size, const float *gradient, const float *learning_rate,
-                                       const float *beta1, const float *beta2, const float *epsilon, const float *decay,
-                                       float *variable, float *m, float *v, cudaStream_t cuda_stream);
-template void AdamWeightDecayOp<half>(const size_t size, const half *gradient, const float *learning_rate,
-                                      const float *beta1, const float *beta2, const float *epsilon, const float *decay,
-                                      half *variable, half *m, half *v, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
+                                               const float *beta2_power, const float *learning_rate, const float *beta1,
+                                               const float *beta2, const float *epsilon, float *variable, float *m,
+                                               float *v, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power,
+                                              const half *beta2_power, const half *learning_rate, const half *beta1,
+                                              const half *beta2, const half *epsilon, half *variable, half *m, half *v,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AdamWeightDecayOp<float>(const size_t size, const float *gradient,
+                                                       const float *learning_rate, const float *beta1,
+                                                       const float *beta2, const float *epsilon, const float *decay,
+                                                       float *variable, float *m, float *v, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AdamWeightDecayOp<half>(const size_t size, const half *gradient,
+                                                      const float *learning_rate, const float *beta1,
+                                                      const float *beta2, const float *epsilon, const float *decay,
+                                                      half *variable, half *m, half *v, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh
new file mode 100644
index 00000000000..ab42e3f250d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // adam_impl.cu explicitly instantiates T = float and T = half
+CUDA_LIB_EXPORT void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power,
+                               const T *learning_rate, const T *beta1, const T *beta2, const T *epsilon, T *variable,
+                               T *m, T *v, cudaStream_t cuda_stream);
+template <typename T>  // T = float or half in adam_impl.cu; the hyper-parameter pointers are float for every T
+CUDA_LIB_EXPORT void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate,
+                                       const float *beta1, const float *beta2, const float *epsilon, const float *decay,
+                                       T *variable, T *m, T *v, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu
similarity index 80%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu
index 7eae29a155b..21425267fba 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu
@@ -15,7 +15,6 @@
  */
 
 #include "adam_weight_decay_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void AdamWeightDecayKernel(const int element_num_, const bool need_decay, const float *beta1,
@@ -44,7 +43,8 @@ void AdamWeightDecay(const int &element_num_, const bool &need_decay, const floa
     gradient);
 }
 
-template void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
-                              const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2,
-                              const float *epsilon, const float *lr, const float *weight_decay, float *m, float *v,
-                              float *param, float *gradient, cudaStream_t stream);
+template CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
+                                              const float *one_sub_beta1, const float *beta2,
+                                              const float *one_sub_beta2, const float *epsilon, const float *lr,
+                                              const float *weight_decay, float *m, float *v, float *param,
+                                              float *gradient, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh
new file mode 100644
index 00000000000..7e52876a3e5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // adam_weight_decay_impl.cu provides the explicit instantiation for T = float
+CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
+                                     const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2,
+                                     const float *epsilon, const float *lr, const float *weight_decay, T *m, T *v,
+                                     T *param, T *gradient, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu
similarity index 82%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu
index 0b3c3f92646..275cda84ecf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 __device__ inline uint start_index(uint a, uint b, uint c) {
   return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
@@ -168,14 +169,17 @@ void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const
     size, input_height, input_width, output_height, output_width, input_data, output_data);
 }
 
-template void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height, const uint input_width,
-                                                const uint output_height, const uint output_width, float *input_data,
-                                                float *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height,
+                                                                const uint input_width, const uint output_height,
+                                                                const uint output_width, float *input_data,
+                                                                float *output_data, cudaStream_t cuda_stream);
 
-template void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height, const uint input_width,
-                                               const uint output_height, const uint output_width, half *input_data,
-                                               half *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height,
+                                                               const uint input_width, const uint output_height,
+                                                               const uint output_width, half *input_data,
+                                                               half *output_data, cudaStream_t cuda_stream);
 
-template void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height, const uint input_width,
-                                                 const uint output_height, const uint output_width, double *input_data,
-                                                 double *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height,
+                                                                 const uint input_width, const uint output_height,
+                                                                 const uint output_width, double *input_data,
+                                                                 double *output_data, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh
new file mode 100644
index 00000000000..d9bb7c22aea
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // explicitly instantiated for float, half and double in adaptive_avg_pool2d_grad_impl.cu
+CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width,
+                                                const uint output_height, const uint output_width, T *input_data,
+                                                T *output_data, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu
similarity index 81%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu
index e93af42a28f..11cb4b1cfa4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh"
+#include "include/cuda_fp16.h"
 
 __device__ inline uint start_index(uint a, uint b, uint c) {
   return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
@@ -155,14 +156,17 @@ void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint
     size, input_height, input_width, output_height, output_width, input_data, output_data);
 }
 
-template void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height, const uint input_width,
-                                            const uint output_height, const uint output_width, float *input_data,
-                                            float *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height,
+                                                            const uint input_width, const uint output_height,
+                                                            const uint output_width, float *input_data,
+                                                            float *output_data, cudaStream_t cuda_stream);
 
-template void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height, const uint input_width,
-                                           const uint output_height, const uint output_width, half *input_data,
-                                           half *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height,
+                                                           const uint input_width, const uint output_height,
+                                                           const uint output_width, half *input_data,
+                                                           half *output_data, cudaStream_t cuda_stream);
 
-template void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height, const uint input_width,
-                                             const uint output_height, const uint output_width, double *input_data,
-                                             double *output_data, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height,
+                                                             const uint input_width, const uint output_height,
+                                                             const uint output_width, double *input_data,
+                                                             double *output_data, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh
new file mode 100644
index 00000000000..b6ea4787f93
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // explicitly instantiated for float, half and double in adaptive_avg_pool2d_impl.cu
+CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width,
+                                            const uint output_height, const uint output_width, T *input_data,
+                                            T *output_data, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVG_POOL2D_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh
index a01a40443be..2eb9ac5ede7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh
@@ -14,12 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask,
+                               cudaStream_t cuda_stream);
 
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S>
-void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs,
+template <typename T>  // NOTE(review): repeats the declarations in add_relu_v2_impl.cuh; confirm both headers are needed
+CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *mask, T *dx,
                                    cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu
index e55e15ea6c3..d063133bcd7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 __global__ void AddReluV2Kernel(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask) {
@@ -49,20 +49,20 @@ void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *m
   AddReluGradV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, x1, x2, mask, dx);
 }
 
-template void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
-                        cudaStream_t cuda_stream);
-template void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
-                        cudaStream_t cuda_stream);
-template void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y, uint32_t *mask,
-                        cudaStream_t cuda_stream);
-template void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y, uint32_t *mask,
-                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
+                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
+                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y,
+                                        uint32_t *mask, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y,
+                                        uint32_t *mask, cudaStream_t cuda_stream);
 
-template void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask, float *dx,
-                            cudaStream_t cuda_stream);
-template void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask, half *dx,
-                            cudaStream_t cuda_stream);
-template void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2, const uint32_t *mask, int32_t *dx,
-                            cudaStream_t cuda_stream);
-template void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2, const uint32_t *mask, int64_t *dx,
-                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask,
+                                            float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask,
+                                            half *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2,
+                                            const uint32_t *mask, int32_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2,
+                                            const uint32_t *mask, int64_t *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh
new file mode 100644
index 00000000000..19af5ceb0bf
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // add_relu_v2_impl.cu instantiates T = float, half, int32_t and int64_t
+CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask,
+                               cudaStream_t cuda_stream);
+
+template <typename T>  // instantiated for the same four element types as AddReluV2
+CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *mask, T *dx,
+                                   cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu
similarity index 66%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu
index 4f3489bf2ed..37abea756be 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void ApplyGradientDescent(const size_t size, T *var, const T *alpha, const T *delta, T *output) {
@@ -31,7 +32,8 @@ void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T
   ApplyGradientDescent<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, var, alpha, delta, output);
 }
 
-template void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha, const float *delta,
-                                             float *output, cudaStream_t cuda_stream);
-template void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha, const half *delta,
-                                            half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha,
+                                                             const float *delta, float *output,
+                                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha,
+                                                            const half *delta, half *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh
index 48d61d63b81..19bb5afcff8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
+// Host-side entry point that enqueues the ApplyGradientDescent kernel on `cuda_stream` for `size` elements.
+// apply_gradient_descent_impl.cu explicitly instantiates and exports it for T = float and T = half only.
+// NOTE(review): the kernel body is not visible from this header - confirm the exact update rule in the .cu file.
 template <typename T>
-void CalHSigmoid(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta,
+                                             T *output, cudaStream_t cuda_stream);
 
-template <typename T>
-void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu
similarity index 85%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu
index 7f7eb415bd8..9fceaa6e3f7 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu
@@ -15,8 +15,6 @@
  */
 
 #include "argmax_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "include/cuda_fp16.h"
 template <typename T, typename S>
 __global__ void Argmax(const T *input, const S bound, const size_t outer_size,
                        const size_t inner_size, S *output) {
@@ -46,7 +44,9 @@ void CalArgmax(const T *input, const S bound, const size_t outer_size, const siz
   return;
 }
 
-template void CalArgmax<float, int>(const float *input, const int bound, const size_t outer_size,
+template
+CUDA_LIB_EXPORT void CalArgmax<float, int>(const float *input, const int bound, const size_t outer_size,
                                     const size_t inner_size, int *output, cudaStream_t cuda_stream);
-template void CalArgmax<half, int>(const half *input, const int bound, const size_t outer_size,
+template
+CUDA_LIB_EXPORT void CalArgmax<half, int>(const half *input, const int bound, const size_t outer_size,
                                    const size_t inner_size, int *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh
new file mode 100755
index 00000000000..8cee0638634
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_
+#include "include/cuda_fp16.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// C-linkage (unmangled) entry points for float and half input, both producing int indices.
+// NOTE(review): only the declarations are visible here - confirm that argmax_impl.cu defines both symbols,
+// otherwise any caller of these two functions will fail at link time.
+#ifdef __cplusplus
+extern "C" {
+#endif
+CUDA_LIB_EXPORT void CalArgmaxFp32(const float *input, const int bound, const size_t outer_size,
+                                   const size_t inner_size, int *output, cudaStream_t cuda_stream);
+
+CUDA_LIB_EXPORT void CalArgmaxFp16(const half *input, const int bound, const size_t outer_size,
+                                   const size_t inner_size, int *output, cudaStream_t cuda_stream);
+#ifdef __cplusplus
+}
+#endif
+
+// Templated entry point. argmax_impl.cu explicitly instantiates and exports it for <T, S> = <float, int> and
+// <half, int>; any other combination has no definition in the shared library.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size,
+                               S *output, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu
similarity index 60%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu
index 0be7e20137c..5953c5248ae 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu
@@ -15,8 +15,6 @@
  */
 
 #include "assign_add_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "include/cuda_fp16.h"
 template <typename T>
 __global__ void AssignAdd(const size_t size, T* ref, const T* value, T* output) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
@@ -33,10 +31,11 @@ void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStre
   return;
 }
 
-template void CalAssignAdd<float>(const size_t size, float* ref, const float* value, float* output,
-                                  cudaStream_t cuda_stream);
-template void CalAssignAdd<half>(const size_t size, half* ref, const half* value, half* output,
-                                 cudaStream_t cuda_stream);
-template void CalAssignAdd<int>(const size_t size, int* ref, const int* value, int* output, cudaStream_t cuda_stream);
-template void CalAssignAdd<int64_t>(const size_t size, int64_t* ref, const int64_t* value, int64_t* output,
-                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalAssignAdd<float>(const size_t size, float* ref, const float* value, float* output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalAssignAdd<half>(const size_t size, half* ref, const half* value, half* output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalAssignAdd<int>(const size_t size, int* ref, const int* value, int* output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalAssignAdd<int64_t>(const size_t size, int64_t* ref, const int64_t* value,
+                                                    int64_t* output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh
index b095384aced..d4dd0e64484 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh
@@ -14,9 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "include/cuda_fp16.h"
+
+// Host-side counterpart of the AssignAdd kernel defined in assign_add_impl.cu, which explicitly instantiates and
+// exports it for T = float, half, int and int64_t. NOTE(review): the kernel launch is not visible here - confirm.
 template <typename T>
-void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu
index 3ef856e00af..39ebbc88411 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu
@@ -109,10 +109,11 @@ void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *b
     x, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, y, freeze_bn, N, C, H, W);
 }
 
-template void BatchNormFold2Forward<float>(const float *x, const float *beta, const float *gamma,
-                                           const float *batch_std, const float *batch_mean, const float *running_std,
-                                           const float *running_mean, const int *global_step, float *y, int freeze_bn,
-                                           size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void BatchNormFold2Forward<float>(const float *x, const float *beta, const float *gamma,
+                                                           const float *batch_std, const float *batch_mean,
+                                                           const float *running_std, const float *running_mean,
+                                                           const int *global_step, float *y, int freeze_bn, size_t N,
+                                                           size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
 
 template <typename T>
 void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N,
@@ -124,9 +125,10 @@ void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *r
   BatchNormFold2GradReduce2<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(tmp, d_beta, tmp2, reduce_x, N, C);
 }
 
-template void BatchNormFold2GradReduce<float>(const float *dout, const float *x, float *d_beta, float *tmp,
-                                              float *reduce_x, float *tmp2, float *tmp_x, size_t N, size_t C, size_t H,
-                                              size_t W, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void BatchNormFold2GradReduce<float>(const float *dout, const float *x, float *d_beta,
+                                                              float *tmp, float *reduce_x, float *tmp2, float *tmp_x,
+                                                              size_t N, size_t C, size_t H, size_t W,
+                                                              cudaStream_t cuda_stream);
 
 template <typename T>
 void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
@@ -136,11 +138,12 @@ void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T
     d_beta, reduce_x, batch_mean, batch_std, running_mean, running_std, gamma, d_gamma, d_batch_mean, d_batch_std, C);
 }
 
-template void CalBatchNormFold2GradNotFreeze<float>(const float *d_beta, const float *reduce_x, const float *batch_mean,
-                                                    const float *batch_std, const float *running_mean,
-                                                    const float *running_std, const float *gamma, float *d_gamma,
-                                                    float *d_batch_mean, float *d_batch_std, size_t C,
-                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreeze<float>(const float *d_beta, const float *reduce_x,
+                                                                    const float *batch_mean, const float *batch_std,
+                                                                    const float *running_mean, const float *running_std,
+                                                                    const float *gamma, float *d_gamma,
+                                                                    float *d_batch_mean, float *d_batch_std, size_t C,
+                                                                    cudaStream_t cuda_stream);
 
 template <typename T>
 void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
@@ -152,11 +155,12 @@ void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *ba
   ThrustFillWith(d_batch_std, C, (T)0.f, cuda_stream);
 }
 
-template void CalBatchNormFold2GradFreeze<float>(const float *d_beta, const float *reduce_x, const float *batch_mean,
-                                                 const float *batch_std, const float *running_mean,
-                                                 const float *running_std, const float *gamma, float *d_gamma,
-                                                 float *d_batch_mean, float *d_batch_std, size_t C,
-                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormFold2GradFreeze<float>(const float *d_beta, const float *reduce_x,
+                                                                 const float *batch_mean, const float *batch_std,
+                                                                 const float *running_mean, const float *running_std,
+                                                                 const float *gamma, float *d_gamma,
+                                                                 float *d_batch_mean, float *d_batch_std, size_t C,
+                                                                 cudaStream_t cuda_stream);
 
 template <typename T>
 void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H,
@@ -164,6 +168,7 @@ void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_st
   DxMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N, C, H * W, batch_std, running_std, d_x);
 }
 
-template void CalBatchNormFold2GradNotFreezeDxMul<float>(const float *batch_std, const float *running_std, float *d_x,
-                                                         size_t N, size_t C, size_t H, size_t W,
-                                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreezeDxMul<float>(const float *batch_std,
+                                                                         const float *running_std, float *d_x,
+                                                                         size_t N, size_t C, size_t H, size_t W,
+                                                                         cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh
new file mode 100644
index 00000000000..955d18b7dd5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Host-side launchers for the BatchNormFold2 CUDA kernels. batchnorm_fold2_impl.cu explicitly instantiates and
+// exports each of them for T = float only.
+
+// Forward pass. The kernel receives (x, beta, gamma, batch_std, batch_mean, running_std, running_mean,
+// global_step, y, freeze_bn, N, C, H, W) unchanged.
+template <typename T>
+CUDA_LIB_EXPORT void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std,
+                                           const T *batch_mean, const T *running_std, const T *running_mean,
+                                           const int *global_step, T *y, int freeze_bn, size_t N, size_t C, size_t H,
+                                           size_t W, cudaStream_t cuda_stream);
+
+// NotFreeze variant of the gradient step. The kernel receives (d_beta, reduce_x, batch_mean, batch_std,
+// running_mean, running_std, gamma, d_gamma, d_batch_mean, d_batch_std, C) unchanged.
+template <typename T>
+CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean,
+                                                    const T *batch_std, const T *running_mean, const T *running_std,
+                                                    const T *gamma, T *d_gamma, T *d_batch_mean, T *d_batch_std,
+                                                    size_t C, cudaStream_t cuda_stream);
+
+// Freeze variant of the gradient step. Its last action fills d_batch_std[0, C) with zero through ThrustFillWith
+// on `cuda_stream`.
+template <typename T>
+CUDA_LIB_EXPORT void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean,
+                                                 const T *batch_std, const T *running_mean, const T *running_std,
+                                                 const T *gamma, T *d_gamma, T *d_batch_mean, T *d_batch_std, size_t C,
+                                                 cudaStream_t cuda_stream);
+
+// Gradient reduction. Its final stage launches BatchNormFold2GradReduce2 with GET_BLOCKS(C) blocks on
+// `cuda_stream`, passing (tmp, d_beta, tmp2, reduce_x, N, C).
+template <typename T>
+CUDA_LIB_EXPORT void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2,
+                                              T *tmp_x, size_t N, size_t C, size_t H, size_t W,
+                                              cudaStream_t cuda_stream);
+
+// Launches the DxMul kernel over N * C * H * W elements on `cuda_stream` with arguments
+// (N, C, H * W, batch_std, running_std, d_x).
+template <typename T>
+CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N,
+                                                         size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu
similarity index 78%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu
index 54f1c11ab9f..1dafd8b87c6 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu
@@ -18,7 +18,6 @@
 #include <thrust/fill.h>
 #include <thrust/system/cuda/execution_policy.h>
 #include "batchnorm_fold_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void UpdateRunningStd(int channel_size, const double epsilon, T* running_std) {
@@ -55,8 +54,8 @@ void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaS
   return;
 }
 
-template void CalUpdateRunningStd<float>(int channel_size, double epsilon, float* running_std,
-                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalUpdateRunningStd<float>(int channel_size, double epsilon, float* running_std,
+                                                         cudaStream_t cuda_stream);
 
 template <typename T>
 void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream) {
@@ -64,7 +63,7 @@ void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream)
   return;
 }
 
-template void CalUpdateBatchStd<float>(int channel_size, float* batch_std, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalUpdateBatchStd<float>(int channel_size, float* batch_std, cudaStream_t cuda_stream);
 
 template <typename T>
 void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,
@@ -74,9 +73,10 @@ void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T*
     d_batch_mean, d_batch_std, x, batch_mean, batch_std, batch_size, channel_size, height, width, dx);
 }
 
-template void CalBatchNormFoldGrad<float>(const float* d_batch_mean, const float* d_batch_std, const float* x,
-                                          const float* batch_mean, const float* batch_std, int batch_size,
-                                          int channel_size, int height, int width, float* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormFoldGrad<float>(const float* d_batch_mean, const float* d_batch_std,
+                                                          const float* x, const float* batch_mean,
+                                                          const float* batch_std, int batch_size, int channel_size,
+                                                          int height, int width, float* dx, cudaStream_t cuda_stream);
 
 template <typename T>
 void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) {
@@ -84,5 +84,5 @@ void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) {
   thrust::fill(thrust::cuda::par.on(cuda_stream), dev_ptr, dev_ptr + size, tofill);
 }
 
-template void ThrustFillWith<float>(float* array, int size, float tofill, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ThrustFillWith<float>(float* array, int size, float tofill, cudaStream_t cuda_stream);
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh
new file mode 100755
index 00000000000..5c02d9aedc4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Host-side helpers for the BatchNormFold CUDA kernels. batchnorm_fold_impl.cu explicitly instantiates and
+// exports each of them for T = float only.
+
+// Counterpart of the UpdateRunningStd kernel, which takes (channel_size, epsilon, running_std).
+// NOTE(review): the launch itself is not visible here - confirm the grid configuration in the .cu file.
+template <typename T>
+CUDA_LIB_EXPORT void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream);
+
+// NOTE(review): presumably rewrites batch_std in place for channel_size entries; the kernel is not visible from
+// this header - confirm the exact formula in the .cu file.
+template <typename T>
+CUDA_LIB_EXPORT void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream);
+
+// Gradient helper. The kernel receives (d_batch_mean, d_batch_std, x, batch_mean, batch_std, batch_size,
+// channel_size, height, width, dx) unchanged.
+template <typename T>
+CUDA_LIB_EXPORT void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,
+                                          const T* batch_std, int batch_size, int channel_size, int height, int width,
+                                          T* dx, cudaStream_t cuda_stream);
+
+// Fills `size` consecutive elements with `tofill` through thrust::fill run under thrust::cuda::par.on(cuda_stream).
+// NOTE(review): the device pointer is presumably derived from `array`; that line is not visible here.
+template <typename T>
+CUDA_LIB_EXPORT void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu
similarity index 82%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu
index dba71d8f693..f62541422c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu
@@ -21,6 +21,7 @@
 #include <thrust/system/cuda/execution_policy.h>
 #include "batchnorm_grad_impl.cuh"
 #include "include/cuda_runtime.h"
+#include "include/cuda_fp16.h"
 
 const int kWarpSize = 32;
 const int kBlockSize = 1024;
@@ -111,10 +112,12 @@ void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_v
                                                          epsilon, N, C, H, W);
 }
 
-template void CalBatchNormGrad<float>(float *x, float *dy, float *scale, float *save_mean, float *save_variance,
-                                      float *dx, float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H,
-                                      int W, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormGrad<float>(float *x, float *dy, float *scale, float *save_mean,
+                                                      float *save_variance, float *dx, float *bn_scale, float *bn_bias,
+                                                      double epsilon, int N, int C, int H, int W,
+                                                      cudaStream_t cuda_stream);
 
-template void CalBatchNormGrad<half>(half *x, half *dy, float *scale, float *save_mean, float *save_variance, half *dx,
-                                     float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, int W,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchNormGrad<half>(half *x, half *dy, float *scale, float *save_mean,
+                                                     float *save_variance, half *dx, float *bn_scale, float *bn_bias,
+                                                     double epsilon, int N, int C, int H, int W,
+                                                     cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh
new file mode 100644
index 00000000000..c7edd82b2e2
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Host-side launcher for the batch-norm gradient kernel. x, dy and dx use the element type T, while scale,
+// save_mean, save_variance, bn_scale and bn_bias are float buffers regardless of T.
+// batchnorm_grad_impl.cu explicitly instantiates and exports it for T = float and T = half only.
+template <typename T>
+CUDA_LIB_EXPORT void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_variance, T *dx,
+                                      float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, int W,
+                                      cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu
new file mode 100644
index 00000000000..aaea0c0bc89
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu
@@ -0,0 +1,139 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuda_runtime.h>
+#include "batchtospace_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// BatchToSpace kernel: writes output[pos] for every flat output index pos < size.
+//   * pos is split into (idx_on, idx_oc, idx_oh, idx_ow) with oc * oh * ow as the outermost stride, i.e. the
+//     output is addressed batch first, then channel, then row, then column.
+//   * The source element is read at the flat input index ((idx_in * ic + idx_oc) * ih + idx_ih) * iw + idx_iw.
+//   * crop_up / crop_lft shift the output row / column before block_num picks the input batch and position.
+//   * Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by blockDim.x * gridDim.x.
+// `in`, `crop_dn` and `crop_rht` are part of the signature but are never read by the body.
+// NOTE(review): the computed input index is not range-checked; callers must pass mutually consistent shapes.
+template <typename T>
+__global__ void BatchToSpace(const size_t size, const T *input,
+                             const size_t in, const size_t ih, const size_t iw, const size_t ic,
+                             const size_t on, const size_t oh, const size_t ow, const size_t oc,
+                             const size_t crop_up, const size_t crop_dn,
+                             const size_t crop_lft, const size_t crop_rht,
+                             const size_t block_num, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    // Strides are derived by successive division, exactly as the flat index is peeled apart below.
+    const size_t stride_n = oc * oh * ow;
+    const size_t stride_c = stride_n / oc;
+    const size_t stride_h = stride_c / oh;
+    const size_t stride_w = stride_h / ow;
+    const size_t idx_on = pos / stride_n;
+    const size_t idx_oc = (pos % stride_n) / stride_c;
+    const size_t idx_oh = (pos % stride_c) / stride_h;
+    const size_t idx_ow = (pos % stride_h) / stride_w;
+
+    // Output coordinates shifted by the top / left crop, and their position inside a block.
+    const size_t row = idx_oh + crop_up;
+    const size_t col = idx_ow + crop_lft;
+    const size_t block_row = row % block_num;
+    const size_t block_col = col % block_num;
+
+    // Input batch index, then the input row / column obtained by removing the block offset.
+    const size_t idx_in = (block_row * block_num + block_col) * on + idx_on;
+    const size_t idx_ih = (row - idx_in / (on * block_num)) / block_num;
+    const size_t idx_iw = (col - (idx_in / on) % block_num) / block_num;
+
+    // Flatten (idx_in, idx_oc, idx_ih, idx_iw) into the input offset.
+    output[pos] = input[((idx_in * ic + idx_oc) * ih + idx_ih) * iw + idx_iw];
+  }
+}
+
+// Enqueues the BatchToSpace kernel with the launch configuration <<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>;
+// every shape, crop and block argument is forwarded to the kernel unchanged.
+// The launch is neither synchronised nor checked for errors here.
+template <typename T>
+void CalBatchToSpace(const size_t size, const T *input, const size_t in, const size_t ih, const size_t iw,
+                     const size_t ic, const size_t on, const size_t oh, const size_t ow, const size_t oc,
+                     const size_t crop_up, const size_t crop_dn, const size_t crop_lft, const size_t crop_rht,
+                     const size_t block_num, T *output, cudaStream_t cuda_stream) {
+  BatchToSpace<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    size, input, in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output);
+}
+// Explicit instantiations of CalBatchToSpace, one per element type listed below; each is marked CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void CalBatchToSpace<float>(const size_t size, const float *input, const size_t in,
+                                                     const size_t ih, const size_t iw, const size_t ic,
+                                                     const size_t on, const size_t oh, const size_t ow,
+                                                     const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                     const size_t crop_lft, const size_t crop_rht,
+                                                     const size_t block_num, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<half>(const size_t size, const half *input, const size_t in,
+                                                    const size_t ih, const size_t iw, const size_t ic,
+                                                    const size_t on, const size_t oh, const size_t ow,
+                                                    const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                    const size_t crop_lft, const size_t crop_rht,
+                                                    const size_t block_num, half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<int>(const size_t size, const int *input, const size_t in,
+                                                   const size_t ih, const size_t iw, const size_t ic,
+                                                   const size_t on, const size_t oh, const size_t ow,
+                                                   const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                   const size_t crop_lft, const size_t crop_rht, const size_t block_num,
+                                                   int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<int64_t>(const size_t size, const int64_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                       const size_t crop_lft, const size_t crop_rht,
+                                                       const size_t block_num, int64_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<int16_t>(const size_t size, const int16_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                       const size_t crop_lft, const size_t crop_rht,
+                                                       const size_t block_num, int16_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<int8_t>(const size_t size, const int8_t *input, const size_t in,
+                                                      const size_t ih, const size_t iw, const size_t ic,
+                                                      const size_t on, const size_t oh, const size_t ow,
+                                                      const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                      const size_t crop_lft, const size_t crop_rht,
+                                                      const size_t block_num, int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                       const size_t crop_lft, const size_t crop_rht,
+                                                       const size_t block_num, uint8_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                        const size_t crop_lft, const size_t crop_rht,
+                                                        const size_t block_num, uint16_t *output,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                        const size_t crop_lft, const size_t crop_rht,
+                                                        const size_t block_num, uint32_t *output,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBatchToSpace<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                                        const size_t crop_lft, const size_t crop_rht,
+                                                        const size_t block_num, uint64_t *output,
+                                                        cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh
new file mode 100644
index 00000000000..47433e51d73
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Host entry point that enqueues the BatchToSpace kernel; defined in batchtospace_impl.cu.
+CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const T *input, const size_t in,  // size: output elements
+                                     const size_t ih, const size_t iw, const size_t ic,  // input H, W, C extents
+                                     const size_t on, const size_t oh, const size_t ow,  // output N, H, W extents
+                                     const size_t oc, const size_t crop_up, const size_t crop_dn,
+                                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
+                                     T *output, cudaStream_t cuda_stream);  // in, crop_dn, crop_rht: unused by kernel
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu
similarity index 76%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu
index 8a2d6b0edc8..58731e05175 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh"
+#include "include/cuda_fp16.h"
 
 __device__ __forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; }
 
@@ -114,15 +115,18 @@ void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *ta
   return;
 }
 
-template void CalBCEWithLogitsLoss<half>(const size_t input_size, const half *predict, const half *target,
-                                         const size_t *input_shape, const size_t shape_size, const half *weight,
-                                         const size_t *weight_shape, const bool weight_need_broadcast,
-                                         const half *pos_weight, const size_t *pos_weight_shape,
-                                         const bool pos_weight_need_broadcast, half *shape_broadcasted, half *output,
-                                         cudaStream_t cuda_stream);
-template void CalBCEWithLogitsLoss<float>(const size_t input_size, const float *predict, const float *target,
-                                          const size_t *input_shape, const size_t shape_size, const float *weight,
-                                          const size_t *weight_shape, const bool weight_need_broadcast,
-                                          const float *pos_weight, const size_t *pos_weight_shape,
-                                          const bool pos_weight_need_broadcast, float *shape_broadcasted, float *output,
-                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss<half>(const size_t input_size, const half *predict,
+                                                         const half *target, const size_t *input_shape,
+                                                         const size_t shape_size, const half *weight,
+                                                         const size_t *weight_shape, const bool weight_need_broadcast,
+                                                         const half *pos_weight, const size_t *pos_weight_shape,
+                                                         const bool pos_weight_need_broadcast, half *shape_broadcasted,
+                                                         half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss<float>(const size_t input_size, const float *predict,
+                                                          const float *target, const size_t *input_shape,
+                                                          const size_t shape_size, const float *weight,
+                                                          const size_t *weight_shape, const bool weight_need_broadcast,
+                                                          const float *pos_weight, const size_t *pos_weight_shape,
+                                                          const bool pos_weight_need_broadcast,
+                                                          float *shape_broadcasted, float *output,
+                                                          cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh
new file mode 100644
index 00000000000..531816f836e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define MAX_LOGITS_DIMENSION 8
+// Declaration only: defined in bce_with_logits_loss_impl.cu, which instantiates it for half and float.
+template <typename T>
+CUDA_LIB_EXPORT void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target,
+                                          const size_t *input_shape, const size_t shape_size, const T *weight,
+                                          const size_t *weight_shape, const bool weight_need_broadcast,
+                                          const T *pos_weight, const size_t *pos_weight_shape,
+                                          const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output,
+                                          cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu
similarity index 86%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu
index 5ae02f0198a..53a1c8401d9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu
@@ -17,9 +17,8 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh"
 
 const int kWarpSize = 32;
 // tuning param, for those nhw >= kLargeSize, launch more blocks to solve
@@ -165,11 +164,13 @@ void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
   return;
 }
 
-template void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
-                                 const float*  dy, float*  db, cudaStream_t cuda_stream);
-template void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
-                                 const half*  dy, half*  db, cudaStream_t cuda_stream);
-template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
-                                 const float*  dy, float*  db,  cudaStream_t cuda_stream);
-template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half*  dy,
-                                 half*  db,  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size,  // T = float (deduced)
+                                                 const int height, const int width,
+                                                 const float*  dy, float*  db, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size,  // T = half (deduced)
+                                                 const int height, const int width,
+                                                 const half*  dy, half*  db, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,  // T = float (deduced)
+                                                 const float*  dy, float*  db,  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half*  dy,
+                                                 half*  db,  cudaStream_t cuda_stream);  // T = half (deduced)
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh
new file mode 100644
index 00000000000..b95060aa154
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Defined in bias_add_grad_impl.cu; instantiated there for float and half.
+CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
+                                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
+template <typename T>  // Defined in bias_add_grad_impl.cu; instantiated there for float and half.
+CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
+                                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu
similarity index 79%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu
index ef6e4575d0f..75191547168 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh"
 
 template <typename T>
 __global__ void BoundingBoxDecodeKernel(const size_t size, const T *rois, const T *deltas, T *bboxes, const float m1,
@@ -74,8 +74,11 @@ void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bbo
                                                                              ratio_clip);
 }
 
-template void BoundingBoxDecode<float>(const size_t size, const float *rois, const float *deltas, float *bboxes,
-                                       const float &m1, const float &m2, const float &m3, const float &m4,
-                                       const float &s1, const float &s2, const float &s3, const float &s4,
-                                       const int &max_height, const int &max_width, const float &ratio_clip,
-                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void BoundingBoxDecode<float>(const size_t size, const float *rois, const float *deltas,
+                                                       float *bboxes,
+                                                       const float &m1, const float &m2,
+                                                       const float &m3, const float &m4,
+                                                       const float &s1, const float &s2,
+                                                       const float &s3, const float &s4,
+                                                       const int &max_height, const int &max_width,
+                                                       const float &ratio_clip, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh
new file mode 100644
index 00000000000..bb887299582
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Defined in boundingbox_decode_impl.cu, which instantiates it for float.
+CUDA_LIB_EXPORT void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes,
+                                       const float &m1, const float &m2, const float &m3, const float &m4,
+                                       const float &s1, const float &s2, const float &s3, const float &s4,
+                                       const int &max_height, const int &max_width, const float &ratio_clip,
+                                       cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu
similarity index 78%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu
index 155c7fe6936..927de52b0f0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh"
 
 template <typename T>
 __global__ void BoundingBoxEncodeKernel(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas,
@@ -56,7 +56,10 @@ void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtr
                                                                              m1, m2, m3, m4, s1, s2, s3, s4);
 }
 
-template void BoundingBoxEncode<float>(const size_t size, const float *anchor_box, const float *groundtruth_box,
-                                       float *deltas, const float &m1, const float &m2, const float &m3,
-                                       const float &m4, const float &s1, const float &s2, const float &s3,
-                                       const float &s4, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void BoundingBoxEncode<float>(const size_t size, const float *anchor_box,
+                                                       const float *groundtruth_box, float *deltas,
+                                                       const float &m1, const float &m2,
+                                                       const float &m3, const float &m4,
+                                                       const float &s1, const float &s2,
+                                                       const float &s3, const float &s4,
+                                                       cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh
new file mode 100644
index 00000000000..c7322f87c30
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Defined in boundingbox_encode_impl.cu, which instantiates it for float.
+CUDA_LIB_EXPORT void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas,
+                                       const float &m1, const float &m2, const float &m3, const float &m4,
+                                       const float &s1, const float &s2, const float &s3, const float &s4,
+                                       cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu
index 2c751d7f438..47d2a4c6581 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu
@@ -14,9 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 struct MinimumGradFunc {
@@ -113,37 +112,48 @@ void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
   NoBroadcastGradKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, grad_x1, grad_x2, op, x1, x2, dy, dx1, dx2);
 }
 
-template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                              const double *x1, const double *x2, const double *dy, double *dx1, double *dx2,
-                              cudaStream_t stream);
-template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                              const float *x1, const float *x2, const float *dy, float *dx1, float *dx2,
-                              cudaStream_t stream);
-template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                              const int *x1, const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream);
-template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                              const half *x1, const half *x2, const half *dy, half *dx1, half *dx2,
-                              cudaStream_t stream);
-template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
-                              const int64_t *x1, const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2,
-                              cudaStream_t stream);
-template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const double *x1,
-                            const double *x2, const double *dy, double *dx1, double *dx2, cudaStream_t stream);
-template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const float *x1,
-                            const float *x2, const float *dy, float *dx1, float *dx2, cudaStream_t stream);
-template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int *x1,
-                            const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream);
-template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const half *x1,
-                            const half *x2, const half *dy, half *dx1, half *dx2, cudaStream_t stream);
-template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
-                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
-                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int64_t *x1,
-                            const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,  // T = double
+                                              enum BroadcastGradOpType op, const double *x1, const double *x2,
+                                              const double *dy, double *dx1, double *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,  // T = float
+                                              enum BroadcastGradOpType op, const float *x1, const float *x2,
+                                              const float *dy, float *dx1, float *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,  // T = int
+                                              enum BroadcastGradOpType op, const int *x1, const int *x2,
+                                              const int *dy, int *dx1, int *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,  // T = half
+                                              enum BroadcastGradOpType op, const half *x1, const half *x2,
+                                              const half *dy, half *dx1, half *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,  // T = int64_t
+                                              enum BroadcastGradOpType op, const int64_t *x1, const int64_t *x2,
+                                              const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,  // T = double
+                                            const int &r0, const int &r1, const int &r2, const int &r3,
+                                            const int &d0, const int &d1, const int &d2, const int &d3,
+                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                            const double *x1, const double *x2, const double *dy,
+                                            double *dx1, double *dx2, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,  // T = float
+                                            const int &r0, const int &r1, const int &r2, const int &r3,
+                                            const int &d0, const int &d1, const int &d2, const int &d3,
+                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                            const float *x1, const float *x2, const float *dy, float *dx1, float *dx2,
+                                            cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,  // T = int
+                                            const int &r0, const int &r1, const int &r2, const int &r3,
+                                            const int &d0, const int &d1, const int &d2, const int &d3,
+                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                            const int *x1, const int *x2, const int *dy, int *dx1, int *dx2,
+                                            cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,  // T = half
+                                            const int &r0, const int &r1, const int &r2, const int &r3,
+                                            const int &d0, const int &d1, const int &d2, const int &d3,
+                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                            const half *x1, const half *x2, const half *dy, half *dx1, half *dx2,
+                                            cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,  // T = int64_t
+                                            const int &r0, const int &r1, const int &r2, const int &r3,
+                                            const int &d0, const int &d1, const int &d2, const int &d3,
+                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                            const int64_t *x1, const int64_t *x2, const int64_t *dy,
+                                            int64_t *dx1, int64_t *dx2, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh
new file mode 100644
index 00000000000..2935a56cf35
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+enum BroadcastGradOpType {  // Passed as `op` to the BroadcastGrad / NoBroadcastGrad declarations below.
+  BROADCAST_GRAD_TYPE_MAXIMUM = 0,
+  BROADCAST_GRAD_TYPE_MINIMUM = 1,
+  BROADCAST_GRAD_TYPE_INVALID = 0xffffffff,  // Presumably a "no valid op" sentinel; confirm how callers use it.
+};
+
+template <typename T>  // Explicitly instantiated in broadcast_grad_impl.cu for double, float, int, half and int64_t.
+CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
+                                   const int &r0, const int &r1, const int &r2, const int &r3,
+                                   const int &d0, const int &d1, const int &d2, const int &d3,
+                                   const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
+                                   const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream);
+
+template <typename T>  // Explicitly instantiated in broadcast_grad_impl.cu for double, float, int, half and int64_t.
+CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
+                                     enum BroadcastGradOpType op,  // `nums` sizes the launch via GET_BLOCKS(nums).
+                                     const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
similarity index 68%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
index d7b7314161f..bc1bef0b6e8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
@@ -16,9 +16,8 @@
 
 #include <vector>
 #include <iostream>
-
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
+#include "include/cuda_fp16.h"
 
 // Basic function
 template <typename T>
@@ -566,30 +565,30 @@ void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *
   }
 }
 
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1, bool *y,
-                         cudaStream_t stream);
-template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y,
-                         cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = double
+                                         const double *x0, const double *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = float
+                                         const float *x0, const float *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = half
+                                         const half *x0, const half *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = int
+                                         const int *x0, const int *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = int8_t
+                                         const int8_t *x0, const int8_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = uint8_t
+                                         const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = int64_t
+                                         const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = int16_t
+                                         const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = uint16_t
+                                         const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = uint32_t
+                                         const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = uint64_t
+                                         const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,  // T = bool
+                                         const bool *x0, const bool *x1, bool *y, cudaStream_t stream);
 // Element-wise ArithMetic
 template <typename T, typename Func>
 __global__ void ElewiseArithKernel(const int nums, const T *x0, const T *x1, T *y) {
@@ -703,46 +702,46 @@ void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0,
   return ElewiseArithComplexKernel(nums, op, x0, x1, y, stream);
 }
 
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, double *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, int *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, int8_t *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, uint8_t *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, int64_t *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, int16_t *y,
-                           cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1,
-                           uint16_t *y, cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1,
-                           uint32_t *y, cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1,
-                           uint64_t *y, cudaStream_t stream);
-template void ElewiseArith(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y,
-                           cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
-                                  const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0, const float *x1,
-                                  Complex<float> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const Complex<float> *x1,
-                                  Complex<float> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
-                                  const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0, const double *x1,
-                                  Complex<double> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const Complex<double> *x1,
-                                  Complex<double> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1,
-                                  Complex<float> *y, cudaStream_t stream);
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1,
-                                  Complex<double> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = double
+                                           const double *x0, const double *x1, double *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = float
+                                           const float *x0, const float *x1, float *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = half
+                                           const half *x0, const half *x1, half *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = int
+                                           const int *x0, const int *x1, int *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = int8_t
+                                           const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = uint8_t
+                                           const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = int64_t
+                                           const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = int16_t
+                                           const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = uint16_t
+                                           const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = uint32_t
+                                           const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = uint64_t
+                                           const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,  // T = bool
+                                           const bool *x0, const bool *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
+                                                  const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
+                                                  const float *x1, Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0,
+                                                  const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
+                                                  const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
+                                                  const double *x1, Complex<double> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0,
+                                                  const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0,
+                                                  const float *x1, Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0,
+                                                  const double *x1, Complex<double> *y, cudaStream_t stream);
 
 // Broadcast comparison
 __device__ __forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; }
@@ -836,42 +835,42 @@ void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t>
   }
 }
 
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
-                           const double *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0, const float *x1,
-                           bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
-                           bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
-                           bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
-                           const int8_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint8_t *x0,
-                           const uint8_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int64_t *x0,
-                           const int64_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int16_t *x0,
-                           const int16_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint16_t *x0,
-                           const uint16_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint32_t *x0,
-                           const uint32_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint64_t *x0,
-                           const uint64_t *x1, bool *y, cudaStream_t stream);
-template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1,
-                           bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
+                                           const double *x1, bool *y, cudaStream_t stream);  // T = double
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
+                                           const float *x1, bool *y, cudaStream_t stream);  // T = float
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0,
+                                           const half *x1, bool *y, cudaStream_t stream);  // T = half
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0,
+                                           const int *x1, bool *y, cudaStream_t stream);  // T = int
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
+                                           const int8_t *x1, bool *y, cudaStream_t stream);  // T = int8_t
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = uint8_t
+                                           const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = int64_t
+                                           const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = int16_t
+                                           const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = uint16_t
+                                           const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = uint32_t
+                                           const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op,  // T = uint64_t
+                                           const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0,
+                                           const bool *x1, bool *y, cudaStream_t stream);  // T = bool
 // Broadcast Arithmetic
 template <typename T, typename Func>
 __global__ void BroadcastArithKernel(const size_t l0, const size_t l1, const size_t l2, const size_t l3,
@@ -1097,69 +1096,82 @@ void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector
   }
 }
 
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
-                             const double *x1, double *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
-                             const float *x1, float *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
-                             half *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
-                             int *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
-                             const int8_t *x1, int8_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint8_t *x0,
-                             const uint8_t *x1, uint8_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int64_t *x0,
-                             const int64_t *x1, int64_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int16_t *x0,
-                             const int16_t *x1, int16_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint16_t *x0,
-                             const uint16_t *x1, uint16_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint32_t *x0,
-                             const uint32_t *x1, uint32_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint64_t *x0,
-                             const uint64_t *x1, uint64_t *y, cudaStream_t stream);
-template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1,
-                             bool *y, cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
-                                    const Complex<float> *x0, const Complex<float> *x1, Complex<float> *y,
-                                    cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
-                                    const Complex<float> *x0, const float *x1, Complex<float> *y, cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
-                                    const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
-                                    const Complex<double> *x0, const Complex<double> *x1, Complex<double> *y,
-                                    cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
-                                    const Complex<double> *x0, const double *x1, Complex<double> *y,
-                                    cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
-                                    const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
-                                    const double *x1, Complex<double> *y, cudaStream_t stream);
-template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
-                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
-                                    const float *x1, Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const double *x0, const double *x1, double *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const float *x0, const float *x1, float *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0,
+                                             const half *x1, half *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0,
+                                             const int *x1, int *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                             const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                             const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0,
+                                             const bool *x1, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const Complex<float> *x0, const Complex<float> *x1,
+                                                    Complex<float> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const Complex<float> *x0, const float *x1, Complex<float> *y,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const float *x0, const Complex<float> *x1, Complex<float> *y,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const Complex<double> *x0, const Complex<double> *x1,
+                                                    Complex<double> *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const Complex<double> *x0, const double *x1, Complex<double> *y,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const double *x0, const Complex<double> *x1, Complex<double> *y,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const double *x0, const double *x1, Complex<double> *y,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
+                                                    const std::vector<size_t> &x1_dims,
+                                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op,
+                                                    const float *x0, const float *x1, Complex<float> *y,
+                                                    cudaStream_t stream);
 
 // BroadcastTo
 template <typename T>
@@ -1186,24 +1198,24 @@ void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const siz
                                                                   output_addr);
 }
 
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const double *input_addr,
-                          double *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const float *input_addr,
-                          float *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const half *input_addr,
-                          half *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const int16_t *input_addr,
-                          int16_t *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const int32_t *input_addr,
-                          int32_t *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const int64_t *input_addr,
-                          int64_t *output_addr, cudaStream_t stream);
-template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
-                          const size_t &o1, const size_t &o2, const size_t &o3, const bool *input_addr,
-                          bool *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const double *input_addr, double *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const float *input_addr, float *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const half *input_addr, half *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const bool *input_addr, bool *output_addr, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh
new file mode 100644
index 00000000000..2dc973988e4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+
+const float kFloatEplison = 1e-37;
+
+enum BroadcastOpType {
+  BROADCAST_TYPE_GREATER = 0,
+  BROADCAST_TYPE_LESS = 1,
+  BROADCAST_TYPE_MAXIMUM = 2,
+  BROADCAST_TYPE_MINIMUM = 3,
+  BROADCAST_TYPE_POWER = 4,
+  BROADCAST_TYPE_REALDIV = 5,
+  BROADCAST_TYPE_MUL = 6,
+  BROADCAST_TYPE_SUB = 7,
+  BROADCAST_TYPE_ADD = 8,
+  BROADCAST_TYPE_FLOORDIV = 9,
+  BROADCAST_TYPE_ABSGRAD = 10,
+  BROADCAST_TYPE_DIV = 11,
+  BROADCAST_TYPE_DIVNONAN = 12,
+  BROADCAST_TYPE_EQUAL = 13,
+  BROADCAST_TYPE_SQUARED_DIFFERENCE = 14,
+  BROADCAST_TYPE_MOD = 15,
+  BROADCAST_TYPE_FLOORMOD = 16,
+  BROADCAST_TYPE_ATAN2 = 17,
+  BROADCAST_TYPE_GREATER_EQUAL = 18,
+  BROADCAST_TYPE_LESS_EQUAL = 19,
+  BROADCAST_TYPE_NOT_EQUAL = 20,
+  BROADCAST_TYPE_LOGICAL_AND = 21,
+  BROADCAST_TYPE_LOGICAL_OR = 22,
+  BROADCAST_TYPE_TRUNCATEDIV = 23,
+  BROADCAST_TYPE_TRUNCATEMOD = 24,
+  BROADCAST_TYPE_COMPLEX = 25,
+  BROADCAST_TYPE_INVALID = 0xffffffff,
+};
+
+template <typename T>
+CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y,
+                                cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y,
+                                  cudaStream_t stream);
+
+template <typename T1, typename T2, typename T3>
+CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
+                                         Complex<T3> *y, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                  const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0,
+                                  const T *x1, bool *y, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0,
+                                    const T *x1, T *y, cudaStream_t stream);
+
+template <typename T1, typename T2, typename T3>
+CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T1 *x0,
+                                           const T2 *x1, Complex<T3> *y, cudaStream_t stream);
+template <typename T>
+CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
+                                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0,
+                                           const T *x1, Complex<T> *y, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                 const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                 const T *input_addr, T *output_addr, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu
index c4a759a5be0..9a383d77d32 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu
@@ -17,7 +17,8 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T, typename S>
 __global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) {
@@ -34,7 +35,7 @@ void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, c
     CastAll<<<GET_BLOCKS(max), GET_THREADS, 0, stream>>>(inputs, output, num, size);
     return;
 }
-template void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num,
-                            const size_t *size, cudaStream_t stream);
-template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num,
-                            const size_t *size, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num,
+                                            const size_t *size, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num,
+                                            const size_t *size, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh
index fd2ccc188a7..63303e792b1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_
 #include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
+CUDA_LIB_EXPORT void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size,
+                                   cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu
new file mode 100644
index 00000000000..9d373a7c826
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu
@@ -0,0 +1,509 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include <iostream>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Generic cast
+template <typename S, typename T>
+__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
+  *output_addr = static_cast<T>((*input_addr));
+}
+
+// half --> integer
+__device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) {
+  *output_addr = __half2ull_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) {
+  *output_addr = __half2ll_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) {
+  *output_addr = __half2uint_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) {
+  *output_addr = __half2int_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) {
+  *output_addr = __half2ushort_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) {
+  *output_addr = __half2short_rz((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) {
+  *output_addr = static_cast<uint8_t>(__half2ushort_rz((*input_addr)));
+}
+
+__device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) {
+  *output_addr = static_cast<int8_t>(__half2short_rz((*input_addr)));
+}
+
+// integer --> half
+__device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) {
+  *output_addr = __ull2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const int64_t *input_addr, half *output_addr) {
+  *output_addr = __ll2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const uint32_t *input_addr, half *output_addr) {
+  *output_addr = __uint2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const int32_t *input_addr, half *output_addr) {
+  *output_addr = __int2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const uint16_t *input_addr, half *output_addr) {
+  *output_addr = __ushort2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const int16_t *input_addr, half *output_addr) {
+  *output_addr = __short2half_rn((*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const uint8_t *input_addr, half *output_addr) {
+  *output_addr = __ushort2half_rn(static_cast<uint16_t>(*input_addr));
+}
+
+__device__ __forceinline__ void CastBase(const int8_t *input_addr, half *output_addr) {
+  *output_addr = __short2half_rn(static_cast<int16_t>(*input_addr));
+}
+
+// Cast
+template <typename S, typename T>
+__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) {
+    CastBase(input_addr + pos, output_addr + pos);
+  }
+}
+
+template <typename S, typename T>
+void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) {
+  CastKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_addr, output_addr);
+}
+// Cast instantiations (CUDA_LIB_EXPORT) for int8_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for int16_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+
+// Cast instantiations (CUDA_LIB_EXPORT) for int32_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+
+// Cast instantiations (CUDA_LIB_EXPORT) for int64_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for uint8_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for uint16_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for uint32_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for uint64_t input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for half input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for float input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for double input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for bool input: one per destination element type, 14 in total.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for Complex<float> input: 13 outputs; no Complex<float> -> Complex<float>.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<float> *input_addr, Complex<double> *output_addr,
+                                   cudaStream_t stream);
+// Cast instantiations (CUDA_LIB_EXPORT) for Complex<double> input: 13 outputs; no Complex<double> -> Complex<double>.
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, int8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, int16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, int32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, int64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, uint8_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, uint16_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, uint32_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, uint64_t *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, float *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, double *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, half *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, bool *output_addr,
+                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex<double> *input_addr, Complex<float> *output_addr,
+                                   cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh
index b4ef646dbe8..3fe6249e059 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_
 #include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "utils/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 
 template <typename S, typename T>
-void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_
+CUDA_LIB_EXPORT void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu
similarity index 71%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu
index 339929581bc..b2d4fa25c4e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T, typename S>
 __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_metas, S *valid) {
@@ -58,11 +59,11 @@ void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid,
   CheckValidKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, box, img_metas, valid);
 }
 
-template void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid,
-                         cudaStream_t cuda_stream);
-template void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid,
-                         cudaStream_t cuda_stream);
-template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid,  // NOLINT
-                         cudaStream_t cuda_stream);
-template void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, bool *valid,
-                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid,  // NOLINT
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas,
+                                         bool *valid, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh
index 8870ae5a2bc..36086eb1982 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T, typename S>
-void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid,
+                                cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu
index c1974341ad4..954cc9f6629 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh"
+#include "include/cuda_fp16.h"
 
 // The implement of ScalingGradOp
 template <typename T>
@@ -38,11 +39,11 @@ void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, f
   ScalingGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, x, scaling_factor, scaling_out_addr);
 }
 
-template void ScalingGradOp<float>(const size_t size, const float *x, const float *scaling_factor,
-                                   float *scaling_out_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScalingGradOp<float>(const size_t size, const float *x, const float *scaling_factor,
+                                                   float *scaling_out_addr, cudaStream_t cuda_stream);
 
-template void ScalingGradOp<half>(const size_t size, const half *x, const float *scaling_factor,
-                                  float *scaling_out_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScalingGradOp<half>(const size_t size, const half *x, const float *scaling_factor,
+                                                  float *scaling_out_addr, cudaStream_t cuda_stream);
 
 // The implement of ClipGradNormOp
 template <typename T>
@@ -77,8 +78,10 @@ void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, const
                                                                         output_addr);
 }
 
-template void ClipGradNormOp<float>(const size_t size, const float *x, const float *clip_norm,
-                                    const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ClipGradNormOp<float>(const size_t size, const float *x, const float *clip_norm,
+                                                    const float *reduce_sum_value, float *output_addr,
+                                                    cudaStream_t cuda_stream);
 
-template void ClipGradNormOp<half>(const size_t size, const float *x, const half *clip_norm,
-                                   const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ClipGradNormOp<half>(const size_t size, const float *x, const half *clip_norm,
+                                                   const float *reduce_sum_value, float *output_addr,
+                                                   cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh
new file mode 100644
index 00000000000..7faf8cb530c
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, float *scaling_out_addr,
+                                   cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm,
+                                    const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_
diff --git a/mindspore/ccsrc/utils/complex.h b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h
similarity index 99%
rename from mindspore/ccsrc/utils/complex.h
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h
index cd83ebfd8b7..0779504dd38 100644
--- a/mindspore/ccsrc/utils/complex.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h
@@ -16,12 +16,12 @@
 #ifndef MINDSPORE_CCSRC_UTILS_COPLEX_H_
 #define MINDSPORE_CCSRC_UTILS_COPLEX_H_
 
-#include <complex>
-#include <limits>
 #ifdef ENABLE_GPU
 #include <thrust/complex.h>
 #include <cublas_v2.h>
 #endif
+#include <complex>
+#include <limits>
 #include "base/float16.h"
 #if defined(__CUDACC__)
 #define HOST_DEVICE __host__ __device__
@@ -32,8 +32,9 @@
 namespace mindspore {
 namespace utils {
 // Implement Complex for mindspore, inspired by std::complex.
+constexpr int T_SIZE = 2;
 template <typename T>
-struct alignas(sizeof(T) * 2) Complex {
+struct alignas(sizeof(T) * T_SIZE) Complex {
   Complex() = default;
   ~Complex() = default;
 
@@ -315,12 +316,9 @@ HOST_DEVICE inline T abs(const Complex<T> &z) {
 
 template <typename T>
 using Complex = mindspore::utils::Complex<T>;
-
 namespace std {
-
 template <typename T>
 class numeric_limits<mindspore::utils::Complex<T>> : public numeric_limits<T> {};
-
 }  // namespace std
 
 #endif  // MINDSPORE_CCSRC_UTILS_COPLEX_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu
new file mode 100755
index 00000000000..df9462df770
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh"
+#include "include/cuda_fp16.h"
+// Concat kernel: each thread walks output positions in a grid-stride loop and copies output[pos] from the input
+// whose range along the concat axis (given by len_axis) contains that position. pos and block_pos are size_t so
+// that sizes above INT_MAX cannot overflow a signed counter or be compared signed-vs-unsigned against size.
+template <typename T>
+__global__ void Concat(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
+                       int *len_axis, T **inputs, T *output) {
+  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < size; pos += stride) {
+    // Coordinate of pos in the dimension that len_axis partitions among the inputs.
+    const int num = static_cast<int>(pos % all_size_before_axis / all_size_axis);
+    int block = -1;    // index of the input that holds coordinate num
+    int axis_inc = 0;  // sum of len_axis[0..block]
+    for (int i = 0; i < input_num && axis_inc <= num; i++) {
+      block++;
+      axis_inc += len_axis[i];
+    }
+    const int block_len = len_axis[block];
+    axis_inc -= block_len;  // axis offset at which the selected input starts
+    // Same outer index and inner offset as pos, with the axis coordinate rebased to the selected input.
+    const size_t block_pos = pos / all_size_before_axis * block_len * all_size_axis +
+                             static_cast<size_t>(num - axis_inc) * all_size_axis + pos % all_size_axis;
+    output[pos] = inputs[block][block_pos];
+  }
+}
+
+// Host-side launcher: enqueues the Concat kernel on cuda_stream with a GET_BLOCKS(size) x GET_THREADS launch shape.
+template <typename T>
+void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
+                  int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream) {
+  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, all_size_before_axis, all_size_axis,
+                                                            len_axis, inputs, output);
+}
+
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, double **inputs, double *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, float **inputs, float *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, half **inputs, half *output,
+                                           cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, int64_t **inputs, int64_t *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, int **inputs, int *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, short **inputs, short *output,  // NOLINT
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, char **inputs, char *output,
+                                           cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, uint64_t **inputs, uint64_t *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, uint32_t **inputs, uint32_t *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, uint16_t **inputs, uint16_t *output,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, unsigned char **inputs,
+                                           unsigned char *output, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                           const int all_size_axis, int *len_axis, bool **inputs, bool *output,
+                                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh
index 8b65d14467c..b09486847ce 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
-                  int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_
+CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
+                                  const int all_size_axis, int *len_axis, T **inputs, T *output,
+                                  cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu
similarity index 79%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu
index c4bba2863c0..d1a9f81b549 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu
@@ -85,16 +85,18 @@ void ConvertGradientBack(const size_t size, const size_t height_h, const size_t
     size, height_h, height_w, ori_h, ori_w, batchwidth, width, input_addr, output_addr);
 }
 
-template void ConvertGradient<float>(const size_t size, const size_t height_h, const size_t height_w,
-                                     const size_t batchwidth, const size_t width, float *input_addr, float *output_addr,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConvertGradient<float>(const size_t size, const size_t height_h, const size_t height_w,
+                                                     const size_t batchwidth, const size_t width, float *input_addr,
+                                                     float *output_addr, cudaStream_t cuda_stream);
 
-template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
-                                         const size_t batchwidth, const size_t width, float *input_addr,
-                                         float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConvertGradientBack<float>(const size_t size, const size_t height_h,
+                                                         const size_t height_w, const size_t batchwidth,
+                                                         const size_t width, float *input_addr,
+                                                         float *output_addr, cudaStream_t cuda_stream);
 
-template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
-                                         const size_t ori_h, const size_t ori_w, const size_t batchwidth,
-                                         const size_t width, float *input_addr, float *output_addr,
-                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ConvertGradientBack<float>(const size_t size, const size_t height_h,
+                                                         const size_t height_w, const size_t ori_h,
+                                                         const size_t ori_w, const size_t batchwidth,
+                                                         const size_t width, float *input_addr, float *output_addr,
+                                                         cudaStream_t cuda_stream);
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh
new file mode 100644
index 00000000000..8ddfc08db38
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w,
+                                     const size_t batchwidth, const size_t width, T *input_addr, T *output_addr,
+                                     cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w,
+                                         const size_t batchwidth, const size_t width, T *input_addr, T *output_addr,
+                                         cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w,
+                                         const size_t ori_h, const size_t ori_w, const size_t batchwidth,
+                                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu
similarity index 79%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu
index b7a22dab73b..21146575b7e 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu
@@ -16,7 +16,6 @@
 
 #include <thrust/reduce.h>
 #include "correction_mul_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void CorrectionMul(const T* weight, const T* gamma, const T* running_std, const int batchsize, const int chw,
@@ -52,8 +51,9 @@ void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int
                                                                             output);
 }
 
-template void CalCorrectionMul<float>(const float* weight, const float* gamma, const float* running_std, int N, int C,
-                                      int H, int W, float* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCorrectionMul<float>(const float* weight, const float* gamma, const float* running_std,
+                                                      int N, int C, int H, int W, float* output,
+                                                      cudaStream_t cuda_stream);
 
 template <typename T>
 void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int N, int C, int H, int W, T* d_gamma,
@@ -62,5 +62,6 @@ void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std,
   Reduce<<<GET_BLOCKS(N), GET_THREADS, 0, cuda_stream>>>(N, C * H * W, tmp, running_std, d_gamma);
 }
 
-template void CalCorrectionMulGrad<float>(const float* d_out, const float* weight, const float* running_std, int N,
-                                          int C, int H, int W, float* d_gamma, float* tmp, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCorrectionMulGrad<float>(const float* d_out, const float* weight,
+                                                          const float* running_std, int N, int C, int H, int W,
+                                                          float* d_gamma, float* tmp, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh
new file mode 100644
index 00000000000..1d7b0a1a9ac
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int batch_size,
+                                      int channel_size, int height, int width, T* output, cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int batch_size,
+                                          int channel_size, int height, int width, T* d_gamma, T* tmp,
+                                          cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu
index c1b605c063f..98518f220fa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu
@@ -16,7 +16,8 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdint.h>
-#include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh"
+#include "include/cuda_fp16.h"
 
 // for every position, first calculate position it mirrors from in the new padded array
 // adjust calculated position to origin dx array dimensions and copy value
@@ -110,39 +111,47 @@ void CalCropAndResize(const size_t size, const T *input_image, float *input_boxe
   return;
 }
 
-template void CalCropAndResize<int8_t>(const size_t size, const int8_t *input_image, float *input_boxes,
-                                       int *input_box_index, int batch, int input_height, int input_width,
-                                       int final_height, int final_width, int channel, int method,
-                                       float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<int16_t>(const size_t size, const int16_t *input_image, float *input_boxes,
-                                        int *input_box_index, int batch, int input_height, int input_width,
-                                        int final_height, int final_width, int channel, int method,
-                                        float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<int32_t>(const size_t size, const int32_t *input_image, float *input_boxes,
-                                        int *input_box_index, int batch, int input_height, int input_width,
-                                        int final_height, int final_width, int channel, int method,
-                                        float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<int64_t>(const size_t size, const int64_t *input_image, float *input_boxes,
-                                        int *input_box_index, int batch, int input_height, int input_width,
-                                        int final_height, int final_width, int channel, int method,
-                                        float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<half>(const size_t size, const half *input_image, float *input_boxes,
-                                     int *input_box_index, int batch, int input_height, int input_width,
-                                     int final_height, int final_width, int channel, int method,
-                                     float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<float>(const size_t size, const float *input_image, float *input_boxes,
-                                      int *input_box_index, int batch, int input_height, int input_width,
-                                      int final_height, int final_width, int channel, int method,
-                                      float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<double>(const size_t size, const double *input_image, float *input_boxes,
-                                       int *input_box_index, int batch, int input_height, int input_width,
-                                       int final_height, int final_width, int channel, int method,
-                                       float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<uint8_t>(const size_t size, const uint8_t *input_image, float *input_boxes,
-                                        int *input_box_index, int batch, int input_height, int input_width,
-                                        int final_height, int final_width, int channel, int method,
-                                        float extrapol_val, float *output, cudaStream_t cuda_stream);
-template void CalCropAndResize<uint16_t>(const size_t size, const uint16_t *input_image, float *input_boxes,
-                                         int *input_box_index, int batch, int input_height, int input_width,
-                                         int final_height, int final_width, int channel, int method,
-                                         float extrapol_val, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<int8_t>(const size_t size, const int8_t *input_image, float *input_boxes,
+                                                       int *input_box_index, int batch, int input_height,
+                                                       int input_width, int final_height, int final_width, int channel,
+                                                       int method, float extrapol_val, float *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<int16_t>(const size_t size, const int16_t *input_image,
+                                                        float *input_boxes, int *input_box_index, int batch,
+                                                        int input_height, int input_width, int final_height,
+                                                        int final_width, int channel, int method, float extrapol_val,
+                                                        float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<int32_t>(const size_t size, const int32_t *input_image,
+                                                        float *input_boxes, int *input_box_index, int batch,
+                                                        int input_height, int input_width, int final_height,
+                                                        int final_width, int channel, int method, float extrapol_val,
+                                                        float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<int64_t>(const size_t size, const int64_t *input_image,
+                                                        float *input_boxes, int *input_box_index, int batch,
+                                                        int input_height, int input_width, int final_height,
+                                                        int final_width, int channel, int method, float extrapol_val,
+                                                        float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<half>(const size_t size, const half *input_image, float *input_boxes,
+                                                     int *input_box_index, int batch, int input_height, int input_width,
+                                                     int final_height, int final_width, int channel, int method,
+                                                     float extrapol_val, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<float>(const size_t size, const float *input_image, float *input_boxes,
+                                                      int *input_box_index, int batch, int input_height,
+                                                      int input_width, int final_height, int final_width, int channel,
+                                                      int method, float extrapol_val, float *output,
+                                                      cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<double>(const size_t size, const double *input_image, float *input_boxes,
+                                                       int *input_box_index, int batch, int input_height,
+                                                       int input_width, int final_height, int final_width, int channel,
+                                                       int method, float extrapol_val, float *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<uint8_t>(const size_t size, const uint8_t *input_image,
+                                                        float *input_boxes, int *input_box_index, int batch,
+                                                        int input_height, int input_width, int final_height,
+                                                        int final_width, int channel, int method,
+                                                        float extrapol_val, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCropAndResize<uint16_t>(const size_t size, const uint16_t *input_image,
+                                                         float *input_boxes, int *input_box_index, int batch,
+                                                         int input_height, int input_width, int final_height,
+                                                         int final_width, int channel, int method, float extrapol_val,
+                                                         float *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh
new file mode 100644
index 00000000000..498a037fdd5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_
+#include <cuda_runtime.h>  // cudaStream_t
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"  // CUDA_LIB_EXPORT
+template <typename T>  // T: input_image element type (output stays float); instantiated in crop_and_resize_impl.cu
+CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const T *input_image, float *input_boxes, int *input_box_index,
+                                      int batch, int input_height, int input_width, int final_height, int final_width,
+                                      int channel, int method, float extrapol_val, float *output,
+                                      cudaStream_t cuda_stream);  // presumably the launch stream; confirm in .cu
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu
index 6fcbec1d545..b414a2a54b1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu
@@ -18,7 +18,7 @@
 #include <stdint.h>
 #include <cuda_runtime.h>
 #include "cross_entropy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T, typename S>
 __global__ void CrossEntropyWithSparseKernel(const T *logits, const S *labels, const size_t batch_size,
@@ -104,16 +104,19 @@ void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, con
                                                                               epsilon, losses, dlogits);
 }
 
-template void CrossEntropyWithSparse<float, int>(const float *logits, const int *labels, const size_t batch_size,
-                                                 const size_t class_num, float *loss, cudaStream_t cuda_stream);
-template void CrossEntropyWithSparse<float, int64_t>(const float *logits, const int64_t *labels,
-                                                     const size_t batch_size, const size_t class_num, float *loss,
-                                                     cudaStream_t cuda_stream);
-template void CrossEntropyGradWithSparse<float, int>(const float *logits, const int *labels, const size_t batch_size,
-                                                     const size_t class_num, float *grad, cudaStream_t cuda_stream);
-template void CrossEntropyGradWithSparse<float, int64_t>(const float *logits, const int64_t *labels,
-                                                         const size_t batch_size, const size_t class_num, float *grad,
-                                                         cudaStream_t cuda_stream);
-template void CrossEntropy<float, float>(const float *logits, const float *labels, const size_t batch_size,
-                                         const size_t class_num, float *losses, float *dlogits,
-                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CrossEntropyWithSparse<float, int>(const float *logits, const int *labels,
+                                                                 const size_t batch_size, const size_t class_num,
+                                                                 float *loss, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CrossEntropyWithSparse<float, int64_t>(const float *logits, const int64_t *labels,
+                                                                     const size_t batch_size, const size_t class_num,
+                                                                     float *loss, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CrossEntropyGradWithSparse<float, int>(const float *logits, const int *labels,
+                                                                     const size_t batch_size, const size_t class_num,
+                                                                     float *grad, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CrossEntropyGradWithSparse<float, int64_t>(const float *logits, const int64_t *labels,
+                                                                         const size_t batch_size,
+                                                                         const size_t class_num, float *grad,
+                                                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CrossEntropy<float, float>(const float *logits, const float *labels,
+                                                         const size_t batch_size, const size_t class_num, float *losses,
+                                                         float *dlogits, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh
new file mode 100644
index 00000000000..21ef77f92f5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Batch-size threshold used to decide whether to use multiple threads.
+constexpr int kLargeBatchLowLimit = 32768;
+// T: logits/loss type, S: label type. Instantiated in cross_entropy_impl.cu for <float, int> and <float, int64_t>.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CrossEntropyWithSparse(const T *logits, const S *labels, const size_t batch_size,
+                                            const size_t class_num, T *loss, cudaStream_t cuda_stream);
+// T: logits/grad type, S: label type. Instantiated in cross_entropy_impl.cu for <float, int> and <float, int64_t>.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CrossEntropyGradWithSparse(const T *logits, const S *labels, const size_t batch_size,
+                                                const size_t class_num, T *grad, cudaStream_t cuda_stream);
+// T: logits/losses/dlogits type, S: label type. Instantiated in cross_entropy_impl.cu for <float, float>.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, const size_t class_num,
+                                  T *losses, T *dlogits, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu
similarity index 89%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu
index e41030a7893..695cb1af5ca 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu
@@ -16,7 +16,6 @@
 
 #include <limits>
 #include "ctcloss_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 __device__ T LogSumExp(const T logprob1, const T logprob2) {
   if (logprob1 == logprob2 && logprob1 == -std::numeric_limits<T>::infinity()) {
@@ -427,20 +426,23 @@ void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_w
     label_squence_length, cum_labels_length, cost, grads, prob_num, ignore_longer_outputs_than_inputs);
 }
 
-template void CalculateFwdVar<float>(float *log_alpha_b, int *label_value_with_blank, float *softmax_probs,
-                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
-                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
-                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalculateFwdVar<float>(float *log_alpha_b, int *label_value_with_blank,
+                                                     float *softmax_probs, const int *sequence_length,
+                                                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime,
+                                                     int blank, int *label_squence_length, int *cum_labels_length,
+                                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
 
-template void CalculateBwdVar<float>(float *log_beta_b, int *label_value_with_blank, float *softmax_probs,
-                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
-                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
-                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalculateBwdVar<float>(float *log_beta_b, int *label_value_with_blank,
+                                                     float *softmax_probs, const int *sequence_length,
+                                                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime,
+                                                     int blank, int *label_squence_length, int *cum_labels_length,
+                                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
 
-template void InnerSoftMax<float>(const float *probs, float *softmax_probs, const int *sequence_length, int max_time,
-                                  int batch, int numclass, cudaStream_t stream);
+template CUDA_LIB_EXPORT void InnerSoftMax<float>(const float *probs, float *softmax_probs, const int *sequence_length,
+                                                  int max_time, int batch, int numclass, cudaStream_t stream);
 
-template void CTCLoss<float>(float *log_alpha_b, float *log_beta_b, float *softmax_probs, int *label_value_with_blank,
-                             int batch, int SOffSet, int maxtime, int numclass, const int *sequence_length,
-                             int *label_squence_length, int *cum_labels_length, float *cost, float *grads,
-                             float *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CTCLoss<float>(float *log_alpha_b, float *log_beta_b, float *softmax_probs,
+                                             int *label_value_with_blank, int batch, int SOffSet, int maxtime,
+                                             int numclass, const int *sequence_length, int *label_squence_length,
+                                             int *cum_labels_length, float *cost, float *grads, float *prob_num,
+                                             bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh
new file mode 100644
index 00000000000..8e21b7e634d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Explicitly instantiated for T = float in ctcloss_impl.cu. NOTE(review): presumably fills log_alpha_b - confirm.
+template <typename T>
+CUDA_LIB_EXPORT void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs,
+                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+// Explicitly instantiated for T = float in ctcloss_impl.cu. NOTE(review): presumably fills log_beta_b - confirm.
+template <typename T>
+CUDA_LIB_EXPORT void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs,
+                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+// Explicitly instantiated for T = float in ctcloss_impl.cu, where the 2nd parameter is named softmax_probs.
+template <typename T>
+CUDA_LIB_EXPORT void InnerSoftMax(const T *probs, T *softmax_cost, const int *sequence_length, int max_time, int batch,
+                                  int numclass, cudaStream_t stream);
+// Non-template entry point. NOTE(review): what "PCR" stands for is not visible in this header - check the .cu.
+CUDA_LIB_EXPORT void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length,
+                                      int *cum_labels_length, int *max_labels_length, int batch, cudaStream_t stream);
+// Non-template entry point. NOTE(review): presumably fills label_value_with_blank using `blank` - confirm.
+CUDA_LIB_EXPORT void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length,
+                                       int *precum_labels_length, int *cum_labels_length, int batch, int blank,
+                                       cudaStream_t stream);
+// Non-template entry point; label_indices is int64_t while every other pointer parameter here is int-typed.
+CUDA_LIB_EXPORT void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values,
+                                   int *label_squence_length, int *cum_labels_length, int *max_labels_length,
+                                   int size, int blank,
+                                   int batch, cudaStream_t stream);
+// Non-template entry point. NOTE(review): presumably writes the non-const *_length buffers - confirm in the .cu.
+CUDA_LIB_EXPORT void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
+                                        int *max_labels_length, const int64_t *label_indices, int batch, int size,
+                                        cudaStream_t stream);
+CUDA_LIB_EXPORT void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch,
+                                          cudaStream_t stream);  // presumably writes max_labels_length; confirm
+template <typename T>  // Explicitly instantiated for T = float in ctcloss_impl.cu.
+CUDA_LIB_EXPORT void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch,
+                             int SOffSet, int maxtime, int numclass, const int *sequence_length,
+                             int *label_squence_length, int *cum_labels_length, T *cost, T *grads, T *prob_num,
+                             bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc
new file mode 100644
index 00000000000..88c40cb6190
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "utils/ms_context.h"
+
+namespace mindspore {
+namespace device {
+namespace gpu {
+// Returns the single CudaCommon object, built on first use as a function-local static.
+CudaCommon &CudaCommon::GetInstance() {
+  static CudaCommon instance;
+  return instance;
+}
+
+// Caches properties of the device whose id is read from MsContext (MS_CTX_DEVICE_ID).
+CudaCommon::CudaCommon() {
+  uint32_t device_id = MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  // Zero-initialize: if the query below fails, the cached members are 0 instead of indeterminate stack bytes.
+  cudaDeviceProp prop = {};
+  (void)cudaGetDeviceProperties(&prop, static_cast<int>(device_id));
+  threads_per_block_ = prop.maxThreadsPerBlock;
+  max_blocks_ = prop.multiProcessorCount;
+  major_sm_ = prop.major;
+  minor_sm_ = prop.minor;
+  max_share_memory_ = prop.sharedMemPerBlock;
+}
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h
similarity index 78%
rename from mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h
index e0a172fdbd1..6e2ba9c96fe 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h
@@ -14,12 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_
-#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
 
+#include <cudnn.h>
+#include <cublas_v2.h>
 #include <algorithm>
-#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
+#include <cusolverDn.h>
 
+#define CUDA_LIB_EXPORT __attribute__((visibility("default")))
 #define CUDA_KERNEL_ASSERT(cond)                                                       \
   if (!(cond)) {                                                                       \
     __assert_fail(#cond, __FILE__, static_cast<unsigned int>(__LINE__), __FUNCTION__); \
@@ -40,22 +43,10 @@ class CudaCommon {
   void set_check_sm(const bool &flag) { check_sm_ = flag; }
   bool check_sm() const { return check_sm_; }
 
-  static CudaCommon &GetInstance() {
-    static CudaCommon instance;
-    return instance;
-  }
+  static CudaCommon &GetInstance();
 
  private:
-  CudaCommon() {
-    uint32_t device_id = GPUDeviceManager::GetInstance().cur_device_id();
-    cudaDeviceProp prop;
-    (void)cudaGetDeviceProperties(&prop, device_id);
-    threads_per_block_ = prop.maxThreadsPerBlock;
-    max_blocks_ = prop.multiProcessorCount;
-    major_sm_ = prop.major;
-    minor_sm_ = prop.minor;
-    max_share_memory_ = prop.sharedMemPerBlock;
-  }
+  CudaCommon();
   ~CudaCommon() = default;
   CudaCommon(const CudaCommon &) = delete;
   CudaCommon &operator=(const CudaCommon &) = delete;
@@ -80,4 +71,4 @@ class CudaCommon {
 }  // namespace device
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu
index a31fd2234bf..e881ee06266 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "cumprod_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void Copy(T *input, T *output, size_t size) {
@@ -137,19 +137,21 @@ void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1,
   return;
 }
 
-template void CumProd<uint8_t>(const uint8_t *input, uint8_t *output, uint8_t *workspace, size_t dim0, size_t dim1,
-                              size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                              cudaStream_t stream);
-template void CumProd<int8_t>(const int8_t *input, int8_t *output, int8_t *workspace, size_t dim0, size_t dim1,
-                             size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                             cudaStream_t stream);
-template void CumProd<int32_t>(const int32_t *input, int32_t *output, int32_t *workspace, size_t dim0, size_t dim1,
-                              size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                              cudaStream_t stream);
-template void CumProd<double>(const double *input, double *output, double *workspace, size_t dim0, size_t dim1,
-                             size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                             cudaStream_t stream);
-template void CumProd<float>(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2,
-                            size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
-template void CumProd<half>(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2,
-                           size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<uint8_t>(const uint8_t *input, uint8_t *output, uint8_t *workspace,
+                                               size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                               bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<int8_t>(const int8_t *input, int8_t *output, int8_t *workspace,
+                                              size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                              bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<int32_t>(const int32_t *input, int32_t *output, int32_t *workspace,
+                                               size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                               bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<double>(const double *input, double *output, double *workspace,
+                                              size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                              bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<float>(const float *input, float *output, float *workspace,
+                                             size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                             bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumProd<half>(const half *input, half *output, half *workspace,
+                                            size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                            bool exclusive_, bool reverse_, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh
index 7998bc08912..8a08d82ebf5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride,
-            size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
+CUDA_LIB_EXPORT void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2,
+                             size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu
index 84f148cf6bb..9727ecfd97b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "cumsum_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void Copy(T *input, T *output, size_t size) {
@@ -137,19 +137,21 @@ void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, s
   return;
 }
 
-template void CumSum<uint8_t>(const uint8_t *input, uint8_t *output, uint8_t *workspace, size_t dim0, size_t dim1,
-                              size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                              cudaStream_t stream);
-template void CumSum<int8_t>(const int8_t *input, int8_t *output, int8_t *workspace, size_t dim0, size_t dim1,
-                             size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                             cudaStream_t stream);
-template void CumSum<int32_t>(const int32_t *input, int32_t *output, int32_t *workspace, size_t dim0, size_t dim1,
-                              size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                              cudaStream_t stream);
-template void CumSum<double>(const double *input, double *output, double *workspace, size_t dim0, size_t dim1,
-                             size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_,
-                             cudaStream_t stream);
-template void CumSum<float>(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2,
-                            size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
-template void CumSum<half>(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2,
-                           size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<uint8_t>(const uint8_t *input, uint8_t *output, uint8_t *workspace,
+                                              size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                              bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<int8_t>(const int8_t *input, int8_t *output, int8_t *workspace,
+                                             size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                             bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<int32_t>(const int32_t *input, int32_t *output, int32_t *workspace,
+                                              size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                              bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<double>(const double *input, double *output, double *workspace,
+                                             size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                             bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<float>(const float *input, float *output, float *workspace,
+                                            size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                            bool exclusive_, bool reverse_, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CumSum<half>(const half *input, half *output, half *workspace,
+                                           size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
+                                           bool exclusive_, bool reverse_, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh
index 7e3c40c99ee..6d70ff1cbab 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride,
-            size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
+CUDA_LIB_EXPORT void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2,
+                            size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu
new file mode 100644
index 00000000000..99b0e7ac923
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu
@@ -0,0 +1,138 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuda_runtime.h>
+#include "depthtospace_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Copies every output element from the input element it maps to. The flat output index
+// pos is split into four coordinates using the output extents oc, oh and ow; those
+// coordinates and the factor r are then folded into a flat input index using the input
+// extents ic, ih and iw. The parameters in and on are not read by the kernel.
+template <typename T>
+__global__ void DepthToSpace(const size_t size, const T *input, const size_t in, const size_t ic,
+                             const size_t ih, const size_t iw, const size_t on, const size_t oc,
+                             const size_t oh, const size_t ow, const size_t r, T *output) {
+  // Grid-stride loop: a thread starts at its global index and advances by the grid's thread count.
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    size_t coord[DEPTHTOSPACE_BUFFER_DIMENSION];
+
+    // Peel off one coordinate per step; the stride is divided by that dimension's extent.
+    size_t stride = oc * oh * ow;
+    coord[0] = pos / stride;
+    size_t rem = pos % stride;
+
+    stride /= oc;
+    coord[1] = rem / stride;
+    rem = pos % stride;
+
+    stride /= oh;
+    coord[2] = rem / stride;
+    rem = pos % stride;
+
+    stride /= ow;
+    coord[3] = rem / stride;
+
+    // Second input coordinate: coord[1] shifted by oc times the index that
+    // (coord[2] % r, coord[3] % r) forms within an r-by-r tile.
+    const size_t src_c = coord[1] + (r * (coord[2] % r) + coord[3] % r) * oc;
+    // Flatten (coord[0], src_c, coord[2] / r, coord[3] / r) over the extents ic, ih, iw.
+    size_t src = coord[0];
+    src = (src * ic) + src_c;
+    src = (src * ih) + (coord[2] / r);
+    src = (src * iw) + (coord[3] / r);
+
+    output[pos] = input[src];
+  }
+}
+
+// Host-side launcher: enqueues DepthToSpace on cuda_stream with GET_BLOCKS(size) blocks
+// of GET_THREADS threads and no dynamic shared memory, forwarding every argument as-is.
+template <typename T>
+void CalDepthToSpace(const size_t size, const T *input, const size_t in, const size_t ic, const size_t ih,
+                     const size_t iw, const size_t on, const size_t oc, const size_t oh, const size_t ow,
+                     const size_t r, T *output, cudaStream_t cuda_stream) {
+  const auto grid_dim = GET_BLOCKS(size);
+  const auto block_dim = GET_THREADS;
+  DepthToSpace<<<grid_dim, block_dim, 0, cuda_stream>>>(size, input, in, ic, ih, iw, on, oc, oh, ow, r, output);
+}
+
+// Explicit instantiations of CalDepthToSpace carrying CUDA_LIB_EXPORT. The header
+// depthtospace_impl.cuh only declares the template, so these are the element types
+// available to code that includes it:
+//   float, half,
+//   int, int64_t, int16_t, int8_t,
+//   uint8_t, uint16_t, uint32_t, uint64_t.
+
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<float>(const size_t size, const float *input,
+                       const size_t in, const size_t ic, const size_t ih,
+                       const size_t iw, const size_t on, const size_t oc,
+                       const size_t oh, const size_t ow, const size_t r,
+                       float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<half>(const size_t size, const half *input,
+                      const size_t in, const size_t ic, const size_t ih,
+                      const size_t iw, const size_t on, const size_t oc,
+                      const size_t oh, const size_t ow, const size_t r,
+                      half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<int>(const size_t size, const int *input,
+                     const size_t in, const size_t ic, const size_t ih,
+                     const size_t iw, const size_t on, const size_t oc,
+                     const size_t oh, const size_t ow, const size_t r,
+                     int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<int64_t>(const size_t size, const int64_t *input,
+                         const size_t in, const size_t ic, const size_t ih,
+                         const size_t iw, const size_t on, const size_t oc,
+                         const size_t oh, const size_t ow, const size_t r,
+                         int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<int16_t>(const size_t size, const int16_t *input,
+                         const size_t in, const size_t ic, const size_t ih,
+                         const size_t iw, const size_t on, const size_t oc,
+                         const size_t oh, const size_t ow, const size_t r,
+                         int16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<int8_t>(const size_t size, const int8_t *input,
+                        const size_t in, const size_t ic, const size_t ih,
+                        const size_t iw, const size_t on, const size_t oc,
+                        const size_t oh, const size_t ow, const size_t r,
+                        int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<uint8_t>(const size_t size, const uint8_t *input,
+                         const size_t in, const size_t ic, const size_t ih,
+                         const size_t iw, const size_t on, const size_t oc,
+                         const size_t oh, const size_t ow, const size_t r,
+                         uint8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<uint16_t>(const size_t size, const uint16_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<uint32_t>(const size_t size, const uint32_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint32_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalDepthToSpace<uint64_t>(const size_t size, const uint64_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh
new file mode 100644
index 00000000000..1978bf96519
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+#define DEPTHTOSPACE_BUFFER_DIMENSION 4  // length of the coordinate array declared in the DepthToSpace kernel
+template <typename T>  // defined and explicitly instantiated in depthtospace_impl.cu
+CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const T *input, const size_t in,
+                                     const size_t ic, const size_t ih, const size_t iw,
+                                     const size_t on, const size_t oc, const size_t oh,
+                                     const size_t ow, const size_t r, T *output,
+                                     cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu
similarity index 77%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu
index bfe5741c294..c93f1e654c2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu
@@ -15,6 +15,7 @@
  */ 
 
 #include "determinant_triangle_impl.cuh"
+#include "include/cuda_fp16.h"
 template <typename T>
 __global__ void DetTriangleKernel(T *input, T *output, size_t matrix_n_, size_t count) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
@@ -75,9 +76,11 @@ bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cud
   return host_error_res;
 }
 
-template void DetTriangle<float>(float *input, float *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
-template void DetTriangle<half>(half *input, half *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
-template bool CheckTriangle<float>(float *input, int fill_mode_, size_t matrix_n_, size_t count,
-                                   cudaStream_t cuda_stream);
-template bool CheckTriangle<half>(half *input, int fill_mode_, size_t matrix_n_, size_t count,
-                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DetTriangle<float>(float *input, float *output, size_t matrix_n_, size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DetTriangle<half>(half *input, half *output, size_t matrix_n_, size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT bool CheckTriangle<float>(float *input, int fill_mode_, size_t matrix_n_, size_t count,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT bool CheckTriangle<half>(half *input, int fill_mode_, size_t matrix_n_, size_t count,
+                                                  cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh
index a1a4cc0803b..c992c98fef8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh
@@ -14,14 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void DetTriangle(T *input, T *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void DetTriangle(T *input, T *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
 template <typename T>
-bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_
+CUDA_LIB_EXPORT bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu
similarity index 58%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu
index 35decf9060b..debe823c39f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu
@@ -17,6 +17,7 @@
 #include <stdint.h>
 #include "dropout3d_impl.cuh"
 #include "include/cuda_runtime.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void Dropout3DForwardKernel(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count,
@@ -58,20 +59,21 @@ void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, cons
                                                                                  keep_prob, scale, num_per_chan);
 }
 
-template void Dropout3DForward<float>(const float *input, bool *mask, float *output, float *rand_f,
-                                      const size_t num_count, const float keep_prob, const size_t num_per_chan,
-                                      cudaStream_t cuda_stream);
-template void Dropout3DForward<half>(const half *input, bool *mask, half *output, float *rand_f, const size_t num_count,
-                                     const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream);
-template void Dropout3DForward<int8_t>(const int8_t *input, bool *mask, int8_t *output, float *rand_f,
-                                       const size_t num_count, const float keep_prob, const size_t num_per_chan,
-                                       cudaStream_t cuda_stream);
-template void Dropout3DForward<int16_t>(const int16_t *input, bool *mask, int16_t *output, float *rand_f,
-                                        const size_t num_count, const float keep_prob, const size_t num_per_chan,
-                                        cudaStream_t cuda_stream);
-template void Dropout3DForward<int32_t>(const int32_t *input, bool *mask, int32_t *output, float *rand_f,
-                                        const size_t num_count, const float keep_prob, const size_t num_per_chan,
-                                        cudaStream_t cuda_stream);
-template void Dropout3DForward<int64_t>(const int64_t *input, bool *mask, int64_t *output, float *rand_f,
-                                        const size_t num_count, const float keep_prob, const size_t num_per_chan,
-                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<float>(const float *input, bool *mask, float *output, float *rand_f,
+                                                      const size_t num_count, const float keep_prob,
+                                                      const size_t num_per_chan, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<half>(const half *input, bool *mask, half *output, float *rand_f,
+                                                     const size_t num_count, const float keep_prob,
+                                                     const size_t num_per_chan, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<int8_t>(const int8_t *input, bool *mask, int8_t *output, float *rand_f,
+                                                       const size_t num_count, const float keep_prob,
+                                                       const size_t num_per_chan, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<int16_t>(const int16_t *input, bool *mask, int16_t *output,
+                                                        float *rand_f, const size_t num_count, const float keep_prob,
+                                                        const size_t num_per_chan, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<int32_t>(const int32_t *input, bool *mask, int32_t *output,
+                                                        float *rand_f, const size_t num_count, const float keep_prob,
+                                                        const size_t num_per_chan, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Dropout3DForward<int64_t>(const int64_t *input, bool *mask, int64_t *output,
+                                                        float *rand_f, const size_t num_count, const float keep_prob,
+                                                        const size_t num_per_chan, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh
index f1a50ce2ab7..75cd7d0fddd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count,
-                      const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count,
+                                      const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu
similarity index 76%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu
index d65f319ab73..e84f9cb8788 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu
@@ -17,6 +17,7 @@
 #include <stdint.h>
 #include "dropout_impl.cuh"
 #include "include/cuda_runtime.h"
+#include "include/cuda_fp16.h"
 template <typename T>
 __global__ void DropoutForwardKernel(const T *input, T *mask, T *output, float *mask_f, size_t num_count,
                                      float keep_prob) {
@@ -65,11 +66,11 @@ void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float
   DropoutBackwardKernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(dy, mask, dx, num_count, drop_prob);
 }
 
-template void DropoutForward<float>(const float *input, float *mask, float *output, float *mask_f,
-                                    size_t num_count, float drop_prob, cudaStream_t cuda_stream);
-template void DropoutForward<half>(const half *input, half *mask, half *output, float *mask_f,
-                                    size_t num_count, float drop_prob, cudaStream_t cuda_stream);
-template void DropoutBackward<float>(const float *dy, const float *mask, float *dx, size_t num_count,
-                                     float drop_prob, cudaStream_t cuda_stream);
-template void DropoutBackward<half>(const half *dy, const half *mask, half *dx, size_t num_count,
-                                    float drop_prob, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DropoutForward<float>(const float *input, float *mask, float *output, float *mask_f,
+                                                    size_t num_count, float drop_prob, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DropoutForward<half>(const half *input, half *mask, half *output, float *mask_f,
+                                                   size_t num_count, float drop_prob, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DropoutBackward<float>(const float *dy, const float *mask, float *dx, size_t num_count,
+                                                     float drop_prob, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DropoutBackward<half>(const half *dy, const half *mask, half *dx, size_t num_count,
+                                                    float drop_prob, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh
new file mode 100644
index 00000000000..1518b2b2b48
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Declared here; dropout_impl.cu explicitly instantiates it for float and half.
+CUDA_LIB_EXPORT void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count,
+                                    float keep_prob, cudaStream_t cuda_stream);
+template <typename T>  // NOTE(review): dropout_impl.cu names this float argument drop_prob, not keep_prob - confirm.
+CUDA_LIB_EXPORT void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float keep_prob,
+                                     cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu
new file mode 100644
index 00000000000..1e37f6b92fd
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "dynamic_range_impl.cuh"
+#include <cuda_runtime.h>
+
+// Checks start/end/delta, writes the status to *error_code and, for valid input, the length to *output_shape.
+template <typename T>
+__global__ void ValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
+                                           int64_t *output_shape, DynamicRangeErrorCode *error_code,
+                                           const int64_t max_output_size) {
+  T start = range_start[0];
+  T end = range_end[0];
+  T delta = range_delta[0];
+  *error_code = DynamicRangeErrorCode::kOk;
+
+  if (delta == 0) {
+    *error_code = DynamicRangeErrorCode::kDeltaIsZero;
+    return;
+  }
+
+  if (start < end && delta < 0) {
+    *error_code = DynamicRangeErrorCode::kInvalidNegativeDelta;
+    return;
+  }
+
+  if (start > end && delta > 0) {
+    *error_code = DynamicRangeErrorCode::kInvalidPositiveDelta;
+    return;
+  }
+
+  int64_t real_output_shape = static_cast<int64_t>(ceil(static_cast<double>(end - start) / delta));
+
+  // The division can round up by one element. Recompute the last value with one multiply-add (more precise) and drop
+  // it when it reaches or passes `end`. A length of 0 (e.g. start == end) is left alone so it never goes negative.
+  double last_value = start + (delta * (real_output_shape - 1));
+  double epsilon = 1e-6;
+  const bool overshoots = (end > start && last_value > end) || (start > end && last_value < end);
+  if (real_output_shape > 0 && (overshoots || fabs(last_value - end) < epsilon)) {
+    real_output_shape--;
+  }
+
+  if (real_output_shape > max_output_size) {
+    *error_code = DynamicRangeErrorCode::kMaxSizeExceeded;
+  }
+  *output_shape = real_output_shape;
+}
+
+// Writes output[i] = i * delta + start for 0 <= i < *output_shape; a non-positive *output_shape writes nothing.
+template <typename T>
+__global__ void Range(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape,
+                      const int64_t max_output_size) {
+  const T start = range_start[0];
+  const T delta = range_delta[0];
+  const int64_t count = *output_shape;  // signed bound: a negative length must not wrap to a huge unsigned limit
+  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
+    output[i] = i * delta + start;
+  }
+}
+
+template <typename T>  // Enqueues ValidateInputAndInferShape with one block of one thread on cuda_stream.
+void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
+                                    int64_t *output_shape, DynamicRangeErrorCode *error_code,
+                                    const int64_t max_output_size, cudaStream_t cuda_stream) {
+  ValidateInputAndInferShape<<<1, 1, 0, cuda_stream>>>(range_start, range_end, range_delta, output_shape, error_code,
+                                                       max_output_size);  // no sync or error check here
+}
+
+template <typename T>  // Enqueues the Range kernel on cuda_stream; the grid is sized from max_output_size.
+void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape,
+              DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream) {
+  Range<<<GET_BLOCKS(max_output_size), GET_THREADS, 0, cuda_stream>>>(range_start, range_end, range_delta, output,
+                                                                      output_shape, max_output_size);
+}
+// Explicit instantiations marked CUDA_LIB_EXPORT: both launchers are provided for int, int64_t, float and double.
+template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape<int>(const int *range_start, const int *range_end,
+                                                                  const int *range_delta, int64_t *output_shape,
+                                                                  DynamicRangeErrorCode *error_code,
+                                                                  const int64_t max_output_size,
+                                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape<int64_t>(const int64_t *range_start,
+                                                                      const int64_t *range_end,
+                                                                      const int64_t *range_delta, int64_t *output_shape,
+                                                                      DynamicRangeErrorCode *error_code,
+                                                                      const int64_t max_output_size,
+                                                                      cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape<float>(const float *range_start, const float *range_end,
+                                                                    const float *range_delta, int64_t *output_shape,
+                                                                    DynamicRangeErrorCode *error_code,
+                                                                    const int64_t max_output_size,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape<double>(const double *range_start, const double *range_end,
+                                                                     const double *range_delta, int64_t *output_shape,
+                                                                     DynamicRangeErrorCode *error_code,
+                                                                     const int64_t max_output_size,
+                                                                     cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalRange<int>(const int *range_start, const int *range_end, const int *range_delta,
+                                            int *output, int64_t *output_shape, DynamicRangeErrorCode *error_code,
+                                            const int64_t max_output_size, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalRange<int64_t>(const int64_t *range_start, const int64_t *range_end,
+                                                const int64_t *range_delta, int64_t *output, int64_t *output_shape,
+                                                DynamicRangeErrorCode *error_code, const int64_t max_output_size,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalRange<float>(const float *range_start, const float *range_end,
+                                              const float *range_delta, float *output, int64_t *output_shape,
+                                              DynamicRangeErrorCode *error_code, const int64_t max_output_size,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalRange<double>(const double *range_start, const double *range_end,
+                                               const double *range_delta, double *output, int64_t *output_shape,
+                                               DynamicRangeErrorCode *error_code, const int64_t max_output_size,
+                                               cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh
new file mode 100644
index 00000000000..f606e5d351e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+enum class DynamicRangeErrorCode {  // status written by the validation kernel in dynamic_range_impl.cu
+  kOk = 0,  // none of the checks below failed
+  kDeltaIsZero,  // delta == 0
+  kInvalidPositiveDelta,  // start > end while delta > 0
+  kInvalidNegativeDelta,  // start < end while delta < 0
+  kMaxSizeExceeded  // inferred length is greater than max_output_size
+};
+
+template <typename T>  // Launches the validation kernel; results land in *output_shape and *error_code.
+CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
+                                                    int64_t *output_shape, DynamicRangeErrorCode *error_code,
+                                                    const int64_t max_output_size, cudaStream_t cuda_stream);
+
+template <typename T>  // Launches the Range kernel that fills `output`; the grid is sized from max_output_size.
+CUDA_LIB_EXPORT void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output,
+                              int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size,
+                              cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu
similarity index 96%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu
index 0586500f103..08d166da3d2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 #include "dynamic_stitch_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 __global__ void StitchKernel(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr,
                              const size_t index_num, const size_t data_size, int *max_index_dev) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh
index bab5c8188a0..39fbf6cc5a0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh
@@ -14,11 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_
 #include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
-void CallStitch(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr,
-                const size_t index_num, const size_t data_size, int *max_index_dev, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_
+CUDA_LIB_EXPORT void CallStitch(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr,
+                                const size_t index_num, const size_t data_size, int *max_index_dev,
+                                cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu
similarity index 73%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu
index 3a8abf65280..26a35f0f7cf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu
@@ -15,8 +15,7 @@
  */
 #include <cuda_runtime.h>
 #include "einsum_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 template <typename T>
 __global__ void Diagonal(const size_t out_size, const T *input, const size_t *inp_shape, const size_t shape_size,
                          const size_t left_dim, const size_t right_dim, T *output) {
@@ -64,15 +63,15 @@ void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, c
   Diagonal<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, input_shape, shape_size, left_dim, right_dim,
                                                               output);
 }
-template void CalDiagonal<double>(const size_t size, const double *input, const size_t *input_shape,
-                                  const size_t shape_size, const size_t left_dim, const size_t right_dim,
-                                  double *output, cudaStream_t cuda_stream);
-template void CalDiagonal<float>(const size_t size, const float *input, const size_t *input_shape,
-                                 const size_t shape_size, const size_t left_dim, const size_t right_dim, float *output,
-                                 cudaStream_t cuda_stream);
-template void CalDiagonal<half>(const size_t size, const half *input, const size_t *input_shape,
-                                const size_t shape_size, const size_t left_dim, const size_t right_dim, half *output,
-                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonal<double>(const size_t size, const double *input, const size_t *input_shape,
+                                                  const size_t shape_size, const size_t left_dim,
+                                                  const size_t right_dim, double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonal<float>(const size_t size, const float *input, const size_t *input_shape,
+                                                 const size_t shape_size, const size_t left_dim, const size_t right_dim,
+                                                 float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonal<half>(const size_t size, const half *input, const size_t *input_shape,
+                                                const size_t shape_size, const size_t left_dim, const size_t right_dim,
+                                                half *output, cudaStream_t cuda_stream);
 template <typename T>
 __global__ void DiagonalGrad(const size_t d_size, const T *dout, const size_t *inp_shape, const size_t shape_size,
                              const size_t left_dim, const size_t right_dim, T *d_inp) {
@@ -116,15 +115,15 @@ void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_sha
   DiagonalGrad<<<GET_BLOCKS(d_size), GET_THREADS, 0, cuda_stream>>>(d_size, dout, input_shape, shape_size, left_dim,
                                                                     right_dim, d_inp);
 }
-template void CalDiagonalGrad<double>(const size_t size, const double *dout, const size_t *input_shape,
-                                      const size_t shape_size, const size_t left_dim, const size_t right_dim,
-                                      double *d_inp, cudaStream_t cuda_stream);
-template void CalDiagonalGrad<float>(const size_t size, const float *dout, const size_t *input_shape,
-                                     const size_t shape_size, const size_t left_dim, const size_t right_dim,
-                                     float *d_inp, cudaStream_t cuda_stream);
-template void CalDiagonalGrad<half>(const size_t size, const half *dout, const size_t *input_shape,
-                                    const size_t shape_size, const size_t left_dim, const size_t right_dim, half *d_inp,
-                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonalGrad<double>(const size_t size, const double *dout, const size_t *input_shape,
+                                                      const size_t shape_size, const size_t left_dim,
+                                                      const size_t right_dim, double *d_inp, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonalGrad<float>(const size_t size, const float *dout, const size_t *input_shape,
+                                                     const size_t shape_size, const size_t left_dim,
+                                                     const size_t right_dim, float *d_inp, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDiagonalGrad<half>(const size_t size, const half *dout, const size_t *input_shape,
+                                                    const size_t shape_size, const size_t left_dim,
+                                                    const size_t right_dim, half *d_inp, cudaStream_t cuda_stream);
 template <typename T>
 __global__ void ReduceSum(const size_t out_size, const T *input, T *output, const size_t *out_shape,
                           const size_t shape_size, const size_t reduce_dim, const size_t dim_val) {
@@ -273,12 +272,12 @@ void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStre
   cudaFree(cur_out);
 }
 
-template void CalDot<double>(const size_t size, double *input_a, const double *input_b, double *output,
-                             cudaStream_t cuda_stream);
-template void CalDot<float>(const size_t size, float *input_a, const float *input_b, float *output,
-                            cudaStream_t cuda_stream);
-template void CalDot<half>(const size_t size, half *input_a, const half *input_b, half *output,
-                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDot<double>(const size_t size, double *input_a, const double *input_b, double *output,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDot<float>(const size_t size, float *input_a, const float *input_b, float *output,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDot<half>(const size_t size, half *input_a, const half *input_b, half *output,
+                                           cudaStream_t cuda_stream);
 
 template <typename T>
 __global__ void DotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a) {
@@ -291,12 +290,12 @@ template <typename T>
 void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a, cudaStream_t cuda_stream) {
   DotGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dout, mid_res, input_b, input_a);
 }
-template void CalDotGrad<double>(const size_t size, const double dout, double *mid_res, double *input_b,
-                                 double *input_a, cudaStream_t cuda_stream);
-template void CalDotGrad<float>(const size_t size, const float dout, float *mid_res, float *input_b, float *input_a,
-                                cudaStream_t cuda_stream);
-template void CalDotGrad<half>(const size_t size, const half dout, half *mid_res, half *input_b, half *input_a,
-                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDotGrad<double>(const size_t size, const double dout, double *mid_res, double *input_b,
+                                                 double *input_a, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDotGrad<float>(const size_t size, const float dout, float *mid_res, float *input_b,
+                                                float *input_a, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalDotGrad<half>(const size_t size, const half dout, half *mid_res, half *input_b,
+                                               half *input_a, cudaStream_t cuda_stream);
 // Element-wise ArithMetic
 template <typename T>
 __global__ void ElewiseArithMulKernel(const size_t nums, const T *x0, const T *x1, T *y) {
@@ -343,14 +342,15 @@ void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft
   }
 }
 
-template void CalMul<double>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
-                             const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
-                             const size_t *out_shape, const size_t out_num, const double *x0, const double *x1,
-                             double *y, cudaStream_t stream);
-template void CalMul<float>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
-                            const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
-                            const size_t *out_shape, const size_t out_num, const float *x0, const float *x1, float *y,
-                            cudaStream_t stream);
-template void CalMul<half>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
-                           const size_t lft_num, const size_t *rht_shape, const size_t rht_num, const size_t *out_shape,
-                           const size_t out_num, const half *x0, const half *x1, half *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalMul<double>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
+                                             const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
+                                             const size_t *out_shape, const size_t out_num, const double *x0,
+                                             const double *x1, double *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalMul<float>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
+                                            const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
+                                            const size_t *out_shape, const size_t out_num, const float *x0,
+                                            const float *x1, float *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalMul<half>(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
+                                           const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
+                                           const size_t *out_shape, const size_t out_num, const half *x0,
+                                           const half *x1, half *y, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh
new file mode 100644
index 00000000000..f2073c8603d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "include/cuda_fp16.h"
+#define EINSUM_MAX_DIMENSION 20
+template <typename T>
+struct DynamicSharedMem;  // primary template is only declared; double, float and half are specialized below
+template <>
+struct DynamicSharedMem<double> {
+  __device__ double *addr() {
+    extern __shared__ double addr_double[];  // unsized extern __shared__ array; each specialization uses its own name
+    return addr_double;
+  }
+};
+template <>
+struct DynamicSharedMem<float> {
+  __device__ float *addr() {
+    extern __shared__ float addr_float[];  // float counterpart of addr_double
+    return addr_float;
+  }
+};
+template <>
+struct DynamicSharedMem<half> {
+  __device__ half *addr() {
+    extern __shared__ half addr_half[];  // half counterpart of addr_double
+    return addr_half;
+  }
+};
+template <typename T>  // Launches the Diagonal kernel; einsum_impl.cu instantiates it for double, float and half.
+CUDA_LIB_EXPORT void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, const size_t shape_size,
+                                 const size_t left_dim, const size_t right_dim, T *output, cudaStream_t cuda_stream);
+template <typename T>  // Launches the DiagonalGrad kernel; instantiated for double, float and half.
+CUDA_LIB_EXPORT void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_shape,
+                                     const size_t shape_size, const size_t left_dim, const size_t right_dim, T *d_inp,
+                                     cudaStream_t cuda_stream);
+template <typename T>  // Instantiated for double, float and half; note that input_a is taken as non-const.
+CUDA_LIB_EXPORT void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStream_t cuda_stream);
+template <typename T>  // Launches the DotGrad kernel; instantiated for double, float and half.
+CUDA_LIB_EXPORT void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a,
+                                cudaStream_t cuda_stream);
+template <typename T>  // Instantiated for double, float and half in einsum_impl.cu.
+CUDA_LIB_EXPORT void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape,
+                            const size_t lft_num, const size_t *rht_shape, const size_t rht_num,
+                            const size_t *out_shape, const size_t out_num, const T *x0, const T *x1, T *y,
+                            cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu
new file mode 100644
index 00000000000..910469623c8
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Subtracts `offset`, converted to T, from each of the first `size` elements of `indices`, in place.
+// Each thread starts at its global index and advances by the total number of launched threads.
+template <typename T>
+__global__ void SubOffset(T *indices, size_t size, int64_t offset) {
+  const T delta = static_cast<T>(offset);
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) indices[i] -= delta;
+}
+
+// Runs GatherV2Kernel with `indices` temporarily shifted down by `offset` (applied and undone in place on `stream`).
+template <typename T, typename S>
+void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                        size_t input_dim1, int64_t offset, cudaStream_t stream) {
+  // A zero offset leaves every index unchanged, so both in-place passes over `indices` are skipped in that case.
+  if (offset != 0) SubOffset<<<GET_BLOCKS(output_dim1), GET_THREADS, 0, stream>>>(indices, output_dim1, offset);
+  GatherV2Kernel<<<GET_BLOCKS(output_dim0 * output_dim1 * output_dim2), GET_THREADS, 0, stream>>>(
+    input, indices, output, output_dim0, output_dim1, output_dim2, input_dim1);
+  // Undo the shift on the same stream so the first `output_dim1` indices hold their original values again.
+  if (offset != 0) SubOffset<<<GET_BLOCKS(output_dim1), GET_THREADS, 0, stream>>>(indices, output_dim1, -offset);
+}
+
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<float, int>(float *input, int *indices, float *output,
+                                                             size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                             size_t input_dim1, int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<float, int64_t>(float *input, int64_t *indices, float *output,
+                                                                 size_t output_dim0, size_t output_dim1,
+                                                                 size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                                 cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<half, int>(half *input, int *indices, half *output, size_t output_dim0,
+                                                            size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                            int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<half, int64_t>(half *input, int64_t *indices, half *output,
+                                                                size_t output_dim0, size_t output_dim1,
+                                                                size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                                cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<double, int>(double *input, int *indices, double *output,
+                                                              size_t output_dim0, size_t output_dim1,
+                                                              size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                              cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<double, int64_t>(double *input, int64_t *indices, double *output,
+                                                                  size_t output_dim0, size_t output_dim1,
+                                                                  size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int, int>(int *input, int *indices, int *output, size_t output_dim0,
+                                                           size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                           int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int, int64_t>(int *input, int64_t *indices, int *output,
+                                                               size_t output_dim0, size_t output_dim1,
+                                                               size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int16_t, int>(int16_t *input, int *indices, int16_t *output,
+                                                               size_t output_dim0, size_t output_dim1,
+                                                               size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int16_t, int64_t>(int16_t *input, int64_t *indices, int16_t *output,
+                                                                   size_t output_dim0, size_t output_dim1,
+                                                                   size_t output_dim2, size_t input_dim1,
+                                                                   int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int8_t, int>(int8_t *input, int *indices, int8_t *output,
+                                                              size_t output_dim0, size_t output_dim1,
+                                                              size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                              cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<int8_t, int64_t>(int8_t *input, int64_t *indices, int8_t *output,
+                                                                  size_t output_dim0, size_t output_dim1,
+                                                                  size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<uint8_t, int>(uint8_t *input, int *indices, uint8_t *output,
+                                                               size_t output_dim0, size_t output_dim1,
+                                                               size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<uint8_t, int64_t>(uint8_t *input, int64_t *indices, uint8_t *output,
+                                                                   size_t output_dim0, size_t output_dim1,
+                                                                   size_t output_dim2, size_t input_dim1,
+                                                                   int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<bool, int>(bool *input, int *indices, bool *output, size_t output_dim0,
+                                                            size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                            int64_t offset, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalEmbeddingLookup<bool, int64_t>(bool *input, int64_t *indices, bool *output,
+                                                                size_t output_dim0, size_t output_dim1,
+                                                                size_t output_dim2, size_t input_dim1, int64_t offset,
+                                                                cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh
index b4f220171d0..26f625c08b9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh
@@ -14,11 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T, typename S>
-void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
-                        size_t input_dim1, int64_t offset, cudaStream_t stream);
+CUDA_LIB_EXPORT void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
+                                        size_t output_dim2, size_t input_dim1, int64_t offset, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu
index 080ac397ba7..29df14fd9c3 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "equalcount_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 template <typename T>
 __global__ void EqualCount(const int size, const T* input1, const T* input2, T* output) {
   T equal_count = 0;
@@ -35,9 +35,9 @@ void CalEqualCount(const int size, const T* input1, const T* input2, T* output,
   return;
 }
 
-template void CalEqualCount<int>(const int size, const int* input1, const int* input2, int* output,
-                                 cudaStream_t cuda_stream);
-template void CalEqualCount<float>(const int size, const float* input1, const float* input2, float* output,
-                                   cudaStream_t cuda_stream);
-template void CalEqualCount<half>(const int size, const half* input1, const half* input2, half* output,
-                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalEqualCount<int>(const int size, const int* input1, const int* input2, int* output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalEqualCount<float>(const int size, const float* input1, const float* input2,
+                                                   float* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalEqualCount<half>(const int size, const half* input1, const half* input2, half* output,
+                                                  cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh
index ba6004da3b8..a9b2dfa38fe 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalEqualCount(const int size, const T* input1, const T* input2, T* output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalEqualCount(const int size, const T* input1, const T* input2, T* output,
+                                   cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu
new file mode 100644
index 00000000000..210a9adaf27
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu
@@ -0,0 +1,110 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Computes one output element per loop iteration: the flat output position is decomposed into a batch index,
+// a patch index and a cell inside the patch, which are mapped back to a source row/column of `input`.
+// Positions whose source row or column falls outside [0, input_row_size) x [0, input_col_size) are written as 0.
+template <typename T>
+__global__ void ExtractImagePatches(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row,
+                                    int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride,
+                                    int64_t patch_stride, int64_t other_stride, int64_t input_row_size,
+                                    int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left,
+                                    int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride,
+                                    int64_t output_depth, const T *input, T *output) {
+  // Every thread starts at its global index and advances by the total number of launched threads.
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < output_size; idx += step) {
+    // The batch split is skipped (batch 0) unless the caller asks for it.
+    const int64_t flat = static_cast<int64_t>(idx);
+    const int64_t batch = need_batch ? flat / other_stride : 0;
+    const int64_t within_batch = flat - batch * other_stride;
+    // Patch this element belongs to, and its cell inside that patch.
+    const int64_t patch = within_batch / patch_stride;
+    const int64_t cell = (within_batch % patch_stride) / output_depth;
+    // Source row: (patch / output_cols) scaled by stride_row plus (cell / row_stride) scaled by rate_row, less padding.
+    const int64_t src_row = (patch / output_cols) * stride_row + (cell / row_stride) * rate_row - row_padding_top;
+    // Default of 0 is kept when the source row or column is out of range.
+    T value = static_cast<T>(0);
+    if (src_row >= 0 && src_row < input_row_size) {
+      // Source column, from the remainders of the same two divisions.
+      const int64_t src_col =
+        (patch % output_cols) * stride_col + (cell % row_stride) * rate_col - col_padding_left;
+      if (src_col >= 0 && src_col < input_col_size) {
+        // Remainder modulo output_depth, then the flat offset into `input`.
+        const int64_t depth = within_batch % output_depth;
+        const int64_t src =
+          depth + src_col * col_input_stride + src_row * row_input_stride + batch * patch_input_stride;
+        value = input[static_cast<size_t>(src)];
+      }
+    }
+    output[idx] = value;
+  }
+}
+
+template <typename T>  // T is the element type shared by `input` and `output`
+void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row,
+                                int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride,
+                                int64_t patch_stride, int64_t other_stride, int64_t input_row_size,
+                                int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left,
+                                int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride,
+                                int64_t output_depth, const T *input, T *output, cudaStream_t stream) {
+  ExtractImagePatches<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(  // 0 dynamic shared bytes, on `stream`
+    output_size, stride_row, stride_col, rate_row, rate_col, output_cols, need_batch, row_stride, patch_stride,
+    other_stride, input_row_size, input_col_size, row_padding_top, col_padding_left, col_input_stride, row_input_stride,
+    patch_input_stride, output_depth, input, output);  // all arguments forwarded unchanged; nothing is synchronized here
+}
+
+template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC<int>(size_t output_size, int64_t stride_row,
+                                                              int64_t stride_col, int64_t rate_row, int64_t rate_col,
+                                                              int64_t output_cols, bool need_batch, int64_t row_stride,
+                                                              int64_t patch_stride, int64_t other_stride,
+                                                              int64_t input_row_size, int64_t input_col_size,
+                                                              int64_t row_padding_top, int64_t col_padding_left,
+                                                              int64_t col_input_stride, int64_t row_input_stride,
+                                                              int64_t patch_input_stride, int64_t output_depth,
+                                                              const int *input, int *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC<float>(size_t output_size, int64_t stride_row,
+                                                                int64_t stride_col, int64_t rate_row, int64_t rate_col,
+                                                                int64_t output_cols, bool need_batch,
+                                                                int64_t row_stride, int64_t patch_stride,
+                                                                int64_t other_stride, int64_t input_row_size,
+                                                                int64_t input_col_size, int64_t row_padding_top,
+                                                                int64_t col_padding_left, int64_t col_input_stride,
+                                                                int64_t row_input_stride, int64_t patch_input_stride,
+                                                                int64_t output_depth, const float *input, float *output,
+                                                                cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC<half>(size_t output_size, int64_t stride_row,
+                                                               int64_t stride_col, int64_t rate_row, int64_t rate_col,
+                                                               int64_t output_cols, bool need_batch, int64_t row_stride,
+                                                               int64_t patch_stride, int64_t other_stride,
+                                                               int64_t input_row_size, int64_t input_col_size,
+                                                               int64_t row_padding_top, int64_t col_padding_left,
+                                                               int64_t col_input_stride, int64_t row_input_stride,
+                                                               int64_t patch_input_stride, int64_t output_depth,
+                                                               const half *input, half *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC<double>(size_t output_size, int64_t stride_row,
+                                                                 int64_t stride_col, int64_t rate_row, int64_t rate_col,
+                                                                 int64_t output_cols, bool need_batch,
+                                                                 int64_t row_stride, int64_t patch_stride,
+                                                                 int64_t other_stride, int64_t input_row_size,
+                                                                 int64_t input_col_size, int64_t row_padding_top,
+                                                                 int64_t col_padding_left, int64_t col_input_stride,
+                                                                 int64_t row_input_stride, int64_t patch_input_stride,
+                                                                 int64_t output_depth, const double *input,
+                                                                 double *output, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh
new file mode 100644
index 00000000000..9328c150f11
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col,
+                                                int64_t rate_row, int64_t rate_col, int64_t output_cols,
+                                                bool need_batch, int64_t row_stride, int64_t patch_stride,
+                                                int64_t other_stride, int64_t input_row_size, int64_t input_col_size,
+                                                int64_t row_padding_top, int64_t col_padding_left,
+                                                int64_t col_input_stride, int64_t row_input_stride,
+                                                int64_t patch_input_stride, int64_t output_depth, const T *input,
+                                                T *output, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu
similarity index 80%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu
index 715c135ec43..98279706ad7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu
@@ -36,5 +36,7 @@ void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_
   return;
 }
 
-template void Eye<float>(const size_t size, const size_t dim, float *output_addr, cudaStream_t cuda_stream);
-template void Eye<double>(const size_t size, const size_t dim, double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Eye<float>(const size_t size, const size_t dim, float *output_addr,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Eye<double>(const size_t size, const size_t dim, double *output_addr,
+                                          cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh
similarity index 60%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh
index bca33c388ac..0595131283b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh
@@ -14,11 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu
similarity index 96%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu
index 621f9a46668..e07de17c9d9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu
@@ -21,7 +21,7 @@
 #include <thrust/reduce.h>
 #include <thrust/pair.h>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void FakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha,
                                                 float *input_quant, const int channel_num) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh
new file mode 100644
index 00000000000..f8f7817ede0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+CUDA_LIB_EXPORT void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha,
+                                           float *input_quant_max, float *input_div_alpha, float *input_quant,
+                                           const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream);
+
+CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha,
+                                                        float *input_quant, const int channel_num,
+                                                        cudaStream_t cuda_stream);
+
+CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient,
+                                                            const int size, const float *input_div_alpha,
+                                                            const float *input_quant, const bool neg_trunc,
+                                                            const int channel_num, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu
similarity index 96%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu
index a2fc40e3d65..0103794e3ce 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu
@@ -19,7 +19,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/pair.h>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void FakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha,
                                               float *input_quant) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh
new file mode 100644
index 00000000000..b9a7067f5f3
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// NOTE(review): only `input` is const here, so the other four buffers may be written — confirm against the .cu.
+CUDA_LIB_EXPORT void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                                         float *input_div_alpha, float *input_quant, const bool neg_trunc,
+                                         cudaStream_t cuda_stream);
+// NOTE(review): looks like the forward op; no pointer is const though only `output` reads as a result — confirm.
+CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha,
+                                                      float *input_quant, cudaStream_t cuda_stream);
+// NOTE(review): looks like the backward op; only grad_input and grad_alpha are non-const — confirm in the .cu.
+CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient,
+                                                          const int size, const float *input_div_alpha,
+                                                          const float *input_quant, const bool neg_trunc,
+                                                          cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cu
similarity index 100%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cu
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh
new file mode 100644
index 00000000000..c9a9c561814
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// NOTE(review): presumably fills nudge_min / nudge_max / scale per channel; input_min / input_max are non-const too.
+CUDA_LIB_EXPORT void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min,
+                                        const float quant_max, float *nudge_min, float *nudge_max, float *scale,
+                                        const int channel_num, const bool symmetric, cudaStream_t cuda_stream);
+// NOTE(review): looks like the forward op; `output` is the only non-const buffer — confirm in the .cu.
+CUDA_LIB_EXPORT void CalFakeQuantPerChannel(const float *input, float *output, const int total_num,
+                                            const int channel_num, const float *nudge_min, const float *nudge_max,
+                                            const float *scale, cudaStream_t cuda_stream);
+// NOTE(review): looks like the backward op; `output` is the only non-const buffer, `scale` is not taken.
+CUDA_LIB_EXPORT void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output,
+                                                const int total_num, const int channel_num, const float *nudge_min,
+                                                const float *nudge_max, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cu
similarity index 100%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cu
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh
new file mode 100644
index 00000000000..203190d9aef
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// NOTE(review): presumably fills nudge_min / nudge_max / scale; input_min / input_max are non-const too — confirm.
+CUDA_LIB_EXPORT void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max,
+                                      float *nudge_min, float *nudge_max, float *scale, const bool symmetric,
+                                      cudaStream_t cuda_stream);
+// NOTE(review): looks like the forward op; `output` is the only non-const buffer — confirm in the .cu.
+CUDA_LIB_EXPORT void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min,
+                                          const float *nudge_max, const float *scale, cudaStream_t cuda_stream);
+// NOTE(review): looks like the backward op; `output` is the only non-const buffer, `scale` is not taken.
+CUDA_LIB_EXPORT void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size,
+                                              const float *nudge_min, const float *nudge_max,
+                                              cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu
index 58995dee4cb..d493d338e6f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh"
 #include "include/cuda_runtime.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void FillKernel(const size_t m, const size_t n, const T *input, T *output) {
@@ -30,6 +31,9 @@ void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStrea
   FillKernel<<<(m * n + 255) / 256, 256, 0, stream>>>(m, n, input, output);
 }
 
-template void Fill<float>(const size_t &m, const size_t &n, const float *input, float *output, cudaStream_t stream);
-template void Fill<half>(const size_t &m, const size_t &n, const half *input, half *output, cudaStream_t stream);
-template void Fill<double>(const size_t &m, const size_t &n, const double *input, double *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void Fill<float>(const size_t &m, const size_t &n, const float *input, float *output,
+                                          cudaStream_t stream);
+template CUDA_LIB_EXPORT void Fill<half>(const size_t &m, const size_t &n, const half *input, half *output,
+                                         cudaStream_t stream);
+template CUDA_LIB_EXPORT void Fill<double>(const size_t &m, const size_t &n, const double *input, double *output,
+                                           cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh
similarity index 59%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh
index acdcc191d58..0d304724857 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_
+CUDA_LIB_EXPORT void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu
index a27fe35e190..07d181cb1da 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu
@@ -15,7 +15,8 @@
  */
 
 #include "include/cuda_runtime.h"
-#include "plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void IsNan(const size_t size, const T* input, bool* out) {
@@ -126,15 +127,27 @@ void CalIsFinite(const size_t size, const T* input, bool* output, cudaStream_t c
   return;
 }
 
-template void CalFloatStatus<float>(const size_t size, const float* input, float* output, cudaStream_t cuda_stream);
-template void CalFloatStatus<half>(const size_t size, const half* input, float* output, cudaStream_t cuda_stream);
-template void CalFloatStatus<double>(const size_t size, const double* input, float* output, cudaStream_t cuda_stream);
-template void CalIsInf<float>(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsInf<half>(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsInf<double>(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsNan<float>(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsNan<half>(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsNan<double>(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsFinite<float>(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsFinite<half>(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream);
-template void CalIsFinite<double>(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalFloatStatus<float>(const size_t size, const float* input, float* output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalFloatStatus<half>(const size_t size, const half* input, float* output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalFloatStatus<double>(const size_t size, const double* input, float* output,
+                                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsInf<float>(const size_t size, const float* input, bool* output,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsInf<half>(const size_t size, const half* input, bool* output,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsInf<double>(const size_t size, const double* input, bool* output,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsNan<float>(const size_t size, const float* input, bool* output,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsNan<half>(const size_t size, const half* input, bool* output,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsNan<double>(const size_t size, const double* input, bool* output,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsFinite<float>(const size_t size, const float* input, bool* output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsFinite<half>(const size_t size, const half* input, bool* output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIsFinite<double>(const size_t size, const double* input, bool* output,
+                                                  cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh
new file mode 100644
index 00000000000..b1794e4d023
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // instantiated for float, half and double in float_status_impl.cu; result buffer is float
+CUDA_LIB_EXPORT void CalFloatStatus(const size_t size, const T *input, float *output, cudaStream_t stream);
+template <typename T>  // instantiated for float, half and double in float_status_impl.cu; result buffer is bool
+CUDA_LIB_EXPORT void CalIsNan(const size_t size, const T *input, bool *output, cudaStream_t stream);
+template <typename T>  // instantiated for float, half and double in float_status_impl.cu; result buffer is bool
+CUDA_LIB_EXPORT void CalIsInf(const size_t size, const T *input, bool *output, cudaStream_t stream);
+template <typename T>  // instantiated for float, half and double in float_status_impl.cu; result buffer is bool
+CUDA_LIB_EXPORT void CalIsFinite(const size_t size, const T *input, bool *output, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu
similarity index 76%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu
index dada21003b7..ca586df5afa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __device__ __forceinline__ T PowFunc(T x, T y) {
@@ -77,11 +78,11 @@ void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, con
                                                                      accumulation, linear);
 }
 
-template void ApplyFtrl<float>(const size_t size, const float *gradient, const float *learning_rate,
-                               const float *l1_regularization, const float *l2_regularization,
-                               const float *learning_rate_power, float *variable, float *accumulation, float *linear,
-                               cudaStream_t cuda_stream);
-template void ApplyFtrl<half>(const size_t size, const half *gradient, const half *learning_rate,
-                              const half *l1_regularization, const half *l2_regularization,
-                              const half *learning_rate_power, half *variable, half *accumulation, half *linear,
-                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyFtrl<float>(const size_t size, const float *gradient, const float *learning_rate,
+                                               const float *l1_regularization, const float *l2_regularization,
+                                               const float *learning_rate_power, float *variable, float *accumulation,
+                                               float *linear, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ApplyFtrl<half>(const size_t size, const half *gradient, const half *learning_rate,
+                                              const half *l1_regularization, const half *l2_regularization,
+                                              const half *learning_rate_power, half *variable, half *accumulation,
+                                              half *linear, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh
index 87add534c2c..0c9d261288b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization,
-               const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation, T *linear,
-               cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization,
+                               const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation,
+                               T *linear, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu
new file mode 100755
index 00000000000..63b72443404
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu
@@ -0,0 +1,150 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh"
+#include "include/cuda_fp16.h"
+template <typename T, typename S>  // T: type of input/output elements, S: type of the index elements
+__global__ void GatherKernel(const T *input, const S *index, T *output, const size_t dim_before_axis,
+                             const size_t dim_at_axis_input, const size_t dim_at_axis_output,
+                             const size_t dim_after_axis) {
+  size_t num = dim_before_axis * dim_at_axis_output * dim_after_axis;  // number of output elements
+  size_t i, k;
+  for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num;
+       id += blockDim.x * gridDim.x) {  // step by the total thread count so any grid size covers num
+    i = id / (dim_at_axis_output * dim_after_axis);  // position along the dims before the axis
+    k = id % dim_after_axis;                         // position along the dims after the axis
+    // index is read at the same flat offset as the output; a negative value is shifted by dim_at_axis_input.
+    S j = index[id];
+    if (j < 0) {
+        j += static_cast<S>(dim_at_axis_input);
+    }
+    CUDA_KERNEL_ASSERT(j >= 0);
+    size_t j_read = static_cast<size_t>(j);
+    CUDA_KERNEL_ASSERT(j_read < dim_at_axis_input);
+    size_t read_id = i * dim_at_axis_input * dim_after_axis + j_read * dim_after_axis + k;
+    output[id] = input[read_id];
+  }
+  return;
+}
+template <typename T, typename S>  // host-side launcher for GatherKernel
+void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis,
+            const size_t dim_at_axis_input, const size_t dim_at_axis_output,
+            const size_t dim_after_axis, cudaStream_t stream) {
+  size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis;  // element count the kernel iterates over
+  GatherKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, index, output, dim_before_axis, dim_at_axis_input,
+                                                             dim_at_axis_output, dim_after_axis);
+  return;  // the launch is only enqueued on `stream`; no synchronization or error check happens here
+}
+// Explicit instantiations exported from this file: 12 value types, each paired with int and int64_t index types.
+template CUDA_LIB_EXPORT void Gather<double, int>(const double *input, const int *index, double *output,
+                                                  const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                  const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<double, int64_t>(const double *input, const int64_t *index, double *output,
+                                                      const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                      const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                      cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<float, int>(const float *input, const int *index, float *output,
+                                                 const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                 const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                 cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<float, int64_t>(const float *input, const int64_t *index, float *output,
+                                                     const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<half, int>(const half *input, const int *index, half *output,
+                                                const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<half, int64_t>(const half *input, const int64_t *index, half *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int64_t, int>(const int64_t *input, const int *index, int64_t *output,
+                                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int64_t, int64_t>(const int64_t *input, const int64_t *index, int64_t *output,
+                                                       const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                       cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int, int>(const int *input, const int *index, int *output,
+                                               const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                               const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int, int64_t>(const int *input, const int64_t *index, int *output,
+                                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int16_t, int>(const int16_t *input, const int *index, int16_t *output,
+                                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int16_t, int64_t>(const int16_t *input, const int64_t *index, int16_t *output,
+                                                       const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                       cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int8_t, int>(const int8_t *input, const int *index, int8_t *output,
+                                                  const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                  const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<int8_t, int64_t>(const int8_t *input, const int64_t *index, int8_t *output,
+                                                      const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                      const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                      cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<unsigned char, int>(const unsigned char *input, const int *index,
+                                                         unsigned char *output, const size_t dim_before_axis,
+                                                         const size_t dim_at_axis_input,
+                                                         const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                         cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<unsigned char, int64_t>(const unsigned char *input, const int64_t *index,
+                                                             unsigned char *output, const size_t dim_before_axis,
+                                                             const size_t dim_at_axis_input,
+                                                             const size_t dim_at_axis_output,
+                                                             const size_t dim_after_axis, cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<bool, int>(const bool *input, const int *index, bool *output,
+                                                const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<bool, int64_t>(const bool *input, const int64_t *index, bool *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint16_t, int>(const uint16_t *input, const int *index, uint16_t *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint16_t, int64_t>(const uint16_t *input, const int64_t *index, uint16_t *output,
+                                                        const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                        cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint32_t, int>(const uint32_t *input, const int *index, uint32_t *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint32_t, int64_t>(const uint32_t *input, const int64_t *index, uint32_t *output,
+                                                        const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                        cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint64_t, int>(const uint64_t *input, const int *index, uint64_t *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void Gather<uint64_t, int64_t>(const uint64_t *input, const int64_t *index, uint64_t *output,
+                                                        const size_t dim_before_axis, const size_t dim_at_axis_input,
+                                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                        cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh
index b6749798553..038841bd0af 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh
@@ -14,11 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_GATHER_GPU_CU_H
-#define MINDSPORE_GATHER_GPU_CU_H
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis,
-            const size_t dim_at_axis_input, const size_t dim_at_axis_output,
-            const size_t dim_after_axis, cudaStream_t stream);
+CUDA_LIB_EXPORT void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis,
+                            const size_t dim_at_axis_input, const size_t dim_at_axis_output,
+                            const size_t dim_after_axis, cudaStream_t stream);
 
-#endif
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu
new file mode 100755
index 00000000000..4e09bd1d8a6
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu
@@ -0,0 +1,154 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+// Scatter-adds every grad element into output at the row picked by index along the gather axis.
+// A negative index is shifted once by dim_at_axis_output; the shifted value is asserted to be in range.
+template <typename T, typename S>
+__global__ void GatherGradKernel(const size_t num, const T *index, const S *grad, S *output,
+                                 const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                 const size_t dim_at_axis_output, const size_t dim_after_axis) {
+  const size_t index_slice = dim_at_axis_index * dim_after_axis;
+  const size_t output_slice = dim_at_axis_output * dim_after_axis;
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < num; pos += step) {
+    const size_t outer = pos / index_slice;
+    const size_t inner = pos % dim_after_axis;
+    T axis_index = index[pos];
+    if (axis_index < 0) {
+      axis_index += static_cast<T>(dim_at_axis_output);
+    }
+    CUDA_KERNEL_ASSERT(axis_index >= 0);
+    const size_t target_row = static_cast<size_t>(axis_index);
+    CUDA_KERNEL_ASSERT(target_row < dim_at_axis_output);
+    // Repeated index values make distinct pos land on the same output slot, so accumulate via MsAtomicAdd.
+    MsAtomicAdd(output + outer * output_slice + target_row * dim_after_axis + inner, grad[pos]);
+  }
+}
+
+// Zero-fills output[0, size): each thread stores an S built from 0 into every element it visits.
+template <typename S>
+__global__ void InitOutput(const size_t size, S *output) {
+  const S kZeroValue = 0;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    output[pos] = kZeroValue;
+  }
+}
+
+// Host launcher: zero-fills `output`, then accumulates `grad` into it; both kernels are enqueued on `stream`.
+template <typename T, typename S>
+void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis,
+                const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis,
+                cudaStream_t stream) {
+  // Pass 1 resets dim_before_axis * dim_at_axis_output * dim_after_axis elements of `output`.
+  const size_t output_count = dim_before_axis * dim_at_axis_output * dim_after_axis;
+  InitOutput<<<GET_BLOCKS(output_count), GET_THREADS, 0, stream>>>(output_count, output);
+  // Pass 2 handles one index/grad element per work item.
+  const size_t index_count = dim_before_axis * dim_at_axis_index * dim_after_axis;
+  GatherGradKernel<<<GET_BLOCKS(index_count), GET_THREADS, 0, stream>>>(
+      index_count, index, grad, output, dim_before_axis, dim_at_axis_index, dim_at_axis_output, dim_after_axis);
+}
+// Explicit instantiations of GatherGrad, one per <index type, grad type> pair compiled into this file.
+template CUDA_LIB_EXPORT void GatherGrad<int, double>(const int *index, const double *grad, double *output,
+                                                      const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                      const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                      cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, double>(const int64_t *index, const double *grad, double *output,
+                                                          const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                          const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                          cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, float>(const int *index, const float *grad, float *output,
+                                                     const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, float>(const int64_t *index, const float *grad, float *output,
+                                                         const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                         const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                         cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, half>(const int *index, const half *grad, half *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, half>(const int64_t *index, const half *grad, half *output,
+                                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                        cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, int>(const int *index, const int *grad, int *output,
+                                                   const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, int>(const int64_t *index, const int *grad, int *output,
+                                                       const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                       cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, int8_t>(const int *index, const int8_t *grad, int8_t *output,
+                                                      const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                      const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                      cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, int8_t>(const int64_t *index, const int8_t *grad, int8_t *output,
+                                                          const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                          const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                          cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, int16_t>(const int *index, const int16_t *grad, int16_t *output,
+                                                       const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                       cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, int16_t>(const int64_t *index, const int16_t *grad, int16_t *output,
+                                                           const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                           const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                           cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, int64_t>(const int *index, const int64_t *grad, int64_t *output,
+                                                       const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                       cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, int64_t>(const int64_t *index, const int64_t *grad, int64_t *output,
+                                                           const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                           const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                           cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, unsigned char>(const int *index, const unsigned char *grad,
+                                                             unsigned char *output, const size_t dim_before_axis,
+                                                             const size_t dim_at_axis_index,
+                                                             const size_t dim_at_axis_output,
+                                                             const size_t dim_after_axis, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, unsigned char>(const int64_t *index, const unsigned char *grad,
+                                                                 unsigned char *output, const size_t dim_before_axis,
+                                                                 const size_t dim_at_axis_index,
+                                                                 const size_t dim_at_axis_output,
+                                                                 const size_t dim_after_axis, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, unsigned int>(const int *index, const unsigned int *grad,
+                                                            unsigned int *output, const size_t dim_before_axis,
+                                                            const size_t dim_at_axis_index,
+                                                            const size_t dim_at_axis_output,
+                                                            const size_t dim_after_axis, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, unsigned int>(const int64_t *index, const unsigned int *grad,
+                                                                unsigned int *output, const size_t dim_before_axis,
+                                                                const size_t dim_at_axis_index,
+                                                                const size_t dim_at_axis_output,
+                                                                const size_t dim_after_axis, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int, bool>(const int *index, const bool *grad, bool *output,
+                                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherGrad<int64_t, bool>(const int64_t *index, const bool *grad, bool *output,
+                                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
+                                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
+                                                        cudaStream_t stream);
+
+
+
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh
index 974e4e36268..b482c13b2fe 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh
@@ -14,11 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_GATHER_GRAD_GPU_CU_H
-#define MINDSPORE_GATHER_GRAD_GPU_CU_H
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis,
-                const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis,
-                cudaStream_t stream);
+CUDA_LIB_EXPORT void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis,
+                                const size_t dim_at_axis_index, const size_t dim_at_axis_output,
+                                const size_t dim_after_axis, cudaStream_t stream);
 
-#endif
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu
new file mode 100644
index 00000000000..670f0909449
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh"
+#include "include/cuda_fp16.h"
+// Gathers rows of `input` addressed by index tuples: output row i is read starting at the offset
+// sum_k indices[i * indices_dim1 + k] * batch_strides[k], and every row holds output_dim1 elements.
+// A row is filled with 0 when any index component lies outside [0, batch_indices[k]).
+template <typename T, typename S>
+__global__ void GatherNdKernel(T *input, S *indices, T *output, const size_t output_dim0, const size_t output_dim1,
+                               const size_t indices_dim1, S *batch_indices, S *batch_strides) {
+  // size_t keeps the element count and all offsets from being truncated to 32-bit int.
+  const size_t num = output_dim0 * output_dim1;
+  for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num;
+       write_index += blockDim.x * gridDim.x) {
+    const size_t i = write_index / output_dim1 % output_dim0;
+    const size_t j = write_index % output_dim1;
+    bool out_of_bound = false;
+    size_t read_index = 0;
+    for (size_t k = 0; k < indices_dim1; k++) {
+      // Keep the component in its own type S so 64-bit indices are not narrowed before the range check.
+      const S indices_i = indices[indices_dim1 * i + k];
+      // Reject negative components too: an upper-bound-only test lets them through with a bogus offset.
+      const bool in_range = (indices_i >= 0) && (indices_i < batch_indices[k]);
+      out_of_bound |= !in_range;
+      // read_index is only consumed when every component was in range, so a wrapped value is never used.
+      read_index += static_cast<size_t>(indices_i) * static_cast<size_t>(batch_strides[k]);
+    }
+    if (!out_of_bound) {
+      output[write_index] = input[read_index + j];
+    } else {
+      output[write_index] = 0;
+    }
+  }
+}
+// Host launcher: enqueues GatherNdKernel on `stream` to produce output_dim0 * output_dim1 output elements.
+template <typename T, typename S>
+void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
+              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream) {
+  const size_t size = output_dim0 * output_dim1;  // size_t: the product is not narrowed to int
+  GatherNdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
+                                                               indices_dim1, batch_indices, batch_strides);
+}
+// Explicit instantiations of GatherNd, one per <T value type, S index type> pair compiled into this file.
+template CUDA_LIB_EXPORT void GatherNd<double, int>(double *input, int *indices, double *output,
+                                                    const size_t &output_dim0, const size_t &output_dim1,
+                                                    const size_t &indices_dim1, int *batch_indices, int *batch_strides,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<float, int>(float *input, int *indices, float *output, const size_t &output_dim0,
+                                                   const size_t &output_dim1, const size_t &indices_dim1,
+                                                   int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<half, int>(half *input, int *indices, half *output, const size_t &output_dim0,
+                                                  const size_t &output_dim1, const size_t &indices_dim1,
+                                                  int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<int, int>(int *input, int *indices, int *output, const size_t &output_dim0,
+                                                 const size_t &output_dim1, const size_t &indices_dim1,
+                                                 int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0,  // NOLINT
+                                                   const size_t &output_dim1, const size_t &indices_dim1,
+                                                   int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<unsigned int, int>(unsigned int *input, int *indices, unsigned int *output,
+                                                          const size_t &output_dim0, const size_t &output_dim1,
+                                                          const size_t &indices_dim1, int *batch_indices,
+                                                          int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<char, int>(char *input, int *indices, char *output, const size_t &output_dim0,
+                                                  const size_t &output_dim1, const size_t &indices_dim1,
+                                                  int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *output,
+                                                           const size_t &output_dim0, const size_t &output_dim1,
+                                                           const size_t &indices_dim1, int *batch_indices,
+                                                           int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0,
+                                                  const size_t &output_dim1, const size_t &indices_dim1,
+                                                  int *batch_indices, int *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<double, int64_t>(double *input, int64_t *indices, double *output,
+                                                        const size_t &output_dim0, const size_t &output_dim1,
+                                                        const size_t &indices_dim1, int64_t *batch_indices,
+                                                        int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<float, int64_t>(float *input, int64_t *indices, float *output,
+                                                       const size_t &output_dim0, const size_t &output_dim1,
+                                                       const size_t &indices_dim1, int64_t *batch_indices,
+                                                       int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<half, int64_t>(half *input, int64_t *indices, half *output,
+                                                      const size_t &output_dim0, const size_t &output_dim1,
+                                                      const size_t &indices_dim1, int64_t *batch_indices,
+                                                      int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<int, int64_t>(int *input, int64_t *indices, int *output,
+                                                     const size_t &output_dim0, const size_t &output_dim1,
+                                                     const size_t &indices_dim1, int64_t *batch_indices,
+                                                     int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<short, int64_t>(short *input, int64_t *indices, short *output,  // NOLINT
+                                                       const size_t &output_dim0, const size_t &output_dim1,
+                                                       const size_t &indices_dim1, int64_t *batch_indices,
+                                                       int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<unsigned int, int64_t>(unsigned int *input, int64_t *indices,
+                                                              unsigned int *output, const size_t &output_dim0,
+                                                              const size_t &output_dim1, const size_t &indices_dim1,
+                                                              int64_t *batch_indices, int64_t *batch_strides,
+                                                              cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<char, int64_t>(char *input, int64_t *indices, char *output,
+                                                      const size_t &output_dim0, const size_t &output_dim1,
+                                                      const size_t &indices_dim1, int64_t *batch_indices,
+                                                      int64_t *batch_strides, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<unsigned char, int64_t>(unsigned char *input, int64_t *indices,
+                                                               unsigned char *output, const size_t &output_dim0,
+                                                               const size_t &output_dim1, const size_t &indices_dim1,
+                                                               int64_t *batch_indices, int64_t *batch_strides,
+                                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherNd<bool, int64_t>(bool *input, int64_t *indices, bool *output,
+                                                      const size_t &output_dim0, const size_t &output_dim1,
+                                                      const size_t &indices_dim1, int64_t *batch_indices,
+                                                      int64_t *batch_strides, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh
index 127a45f1a25..8fe6e68298a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T, typename S>
-void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
-              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream);
+CUDA_LIB_EXPORT void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
+                              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu
new file mode 100755
index 00000000000..7cb9a1d0bbe
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh"
+#include "include/cuda_fp16.h"
+// Gathers along the middle axis: output[i][j][k] = input[i][indices[j]][k] over a 3-D view of both buffers.
+// Positions whose index falls outside [0, input_dim1) are written as 0.
+template <typename T, typename S>
+__global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
+                               size_t output_dim2, size_t input_dim1) {
+  const size_t total = output_dim0 * output_dim1 * output_dim2;
+  const size_t plane = output_dim1 * output_dim2;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < total; pos += blockDim.x * gridDim.x) {
+    const size_t outer = pos / plane % output_dim0;
+    const size_t mid = pos / output_dim2 % output_dim1;
+    const size_t inner = pos % output_dim2;
+    const S picked = indices[mid];
+    const bool valid = (picked >= 0) && (picked < input_dim1);
+    if (!valid) {
+      output[pos] = 0;
+      continue;
+    }
+    // input is addressed as [output_dim0][input_dim1][output_dim2].
+    output[pos] = input[outer * input_dim1 * output_dim2 + picked * output_dim2 + inner];
+  }
+}
+// Host launcher: enqueues GatherV2Kernel on `stream` for output_dim0 * output_dim1 * output_dim2 elements.
+template <typename T, typename S>
+void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
+              size_t input_dim1, cudaStream_t stream) {
+  const size_t element_count = output_dim0 * output_dim1 * output_dim2;
+  GatherV2Kernel<<<GET_BLOCKS(element_count), GET_THREADS, 0, stream>>>(
+      input, indices, output, output_dim0, output_dim1, output_dim2, input_dim1);
+}
+// Explicit instantiations of GatherV2, one per <T value type, S index type> pair compiled into this file.
+template CUDA_LIB_EXPORT void GatherV2<float, int>(float *input, int *indices, float *output, size_t output_dim0,
+                                                   size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                   cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<float, int64_t>(float *input, int64_t *indices, float *output,
+                                                       size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                       size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<half, int>(half *input, int *indices, half *output, size_t output_dim0,
+                                                  size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<half, int64_t>(half *input, int64_t *indices, half *output, size_t output_dim0,
+                                                      size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                      cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<double, int>(double *input, int *indices, double *output, size_t output_dim0,
+                                                    size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<double, int64_t>(double *input, int64_t *indices, double *output,
+                                                        size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                        size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int, int>(int *input, int *indices, int *output, size_t output_dim0,
+                                                 size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                 cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int, int64_t>(int *input, int64_t *indices, int *output, size_t output_dim0,
+                                                     size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int16_t, int>(int16_t *input, int *indices, int16_t *output, size_t output_dim0,
+                                                     size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int16_t, int64_t>(int16_t *input, int64_t *indices, int16_t *output,
+                                                         size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                         size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int8_t, int>(int8_t *input, int *indices, int8_t *output, size_t output_dim0,
+                                                    size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<int8_t, int64_t>(int8_t *input, int64_t *indices, int8_t *output,
+                                                        size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                        size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<uint32_t, int>(uint32_t *input, int *indices, uint32_t *output,
+                                                      size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                      size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<uint32_t, int64_t>(uint32_t *input, int64_t *indices, uint32_t *output,
+                                                          size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                          size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<uint8_t, int>(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0,
+                                                     size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<uint8_t, int64_t>(uint8_t *input, int64_t *indices, uint8_t *output,
+                                                         size_t output_dim0, size_t output_dim1, size_t output_dim2,
+                                                         size_t input_dim1, cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<bool, int>(bool *input, int *indices, bool *output, size_t output_dim0,
+                                                  size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                  cudaStream_t stream);
+template CUDA_LIB_EXPORT void GatherV2<bool, int64_t>(bool *input, int64_t *indices, bool *output, size_t output_dim0,
+                                                      size_t output_dim1, size_t output_dim2, size_t input_dim1,
+                                                      cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh
index 944b08f9596..aa82f2f74b2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
-              size_t input_dim1, cudaStream_t stream);
+CUDA_LIB_EXPORT void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
+                              size_t output_dim2, size_t input_dim1, cudaStream_t stream);
 
 template <typename T, typename S>
 __global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
                                size_t output_dim2, size_t input_dim1);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu
similarity index 89%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu
index 0d6b5614b47..2b46041b1ac 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) {
@@ -127,7 +127,9 @@ void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cud
   }
 }
 
-template void Gelu(size_t size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
-template void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
-template void GeluGradKernel(size_t size, float *dy_addr, float *x_addr, float *dx_addr, cudaStream_t cuda_stream);
-template void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Gelu(size_t size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GeluGradKernel(size_t size, float *dy_addr, float *x_addr, float *dx_addr,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr,
+                                             cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh
index 43992e3d260..d856c4b3afc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh
@@ -14,14 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template<typename T>
-void Softplus(const size_t input_size, const T* input_addr, T* output_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void Gelu(size_t input_size, T* input_addr, T* output_addr, cudaStream_t cuda_stream);
 
 template<typename T>
-void SoftplusGrad(const size_t size, const T* dy_addr, const T* x_addr, T* dx_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu
similarity index 92%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu
index 195f5f5ea03..500fea07afa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu
@@ -16,9 +16,8 @@
 
 #include <algorithm>
 #include <limits>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
-#include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh"
 
 const int kWarpSize = 32;
 const int kBlockSize = 512;
@@ -327,9 +326,12 @@ void CalGeneralReduction(bool small, const T *input, const size_t bound, const s
   return;
 }
 
-template void CalGeneralReduction(bool small, const double *input, const size_t bound_, const size_t outerSize_,
-                                  const size_t innerSize_, int *index, double *output, cudaStream_t cuda_stream);
-template void CalGeneralReduction(bool small, const float *input, const size_t bound_, const size_t outerSize_,
-                                  const size_t innerSize_, int *index, float *output, cudaStream_t cuda_stream);
-template void CalGeneralReduction(bool small, const half *input, const size_t bound_, const size_t outerSize_,
-                                  const size_t innerSize_, int *index, half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const double *input, const size_t bound_,
+                                                  const size_t outerSize_, const size_t innerSize_, int *index,
+                                                  double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const float *input, const size_t bound_,
+                                                  const size_t outerSize_, const size_t innerSize_, int *index,
+                                                  float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const half *input, const size_t bound_,
+                                                  const size_t outerSize_, const size_t innerSize_, int *index,
+                                                  half *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh
index b09cf08e4cc..fc6b7237eaf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void CalGeneralReduction(bool small, const T *input, const size_t bound_, const size_t outerSize_,
-                         const size_t innerSize_, S *index, T *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_
+CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const T *input, const size_t bound_, const size_t outerSize_,
+                                         const size_t innerSize_, S *index, T *output, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu
similarity index 80%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu
index fa7550e228b..9e28741bd8c 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu
@@ -15,7 +15,6 @@
  */
 
 #include "hash_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void HashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size,
@@ -57,8 +56,10 @@ void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_inde
   return;
 }
 
-template void DoHashSwapOut<float>(const float *hash_table, float *swap_out_value, const int *swap_out_index,
-                                   const int index_size, const int hash_dim, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DoHashSwapOut<float>(const float *hash_table, float *swap_out_value,
+                                                   const int *swap_out_index, const int index_size, const int hash_dim,
+                                                   cudaStream_t cuda_stream);
 
-template void DoHashSwapIn<float>(float *hash_table, const float *swap_in_value, const int *swap_in_index,
-                                  const int index_size, const int hash_dim, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void DoHashSwapIn<float>(float *hash_table, const float *swap_in_value,
+                                                  const int *swap_in_index, const int index_size, const int hash_dim,
+                                                  cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh
new file mode 100755
index 00000000000..1748047af93
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Declarations only; the definitions are in hash_impl.cu, which explicitly instantiates and exports
+// both functions for T = float and no other element type.
+// DoHashSwapOut takes hash_table as const input and swap_out_value as the mutable destination buffer.
+// NOTE(review): presumably copies index_size rows of hash_dim elements chosen by swap_out_index - confirm.
+template <typename T>
+CUDA_LIB_EXPORT void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index,
+                                   const int index_size, const int hash_dim, cudaStream_t cuda_stream);
+
+// DoHashSwapIn reverses the roles: swap_in_value is the const input and hash_table is the mutable buffer.
+template <typename T>
+CUDA_LIB_EXPORT void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index,
+                                  const int index_size, const int hash_dim, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu
similarity index 68%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu
index 6558715cad0..f9b897419f3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void HsigmoidKernel(size_t size, const T *input, T *output) {
@@ -43,10 +44,12 @@ void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, c
   HsigmoidGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dout, x, output);
 }
 
-template void CalHSigmoid<half>(const size_t &size, const half *input, half *output, cudaStream_t cuda_stream);
-template void CalHSigmoid<float>(const size_t &size, const float *input, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSigmoid<half>(const size_t &size, const half *input, half *output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSigmoid<float>(const size_t &size, const float *input, float *output,
+                                                 cudaStream_t cuda_stream);
 
-template void CalHSigmoidGrad<half>(const size_t &size, const half *dout, const half *x, half *output,
-                                    cudaStream_t cuda_stream);
-template void CalHSigmoidGrad<float>(const size_t &size, const float *dout, const float *x, float *output,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSigmoidGrad<half>(const size_t &size, const half *dout, const half *x, half *output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSigmoidGrad<float>(const size_t &size, const float *dout, const float *x,
+                                                     float *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh
new file mode 100644
index 00000000000..fd61ec0aabf
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Declarations only; the definitions are in hsigmoid_impl.cu, which explicitly instantiates and exports
+// both functions for T = half and T = float.
+// NOTE(review): CalHSigmoid presumably applies the forward hard-sigmoid to `size` elements - confirm in the .cu.
+template <typename T>
+CUDA_LIB_EXPORT void CalHSigmoid(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream);
+
+// Launches HsigmoidGradKernel on cuda_stream with (size, dout, x, output); dout and x are const inputs.
+template <typename T>
+CUDA_LIB_EXPORT void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output,
+                                     cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu
similarity index 73%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu
index 982e0eba2fb..22f11903e57 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void HSwishKernel(size_t size, const T *input, T *output) {
@@ -62,10 +63,12 @@ void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cud
   HSwishGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dout, x, output);
 }
 
-template void CalHSwish<half>(const size_t &size, const half *input, half *output, cudaStream_t cuda_stream);
-template void CalHSwish<float>(const size_t &size, const float *input, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSwish<half>(const size_t &size, const half *input, half *output,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSwish<float>(const size_t &size, const float *input, float *output,
+                                               cudaStream_t cuda_stream);
 
-template void CalHSwishGrad<half>(const size_t &size, const half *dout, const half *x, half *output,
-                                    cudaStream_t cuda_stream);
-template void CalHSwishGrad<float>(const size_t &size, const float *dout, const float *x, float *output,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSwishGrad<half>(const size_t &size, const half *dout, const half *x, half *output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalHSwishGrad<float>(const size_t &size, const float *dout, const float *x, float *output,
+                                                   cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh
similarity index 55%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh
index 416f40f52d2..18cfb4bf007 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CalHSwish(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalHSwish(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream);
 
 template <typename T>
-void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu
index 8a10e340992..b7571578080 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 #include "in_top_k_impl.cuh"
-
 #include <cuda_runtime.h>
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void InTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output,
@@ -39,9 +37,10 @@ void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const
                                                                       batch_size, class_id_count, k);
 }
 
-template void CalInTopK<half>(const half *predictions, const int32_t *targets, bool *output, const half *top_k_output,
-                              size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalInTopK<half>(const half *predictions, const int32_t *targets, bool *output,
+                                              const half *top_k_output, size_t batch_size, size_t class_id_count,
+                                              int64_t k, cudaStream_t cuda_stream);
 
-template void CalInTopK<float>(const float *predictions, const int32_t *targets, bool *output,
-                               const float *top_k_output, size_t batch_size, size_t class_id_count, int64_t k,
-                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalInTopK<float>(const float *predictions, const int32_t *targets, bool *output,
+                                               const float *top_k_output, size_t batch_size, size_t class_id_count,
+                                               int64_t k, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh
index f72b20ab434..5e3fa35c67f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_
 #include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
-               size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output,
+                               size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu
index d5743543d23..31921bb7610 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 #include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "include/cuda_fp16.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 template <typename T>
 __global__ void IndexAddAtomic(T *dst, const int *index, const T *src, const size_t src_size, const size_t outer_size,
   const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size) {
@@ -61,24 +59,29 @@ void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size
   }
 }
 
-template void CalIndexAdd<double>(double *dst, const int *index, const double *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<float>(float *dst, const int *index, const float *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<half>(half *dst, const int *index, const half *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<int>(int *dst, const int *index, const int *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<int16_t>(int16_t *dst, const int *index, const int16_t *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<int8_t>(int8_t *dst, const int *index, const int8_t *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
-template void CalIndexAdd<uint8_t>(uint8_t *dst, const int *index, const uint8_t *src, const size_t outer_size,
-  const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock,
-  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<double>(double *dst, const int *index, const double *src,
+                                                  const size_t outer_size, const size_t src_axis_size,
+                                                  const size_t dst_axis_size, const size_t inner_size,
+                                                  const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<float>(float *dst, const int *index, const float *src,
+                                                 const size_t outer_size, const size_t src_axis_size,
+                                                 const size_t dst_axis_size, const size_t inner_size,
+                                                 const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<half>(half *dst, const int *index, const half *src, const size_t outer_size,
+                                                const size_t src_axis_size, const size_t dst_axis_size,
+                                                const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<int>(int *dst, const int *index, const int *src, const size_t outer_size,
+                                               const size_t src_axis_size, const size_t dst_axis_size,
+                                               const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<int16_t>(int16_t *dst, const int *index, const int16_t *src,
+                                                   const size_t outer_size, const size_t src_axis_size,
+                                                   const size_t dst_axis_size, const size_t inner_size,
+                                                   const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<int8_t>(int8_t *dst, const int *index, const int8_t *src,
+                                                  const size_t outer_size, const size_t src_axis_size,
+                                                  const size_t dst_axis_size, const size_t inner_size,
+                                                  const bool use_lock, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalIndexAdd<uint8_t>(uint8_t *dst, const int *index, const uint8_t *src,
+                                                   const size_t outer_size, const size_t src_axis_size,
+                                                   const size_t dst_axis_size, const size_t inner_size,
+                                                   const bool use_lock, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh
index ab993c93603..ef753c429f6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size, const size_t src_axis_size,
-  const size_t dst_axis_size, const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_
+CUDA_LIB_EXPORT void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size,
+                                 const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size,
+                                 const bool use_lock, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu
similarity index 96%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu
index 3698c082345..bb948cc8e75 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void CopyMemKernel(const size_t thread_num, const size_t N, const size_t C,
                               float *gamma_addr, float *beta_addr,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh
new file mode 100644
index 00000000000..56e1d869c69
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+CUDA_LIB_EXPORT void CopyMemDevice2Device(const size_t N, const size_t C, float *gamma_addr, float *beta_addr,
+                                          float *running_mean_addr, float *running_variance_addr, float *ws_gamma,
+                                          float *ws_beta, float *ws_mean, float *ws_var, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void ComputeMean(const size_t N, const size_t C, float *dgamma, float *dbeta,
+                                 const float *ws_dgamma, const float *ws_dbeta, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu
similarity index 85%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu
index ca8c51e54e5..819a7f54581 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh"
+#include "include/cuda_fp16.h"
 
 __device__ float CoordinateMax(const float a, const float b) {
   return (a > b ? a : b);
@@ -67,7 +68,7 @@ void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const
   IOUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, box1, box2, iou_results, mode, input_len_0);
 }
 
-template void IOU(const size_t &size, const float *box1, const float *box2, float *iou_results, const size_t &mode,
-                  const size_t &input_len_0, cudaStream_t cuda_stream);
-template void IOU(const size_t &size, const half *box1, const half *box2, half *iou_results, const size_t &mode,
-                  const size_t &input_len_0, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void IOU<float>(const size_t &size, const float *box1, const float *box2, float *iou_results,
+                                         const size_t &mode, const size_t &input_len_0, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void IOU<half>(const size_t &size, const half *box1, const half *box2, half *iou_results,
+                                        const size_t &mode, const size_t &input_len_0, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh
index c9d0de6238b..7e18195ea53 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 #define IOU_LOCATION_NUM 2
 #define IOU_DIMENSION 4
 
 template <typename T>
-void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const size_t &mode,
-         const size_t &input_len_0, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results,
+                         const size_t &mode, const size_t &input_len_0, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu
similarity index 77%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu
index 6c303594cfa..ba136cc10e0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu
@@ -15,8 +15,7 @@
  */
 
 #include "l2_loss.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 __global__ void L2LossKernel(const size_t input_size, const T *input , T *output) {
@@ -39,5 +38,7 @@ void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t st
   L2LossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input, output);
 }
 
-template void L2Loss<float>(const size_t input_size, const float *input , float *output, cudaStream_t stream);
-template void L2Loss<half>(const size_t input_size, const half *input , half *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void L2Loss<float>(const size_t input_size, const float *input,
+                                            float *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void L2Loss<half>(const size_t input_size, const half *input,
+                                           half *output, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh
similarity index 60%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh
index 428451c84fe..b8d544dea07 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_
+CUDA_LIB_EXPORT void L2Loss(const size_t input_size, const T *input, T *output, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu
similarity index 67%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu
index 25603f50874..7d617ba33fb 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu
@@ -15,7 +15,6 @@
  */
 
 #include "l2normalize_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
 template <typename T>
 __global__ void AssignEps(const size_t size, const float eps, T* value) {
@@ -31,6 +30,9 @@ void GetMaxWithEpsAndValue(const size_t size, const float eps, T* value, cudaStr
   AssignEps<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, eps, value);
 }
 
-template void GetMaxWithEpsAndValue<float>(const size_t size, const float eps, float* value, cudaStream_t cuda_stream);
-template void GetMaxWithEpsAndValue<half>(const size_t size, const float eps, half* value, cudaStream_t cuda_stream);
-template void GetMaxWithEpsAndValue<int>(const size_t size, const float eps, int* value, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue<float>(const size_t size, const float eps,
+                                                           float *value, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue<half>(const size_t size, const float eps,
+                                                          half *value, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue<int>(const size_t size, const float eps,
+                                                         int *value, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh
similarity index 58%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh
index 1f37cef9158..9fe7a4d2145 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void GetMaxWithEpsAndValue(const size_t size, const float eps, T* value, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void GetMaxWithEpsAndValue(const size_t size, const float eps, T *value, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu
similarity index 92%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu
index 3cf14c491e9..af8bd26b6a2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu
@@ -17,8 +17,9 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh"
+#include "include/cuda_fp16.h"
 
 constexpr int THREAD_PER_BLOCK = 256;
 constexpr int NUM_PER_THREAD_REDUCE = 4;
@@ -404,12 +405,15 @@ void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int &param_
 }
 
 
-template void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int &param_dim, float *global_sum1,
-                                float *global_sum2, const float &epsilon, const float *dy, const float *x,
-                                const float *mean, const float *var, const float *gamma, const float *grad_dx,
-                                const float *grad_dg, const float *grad_db, float *d_dy, float *d_x, float *d_gamma,
-                                cudaStream_t stream);
-template void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int &param_dim, half *global_sum1,
-                                half *global_sum2, const half &epsilon, const half *dy, const half *x, const half *mean,
-                                const half *var, const half *gamma, const half *grad_dx, const half *grad_dg,
-                                const half *grad_db, half *d_dy, half *d_x, half *d_gamma, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNormGradGrad<float>(const int &row_dim, const int &col_dim, const int &param_dim,
+                                                       float *global_sum1, float *global_sum2, const float &epsilon,
+                                                       const float *dy, const float *x, const float *mean,
+                                                       const float *var, const float *gamma, const float *grad_dx,
+                                                       const float *grad_dg, const float *grad_db, float *d_dy,
+                                                       float *d_x, float *d_gamma, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNormGradGrad<half>(const int &row_dim, const int &col_dim, const int &param_dim,
+                                                      half *global_sum1, half *global_sum2, const half &epsilon,
+                                                      const half *dy, const half *x, const half *mean,
+                                                      const half *var, const half *gamma, const half *grad_dx,
+                                                      const half *grad_dg, const half *grad_db, half *d_dy,
+                                                      half *d_x, half *d_gamma, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh
new file mode 100644
index 00000000000..134e90d0e29
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int &param_dim, T *global_sum1,
+                                       T *global_sum2, const T &epsilon, const T *dy, const T *x, const T *mean,
+                                       const T *var, const T *gamma, const T *grad_dx, const T *grad_dg,
+                                       const T *grad_db, T *d_dy, T *d_x, T *d_gamma, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu
similarity index 91%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu
index 9133d4f35eb..974f2e488f3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu
@@ -17,8 +17,9 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh"
+#include "include/cuda_fp16.h"
 
 constexpr int NUM_PER_THREAD_REDUCE = 4;
 constexpr int WARP_SIZE = 32;
@@ -249,9 +250,11 @@ void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim,
                                                                                   epsilon, dy, x, mean, var, dg, db);
 }
 
-template void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, const float &epsilon,
-                            const float *dy, const float *x, const float *mean, const float *var, const float *gamma,
-                            float *dx, float *dg, float *db, cudaStream_t stream);
-template void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, const half &epsilon,
-                            const half *dy, const half *x, const half *mean, const half *var, const half *gamma,
-                            half *dx, half *dg, half *db, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNormGrad<float>(const int &row_dim, const int &col_dim, const int &param_dim,
+                                                   const float &epsilon, const float *dy, const float *x,
+                                                   const float *mean, const float *var, const float *gamma, float *dx,
+                                                   float *dg, float *db, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNormGrad<half>(const int &row_dim, const int &col_dim, const int &param_dim,
+                                                  const half &epsilon, const half *dy, const half *x,
+                                                  const half *mean, const half *var, const half *gamma, half *dx,
+                                                  half *dg, half *db, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh
new file mode 100644
index 00000000000..29ce6723579
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, const T &epsilon,
+                                   const T *dy, const T *x, const T *mean, const T *var, const T *gamma, T *dx, T *dg,
+                                   T *db, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu
similarity index 89%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu
index 91bc9710276..2fcfdba78c9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu
@@ -17,7 +17,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh"
 
 constexpr int NUM_PER_THREAD_REDUCE = 4;
 constexpr int WARP_SIZE = 32;
@@ -149,9 +149,9 @@ void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, con
                                                                          beta, y, mean, var);
 }
 
-template void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const float &epsilon,
-                        const float *x, const float *gamma, const float *beta, float *y, float *mean, float *var,
-                        cudaStream_t stream);
-template void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const half &epsilon,
-                        const half *x, const half *gamma, const half *beta, half *y, half *mean, half *var,
-                        cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim,
+                                        const float &epsilon, const float *x, const float *gamma,
+                                        const float *beta, float *y, float *mean, float *var, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim,
+                                        const half &epsilon, const half *x, const half *gamma,
+                                        const half *beta, half *y, half *mean, half *var, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh
index cb1674bbc20..5f4ea1ab38e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 struct DynamicSharedMem;
@@ -37,7 +37,7 @@ struct DynamicSharedMem<half> {
 };
 
 template <typename T>
-void LayerNorm(const int& outer, const int& inner, const int& param_dim, const T& epsilon, const T* x, const T* gamma,
-               const T* beta, T* y, T* mean, T* var, cudaStream_t stream);
+CUDA_LIB_EXPORT void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon,
+                               const T* x, const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu
similarity index 81%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu
index 30bd353b6a5..d0f5ed05851 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/linspace.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh"
 #include <iostream>
 
 template <typename T>
@@ -28,5 +28,5 @@ template <typename T>
 void calLinSpace(const T *start, const T *stop, const size_t value_count, T *output, cudaStream_t cuda_stream) {
   LinSpaceKernel<<<GET_BLOCKS(value_count), GET_THREADS, 0, cuda_stream>>>(start, stop, value_count, output);
 }
-template void calLinSpace<float>(const float *start, const float *stop, const size_t value_count, float *output,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void calLinSpace<float>(const float *start, const float *stop,
+                                                 const size_t value_count, float *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh
index c0d3474b187..4948bb9dd22 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void calLinSpace(const T *start, const T *stop, const size_t value_count, T *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_
+CUDA_LIB_EXPORT void calLinSpace(const T *start, const T *stop, const size_t value_count,
+                                 T *output, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu
index 1d8b4c33408..85de67182c7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh"
 #include "include/cuda_fp16.h"
 
 template <typename T>
@@ -87,18 +86,24 @@ void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int
   return;
 }
 
-template void CalLocalResponseNormNHWC<float>(const float *input, const int depth_radius, const float bias,
-  const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, float *output,
-  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalLocalResponseNormNHWC<float>(
+    const float *input, const int depth_radius, const float bias, const float alpha, const float beta,
+    const size_t channels, const size_t num_elements,
+    float *scale, float *output, cudaStream_t cuda_stream);
 
-template void CalLocalResponseNormNHWC<half>(const half *input, const int depth_radius, const float bias,
-  const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, half *output,
-  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalLocalResponseNormNHWC<half>(
+    const half *input, const int depth_radius, const float bias, const float alpha, const float beta,
+    const size_t channels, const size_t num_elements,
+    float *scale, half *output, cudaStream_t cuda_stream);
 
-template void CalLocalResponseNormGradNHWC<float>(const float *dy, const float *x, const float *y,
-  const int depth_radius, const float bias, const float alpha, const float beta, const size_t channels,
-  const size_t num_elements, float *scale, float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC<float>(
+    const float *dy, const float *x, const float *y,
+    const int depth_radius, const float bias, const float alpha, const float beta,
+    const size_t channels, const size_t num_elements,
+    float *scale, float *dx, cudaStream_t cuda_stream);
 
-template void CalLocalResponseNormGradNHWC<half>(const half *dy, const half *x, const half *y,
-  const int depth_radius, const float bias, const float alpha, const float beta, const size_t channels,
-  const size_t num_elements, float *scale, half *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC<half>(
+    const half *dy, const half *x, const half *y,
+    const int depth_radius, const float bias, const float alpha, const float beta,
+    const size_t channels, const size_t num_elements,
+    float *scale, half *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh
new file mode 100644
index 00000000000..0468a0f3778
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // declaration only; this header provides no body or instantiation list
+CUDA_LIB_EXPORT void CalLocalResponseNormNHWC(const T *input, const int depth_radius, const float bias,
+                                              const float alpha, const float beta, const size_t channels,
+                                              const size_t num_elements, float *scale, T *output,
+                                              cudaStream_t cuda_stream);  // scale is float * regardless of T
+
+template <typename T>  // explicitly instantiated for float and half by this change
+CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int depth_radius,
+                                                  const float bias, const float alpha, const float beta,
+                                                  const size_t channels, const size_t num_elements, float *scale, T *dx,
+                                                  cudaStream_t cuda_stream);  // scale is float * even when T = half
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu
similarity index 85%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu
index 592300196a7..84eccc296a7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu
@@ -16,9 +16,7 @@
 
 #include <vector>
 #include <iostream>
-
-#include "plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh"
 
 template <typename T>
 struct LogicalNotFunc {
@@ -37,4 +35,4 @@ void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream) {
   return LogicalNotKernel<T, LogicalNotFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x, y);
 }
 
-template void LogicalNotImpl(const int &nums, const bool *x, bool *y, cudaStream_t stream);
+template CUDA_LIB_EXPORT void LogicalNotImpl(const int &nums, const bool *x, bool *y, cudaStream_t stream);  // T = bool
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh
similarity index 59%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh
index dbf8185b67f..4a5af18bbe8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_
 #include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream);
+CUDA_LIB_EXPORT void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream);  // instantiated: bool
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu
similarity index 70%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu
index 96b99c5de82..0d2d2c76166 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu
@@ -16,7 +16,6 @@
 
 #include <algorithm>
 #include "loss_with_reduction_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "util.cuh"
 
 inline __device__ float logT(float x) { return logf(x); }
@@ -383,62 +382,77 @@ void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const
                                                                dloss, dinput);
 }
 
-template void KLDivLoss<float>(const int &input_size, const ReductionMode &reduction, const float *input_x,
-                               const float *input_y, float *loss, float *tmp_loss, cudaStream_t stream);
+template CUDA_LIB_EXPORT void KLDivLoss<float>(const int &input_size, const ReductionMode &reduction,
+                                               const float *input_x, const float *input_y, float *loss, float *tmp_loss,
+                                               cudaStream_t stream);  // T = float instantiations start here
 
-template void KLDivLossGrad<float>(const int &input_size, const ReductionMode &reduction, const float *input_x,
-                                   const float *input_y, const float *dloss, float *dx, float *dy, cudaStream_t stream);
+template CUDA_LIB_EXPORT void KLDivLossGrad<float>(const int &input_size, const ReductionMode &reduction,
+                                                   const float *input_x, const float *input_y, const float *dloss,
+                                                   float *dx, float *dy, cudaStream_t stream);
 
-template void BinaryCrossEntropyLoss<float>(const int &input_size, const ReductionMode &reduction, const float *input_x,
-                                            const float *input_y, const float *weight, float *loss, float *tmp_loss,
-                                            cudaStream_t stream);
+template CUDA_LIB_EXPORT void BinaryCrossEntropyLoss<float>(const int &input_size, const ReductionMode &reduction,
+                                                            const float *input_x, const float *input_y,
+                                                            const float *weight, float *loss, float *tmp_loss,
+                                                            cudaStream_t stream);
 
-template void BinaryCrossEntropyLossGrad<float>(const int &input_size, const ReductionMode &reduction,
-                                                const float *input_x, const float *input_y, const float *weight,
-                                                const float *dloss, float *dx, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BinaryCrossEntropyLossGrad<float>(const int &input_size, const ReductionMode &reduction,
+                                                                const float *input_x, const float *input_y,
+                                                                const float *weight, const float *dloss, float *dx,
+                                                                cudaStream_t stream);
 
-template void NLLLoss<float, float>(const int n, const int c, const ReductionMode reduction, const float *input,
-                                    const int32_t *target, const float *weight, float *loss, float *total_weight,
-                                    float *tmp_loss, float *tmp_target_weight, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLoss<float, float>(const int n, const int c, const ReductionMode reduction,
+                                                    const float *input, const int32_t *target, const float *weight,
+                                                    float *loss, float *total_weight, float *tmp_loss,
+                                                    float *tmp_target_weight, cudaStream_t stream);
 
-template void NLLLoss<float, half>(const int n, const int c, const ReductionMode reduction, const float *input,
-                                   const int32_t *target, const half *weight, float *loss, half *total_weight,
-                                   float *tmp_loss, half *tmp_target_weight, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLoss<float, half>(const int n, const int c, const ReductionMode reduction,
+                                                   const float *input, const int32_t *target, const half *weight,
+                                                   float *loss, half *total_weight, float *tmp_loss,
+                                                   half *tmp_target_weight, cudaStream_t stream);  // mixed: S = half
 
-template void NLLLossGrad<float, float>(const int n, const int c, const ReductionMode reduction, const float *input,
-                                        const int32_t *target, const float *weight, const float *total_weight,
-                                        const float *dloss, float *dinput, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLossGrad<float, float>(const int n, const int c, const ReductionMode reduction,
+                                                        const float *input, const int32_t *target, const float *weight,
+                                                        const float *total_weight, const float *dloss, float *dinput,
+                                                        cudaStream_t stream);
 
-template void NLLLossGrad<float, half>(const int n, const int c, const ReductionMode reduction, const float *input,
-                                       const int32_t *target, const half *weight, const half *total_weight,
-                                       const float *dloss, float *dinput, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLossGrad<float, half>(const int n, const int c, const ReductionMode reduction,
+                                                       const float *input, const int32_t *target, const half *weight,
+                                                       const half *total_weight, const float *dloss, float *dinput,
+                                                       cudaStream_t stream);  // mixed: T = float, S = half
 
-template void KLDivLoss<half>(const int &input_size, const ReductionMode &reduction, const half *input_x,
-                              const half *input_y, half *loss, half *tmp_loss, cudaStream_t stream);
+template CUDA_LIB_EXPORT void KLDivLoss<half>(const int &input_size, const ReductionMode &reduction,
+                                              const half *input_x, const half *input_y, half *loss, half *tmp_loss,
+                                              cudaStream_t stream);  // T = half instantiations start here
 
-template void KLDivLossGrad<half>(const int &input_size, const ReductionMode &reduction, const half *input_x,
-                                  const half *input_y, const half *dloss, half *dx, half *dy, cudaStream_t stream);
+template CUDA_LIB_EXPORT void KLDivLossGrad<half>(const int &input_size, const ReductionMode &reduction,
+                                                  const half *input_x, const half *input_y, const half *dloss, half *dx,
+                                                  half *dy, cudaStream_t stream);
 
-template void BinaryCrossEntropyLoss<half>(const int &input_size, const ReductionMode &reduction, const half *input_x,
-                                           const half *input_y, const half *weight, half *loss, half *tmp_loss,
-                                           cudaStream_t stream);
+template CUDA_LIB_EXPORT void BinaryCrossEntropyLoss<half>(const int &input_size, const ReductionMode &reduction,
+                                                           const half *input_x, const half *input_y, const half *weight,
+                                                           half *loss, half *tmp_loss, cudaStream_t stream);
 
-template void BinaryCrossEntropyLossGrad<half>(const int &input_size, const ReductionMode &reduction,
-                                               const half *input_x, const half *input_y, const half *weight,
-                                               const half *dloss, half *dx, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BinaryCrossEntropyLossGrad<half>(const int &input_size, const ReductionMode &reduction,
+                                                               const half *input_x, const half *input_y,
+                                                               const half *weight, const half *dloss, half *dx,
+                                                               cudaStream_t stream);
 
-template void NLLLoss<half, half>(const int n, const int c, const ReductionMode reduction, const half *input,
-                                  const int32_t *target, const half *weight, half *loss, half *total_weight,
-                                  half *tmp_loss, half *tmp_target_weight, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLoss<half, half>(const int n, const int c, const ReductionMode reduction,
+                                                  const half *input, const int32_t *target, const half *weight,
+                                                  half *loss, half *total_weight, half *tmp_loss,
+                                                  half *tmp_target_weight, cudaStream_t stream);
 
-template void NLLLoss<half, float>(const int n, const int c, const ReductionMode reduction, const half *input,
-                                   const int32_t *target, const float *weight, half *loss, float *total_weight,
-                                   half *tmp_loss, float *tmp_target_weight, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLoss<half, float>(const int n, const int c, const ReductionMode reduction,
+                                                   const half *input, const int32_t *target, const float *weight,
+                                                   half *loss, float *total_weight, half *tmp_loss,
+                                                   float *tmp_target_weight, cudaStream_t stream);  // mixed: S = float
 
-template void NLLLossGrad<half, half>(const int n, const int c, const ReductionMode reduction, const half *input,
-                                      const int32_t *target, const half *weight, const half *total_weight,
-                                      const half *dloss, half *dinput, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLossGrad<half, half>(const int n, const int c, const ReductionMode reduction,
+                                                      const half *input, const int32_t *target, const half *weight,
+                                                      const half *total_weight, const half *dloss, half *dinput,
+                                                      cudaStream_t stream);
 
-template void NLLLossGrad<half, float>(const int n, const int c, const ReductionMode reduction, const half *input,
-                                       const int32_t *target, const float *weight, const float *total_weight,
-                                       const half *dloss, half *dinput, cudaStream_t stream);
+template CUDA_LIB_EXPORT void NLLLossGrad<half, float>(const int n, const int c, const ReductionMode reduction,
+                                                       const half *input, const int32_t *target, const float *weight,
+                                                       const float *total_weight, const half *dloss, half *dinput,
+                                                       cudaStream_t stream);  // mixed: T = half, S = float
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh
new file mode 100644
index 00000000000..0b6d2ec00c4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_
+#include <map>
+#include <string>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+enum class ReductionMode { kNone, kMean, kSum };  // reduction selector taken by the loss functions declared below
+
+static std::map<std::string, ReductionMode> kReductionModeMap{  // NOTE(review): mutable; static => one copy per TU
+  {"none", ReductionMode::kNone}, {"mean", ReductionMode::kMean}, {"sum", ReductionMode::kSum}};
+
+template <typename T>  // instantiated for float and half in loss_with_reduction_impl.cu
+CUDA_LIB_EXPORT void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const T *input_x,
+                                            const T *input_y, const T *weight, T *loss, T *tmp_loss,
+                                            cudaStream_t stream);
+template <typename T>  // instantiated for float and half in loss_with_reduction_impl.cu
+CUDA_LIB_EXPORT void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x,
+                                                const T *input_y, const T *weight, const T *dloss, T *dx,
+                                                cudaStream_t stream);
+template <typename T>  // instantiated for float and half in loss_with_reduction_impl.cu
+CUDA_LIB_EXPORT void KLDivLoss(const int &input_size, const ReductionMode &reduction, const T *input_x,
+                               const T *input_y, T *loss, T *tmp_loss, cudaStream_t stream);
+template <typename T>  // instantiated for float and half in loss_with_reduction_impl.cu
+CUDA_LIB_EXPORT void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x,
+                                   const T *input_y, const T *dloss, T *dx, T *dy, cudaStream_t stream);
+template <typename T, typename S>  // T: input/loss, S: weights; all four float/half <T, S> pairs are instantiated
+CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, const T *input,
+                             const int32_t *target, const S *weight, T *loss, S *total_weight, T *tmp_loss,
+                             S *tmp_target_weight, cudaStream_t stream);
+template <typename T, typename S>  // T: input/grads, S: weights; all four float/half <T, S> pairs instantiated
+CUDA_LIB_EXPORT void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const T *input,
+                                 const int32_t *target, const S *weight, const S *total_weight, const T *dloss,
+                                 T *dinput, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu
similarity index 59%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu
index 7a0d01d2b27..1cd39a50a5e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu
@@ -16,7 +16,7 @@
 #include "matrix_band_part_impl.cuh"
 #include <cuda_runtime.h>
 #include <algorithm>
-#include "utils/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 
 template <typename T>
 using Complex = mindspore::utils::Complex<T>;
@@ -45,13 +45,15 @@ void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t
                                                                                         l, u, output_addr, cuda_stream);
 }
 
-template void MatrixBandPart<int32_t>(const size_t size, const int32_t *input_matrix_addr, const size_t m,
-                                      const size_t n, const int64_t l, const int64_t u, int32_t *output_addr,
-                                      cudaStream_t cuda_stream);
-template void MatrixBandPart<int64_t>(const size_t size, const int64_t *input_matrix_addr, const size_t m,
-                                      const size_t n, const int64_t l, const int64_t u, int64_t *output_addr,
-                                      cudaStream_t cuda_stream);
-template void MatrixBandPart<float>(const size_t size, const float *input_matrix_addr, const size_t m, const size_t n,
-                                    const int64_t l, const int64_t u, float *output_addr, cudaStream_t cuda_stream);
-template void MatrixBandPart<double>(const size_t size, const double *input_matrix_addr, const size_t m, const size_t n,
-                                     const int64_t l, const int64_t u, double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixBandPart<int32_t>(const size_t size, const int32_t *input_matrix_addr,
+                                                      const size_t m, const size_t n, const int64_t l, const int64_t u,
+                                                      int32_t *output_addr, cudaStream_t cuda_stream);  // T = int32_t
+template CUDA_LIB_EXPORT void MatrixBandPart<int64_t>(const size_t size, const int64_t *input_matrix_addr,
+                                                      const size_t m, const size_t n, const int64_t l, const int64_t u,
+                                                      int64_t *output_addr, cudaStream_t cuda_stream);  // T = int64_t
+template CUDA_LIB_EXPORT void MatrixBandPart<float>(const size_t size, const float *input_matrix_addr, const size_t m,
+                                                    const size_t n, const int64_t l, const int64_t u,
+                                                    float *output_addr, cudaStream_t cuda_stream);  // T = float
+template CUDA_LIB_EXPORT void MatrixBandPart<double>(const size_t size, const double *input_matrix_addr, const size_t m,
+                                                     const size_t n, const int64_t l, const int64_t u,
+                                                     double *output_addr, cudaStream_t cuda_stream);  // T = double
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh
index c8f11f340ce..70bed47d1d5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"  // presumably defines CUDA_LIB_EXPORT; confirm
 template <typename T>
-void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, const int64_t l,
-                    const int64_t u, T *output_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n,
+                                    const int64_t l, const int64_t u, T *output_addr, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu
similarity index 89%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu
index b1bd5fdb695..9e0f10717d7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu
@@ -66,7 +66,8 @@ void MatrixCombine(const size_t size, const size_t src_height, const size_t src_
   return;
 }
 
-template void MatrixCombine<float>(const size_t size, const size_t src_height, const size_t src_width,
-                                   const size_t dst_width, const size_t residual, const size_t res_width,
-                                   const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixCombine<float>(const size_t size, const size_t src_height, const size_t src_width,
+                                                   const size_t dst_width, const size_t residual,
+                                                   const size_t res_width, const size_t batch, float *input_addr,
+                                                   float *output_addr, cudaStream_t cuda_stream);  // T = float
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh
new file mode 100644
index 00000000000..bbc6c2f9c80
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // instantiated for float in matrix_combine_impl.cu; note input_addr is a non-const T *
+CUDA_LIB_EXPORT void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width,
+                                   const size_t dst_width, const size_t residual, const size_t res_width,
+                                   const size_t batch, T *input_addr, T *output_addr, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_
+
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu
similarity index 58%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu
index 8eb9dd81cf8..6ba617be014 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu
@@ -16,7 +16,7 @@
 #include "matrix_diag_part_impl.cuh"
 #include <cuda_runtime.h>
 #include <algorithm>
-#include "utils/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 
 template <typename T>
 using Complex = mindspore::utils::Complex<T>;
@@ -60,19 +60,23 @@ void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t
     size, input_matrix_addr, m, n, l, u, num_diags, max_diag_len, la, ua, padding_value, output_addr, cuda_stream);
 }
 
-template void MatrixDiagPart<int32_t>(const size_t size, const int32_t *input_matrix_addr, const size_t m,
-                                      const size_t n, const int64_t l, const int64_t u, const size_t num_diags,
-                                      const size_t max_diag_len, const int64_t la, const int64_t ua,
-                                      int32_t *padding_value, int32_t *output_addr, cudaStream_t cuda_stream);
-template void MatrixDiagPart<int64_t>(const size_t size, const int64_t *input_matrix_addr, const size_t m,
-                                      const size_t n, const int64_t l, const int64_t u, const size_t num_diags,
-                                      const size_t max_diag_len, const int64_t la, const int64_t ua,
-                                      int64_t *padding_value, int64_t *output_addr, cudaStream_t cuda_stream);
-template void MatrixDiagPart<float>(const size_t size, const float *input_matrix_addr, const size_t m, const size_t n,
-                                    const int64_t l, const int64_t u, const size_t num_diags, const size_t max_diag_len,
-                                    const int64_t la, const int64_t ua, float *padding_value, float *output_addr,
-                                    cudaStream_t cuda_stream);
-template void MatrixDiagPart<double>(const size_t size, const double *input_matrix_addr, const size_t m, const size_t n,
-                                     const int64_t l, const int64_t u, const size_t num_diags,
-                                     const size_t max_diag_len, const int64_t la, const int64_t ua,
-                                     double *padding_value, double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixDiagPart<int32_t>(const size_t size, const int32_t *input_matrix_addr,
+                                                      const size_t m, const size_t n, const int64_t l, const int64_t u,
+                                                      const size_t num_diags, const size_t max_diag_len,
+                                                      const int64_t la, const int64_t ua, int32_t *padding_value,
+                                                      int32_t *output_addr, cudaStream_t cuda_stream);  // T = int32_t
+template CUDA_LIB_EXPORT void MatrixDiagPart<int64_t>(const size_t size, const int64_t *input_matrix_addr,
+                                                      const size_t m, const size_t n, const int64_t l, const int64_t u,
+                                                      const size_t num_diags, const size_t max_diag_len,
+                                                      const int64_t la, const int64_t ua, int64_t *padding_value,
+                                                      int64_t *output_addr, cudaStream_t cuda_stream);  // T = int64_t
+template CUDA_LIB_EXPORT void MatrixDiagPart<float>(const size_t size, const float *input_matrix_addr, const size_t m,
+                                                    const size_t n, const int64_t l, const int64_t u,
+                                                    const size_t num_diags, const size_t max_diag_len, const int64_t la,
+                                                    const int64_t ua, float *padding_value, float *output_addr,
+                                                    cudaStream_t cuda_stream);  // T = float
+template CUDA_LIB_EXPORT void MatrixDiagPart<double>(const size_t size, const double *input_matrix_addr, const size_t m,
+                                                     const size_t n, const int64_t l, const int64_t u,
+                                                     const size_t num_diags, const size_t max_diag_len,
+                                                     const int64_t la, const int64_t ua, double *padding_value,
+                                                     double *output_addr, cudaStream_t cuda_stream);  // T = double
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh
new file mode 100644
index 00000000000..7a6ede2931f
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // T: element type of input_matrix_addr, padding_value and output_addr
+CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n,
+                                    const int64_t l, const int64_t u, const size_t num_diags, const size_t max_diag_len,
+                                    const int64_t la, const int64_t ua, T *padding_value, T *output_addr,
+                                    cudaStream_t cuda_stream);  // declaration only; this header has no body
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu
index 01d3c9c1427..28937ec55b9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu
@@ -67,26 +67,29 @@ void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_c
   return;
 }
 
-template void MatrixSetDiag<int>(const int outer_batch, const int inner_row, const int inner_col, const int num_diags,
-                                 const int max_diag_len, const int lower_index, const int upper_index,
-                                 const bool right_align_super_diagonal, const bool right_align_sub_diagonal,
-                                 const bool is_single_diag, const int *diag_addr, int *output_addr,
-                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSetDiag<int>(const int outer_batch, const int inner_row, const int inner_col,
+                                                 const int num_diags, const int max_diag_len, const int lower_index,
+                                                 const int upper_index, const bool right_align_super_diagonal,
+                                                 const bool right_align_sub_diagonal, const bool is_single_diag,
+                                                 const int *diag_addr, int *output_addr, cudaStream_t cuda_stream);
 
-template void MatrixSetDiag<int64_t>(const int outer_batch, const int inner_row, const int inner_col,
-                                     const int num_diags, const int max_diag_len, const int lower_index,
-                                     const int upper_index, const bool right_align_super_diagonal,
-                                     const bool right_align_sub_diagonal, const bool is_single_diag,
-                                     const int64_t *diag_addr, int64_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSetDiag<int64_t>(const int outer_batch, const int inner_row, const int inner_col,
+                                                     const int num_diags, const int max_diag_len, const int lower_index,
+                                                     const int upper_index, const bool right_align_super_diagonal,
+                                                     const bool right_align_sub_diagonal, const bool is_single_diag,
+                                                     const int64_t *diag_addr, int64_t *output_addr,
+                                                     cudaStream_t cuda_stream);
 
-template void MatrixSetDiag<float>(const int outer_batch, const int inner_row, const int inner_col, const int num_diags,
-                                   const int max_diag_len, const int lower_index, const int upper_index,
-                                   const bool right_align_super_diagonal, const bool right_align_sub_diagonal,
-                                   const bool is_single_diag, const float *diag_addr, float *output_addr,
-                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSetDiag<float>(const int outer_batch, const int inner_row, const int inner_col,
+                                                   const int num_diags, const int max_diag_len, const int lower_index,
+                                                   const int upper_index, const bool right_align_super_diagonal,
+                                                   const bool right_align_sub_diagonal, const bool is_single_diag,
+                                                   const float *diag_addr, float *output_addr,
+                                                   cudaStream_t cuda_stream);
 
-template void MatrixSetDiag<double>(const int outer_batch, const int inner_row, const int inner_col,
-                                    const int num_diags, const int max_diag_len, const int lower_index,
-                                    const int upper_index, const bool right_align_super_diagonal,
-                                    const bool right_align_sub_diagonal, const bool is_single_diag,
-                                    const double *diag_addr, double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSetDiag<double>(const int outer_batch, const int inner_row, const int inner_col,
+                                                    const int num_diags, const int max_diag_len, const int lower_index,
+                                                    const int upper_index, const bool right_align_super_diagonal,
+                                                    const bool right_align_sub_diagonal, const bool is_single_diag,
+                                                    const double *diag_addr, double *output_addr,
+                                                    cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh
new file mode 100644
index 00000000000..d6fdd692512
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // T: element type of diag_addr and output_addr
+CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags,
+                                   const int max_diag_len, const int lower_index, const int upper_index,
+                                   const bool right_align_super_diagonal, const bool right_align_sub_diagonal,
+                                   const bool is_single_diag, const T *diag_addr, T *output_addr,
+                                   cudaStream_t cuda_stream);  // body + instantiations: matrix_set_diag_impl.cu
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu
similarity index 86%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu
index b5ddb4e4835..23fbb71f198 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu
@@ -65,8 +65,8 @@ void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T
   return;
 }
 
-template void MatrixSplit<float>(const size_t size, const size_t split_dim, const size_t dim, float *input_addr,
-                                 float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSplit<float>(const size_t size, const size_t split_dim, const size_t dim,
+                                                 float *input_addr, float *output_addr, cudaStream_t cuda_stream);
 
-template void MatrixSplit<double>(const size_t size, const size_t split_dim, const size_t dim, double *input_addr,
-                                  double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixSplit<double>(const size_t size, const size_t split_dim, const size_t dim,
+                                                  double *input_addr, double *output_addr, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh
index 3e2a808e08e..16281c6381b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *output_addr,
-                 cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr,
+                                 T *output_addr, cudaStream_t cuda_stream);  // .cu instantiates T = float, double
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu
similarity index 60%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu
index 3f1489f3c76..8f02f465b3b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu
@@ -16,9 +16,8 @@
 
 #include <algorithm>
 #include "maxpool_with_argmax_grad_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T, typename S>
 __global__ void MaxPoolWithArgmaxGrad(const T* dy,
@@ -75,23 +74,23 @@ void CalMaxPoolWithArgmaxGrad(const T* dy,
   return;
 }
 
-template void CalMaxPoolWithArgmaxGrad<float, int>(const float* dy,
-                                                    const int* index,
-                                                    const int n,
-                                                    const int c,
-                                                    const int xHeight,
-                                                    const int xWidth,
-                                                    const int dyHeight,
-                                                    const int dyWidth,
-                                                    float* dx,
-                                                    cudaStream_t cuda_stream);
-template void CalMaxPoolWithArgmaxGrad<half, int>(const half* dy,
-                                                    const int* index,
-                                                    const int n,
-                                                    const int c,
-                                                    const int xHeight,
-                                                    const int xWidth,
-                                                    const int dyHeight,
-                                                    const int dyWidth,
-                                                    half* dx,
-                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad<float, int>(const float* dy,  // T = float, S = int
+                                                                   const int* index,
+                                                                   const int n,
+                                                                   const int c,
+                                                                   const int xHeight,
+                                                                   const int xWidth,
+                                                                   const int dyHeight,
+                                                                   const int dyWidth,
+                                                                   float* dx,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad<half, int>(const half* dy,  // T = half, S = int
+                                                                  const int* index,
+                                                                  const int n,
+                                                                  const int c,
+                                                                  const int xHeight,
+                                                                  const int xWidth,
+                                                                  const int dyHeight,
+                                                                  const int dyWidth,
+                                                                  half* dx,
+                                                                  cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh
new file mode 100644
index 00000000000..bb045c883e0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: element type of dy and dx; S: element type of index
+CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad(const T* dy, const S* index, const int n, const int c, const int xHeight,
+                                              const int xWidth, const int dyHeight, const int dyWidth, T* dx,
+                                              cudaStream_t cuda_stream);  // instantiated: <float, int>, <half, int>
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu
index b4dd206b2a5..201b974f847 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu
@@ -16,7 +16,6 @@
 
 #include <algorithm>
 #include "maxpool_with_argmax_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
 template <typename T, typename S>
 __global__ void MaxPoolWithArgmax(const T* input,
@@ -112,36 +111,36 @@ void CalMaxPoolWithArgmax(const T* input,
                         index);
 }
 
-template void CalMaxPoolWithArgmax<float, int>(const float* input,
-                                                const int n,
-                                                const int c,
-                                                const int h,
-                                                const int w,
-                                                const int windowHeight,
-                                                const int windowWidth,
-                                                const int strideHeight,
-                                                const int strideWidth,
-                                                const int padTop,
-                                                const int padLeft,
-                                                const int outputHeight,
-                                                const int outputWidth,
-                                                float* output,
-                                                int* index,
-                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMaxPoolWithArgmax<float, int>(const float* input,  // T = float, S = int
+                                                               const int n,
+                                                               const int c,
+                                                               const int h,
+                                                               const int w,
+                                                               const int windowHeight,
+                                                               const int windowWidth,
+                                                               const int strideHeight,
+                                                               const int strideWidth,
+                                                               const int padTop,
+                                                               const int padLeft,
+                                                               const int outputHeight,
+                                                               const int outputWidth,
+                                                               float* output,
+                                                               int* index,
+                                                               cudaStream_t cuda_stream);
 
-template void CalMaxPoolWithArgmax<half, int>(const half* input,
-                                                const int n,
-                                                const int c,
-                                                const int h,
-                                                const int w,
-                                                const int windowHeight,
-                                                const int windowWidth,
-                                                const int strideHeight,
-                                                const int strideWidth,
-                                                const int padTop,
-                                                const int padLeft,
-                                                const int outputHeight,
-                                                const int outputWidth,
-                                                half* output,
-                                                int* index,
-                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMaxPoolWithArgmax<half, int>(const half* input,  // T = half, S = int
+                                                              const int n,
+                                                              const int c,
+                                                              const int h,
+                                                              const int w,
+                                                              const int windowHeight,
+                                                              const int windowWidth,
+                                                              const int strideHeight,
+                                                              const int strideWidth,
+                                                              const int padTop,
+                                                              const int padLeft,
+                                                              const int outputHeight,
+                                                              const int outputWidth,
+                                                              half* output,
+                                                              int* index,
+                                                              cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh
new file mode 100644
index 00000000000..24b8afc8cdb
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: element type of input and output; S: element type of index
+CUDA_LIB_EXPORT void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w,
+                                          const int windowHeight, const int windowWidth, const int strideHeight,
+                                          const int strideWidth, const int padTop, const int padLeft,
+                                          const int outputHeight, const int outputWidth, T* output, S *index,
+                                          cudaStream_t cuda_stream);  // instantiated: <float, int>, <half, int>
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu
similarity index 98%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu
index fb62fb413ca..f1cff4f10fd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu
@@ -20,7 +20,6 @@
 #include <thrust/reduce.h>
 #include <thrust/pair.h>
 #include "minmax_update_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 __global__ void UpdateInputMinMaxPerLayerWithEMA(const float *input_min, const float *input_max, float *output_min,
                                                  float *output_max, const float min, const float max,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh
new file mode 100644
index 00000000000..828ec9fa7dc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Per-channel variant: float-only, non-template, declared here without a body.
+CUDA_LIB_EXPORT void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min,
+                                         float *output_max, const int total_num, const int channel_num,
+                                         const float ema_decay, const bool ema, cudaStream_t cuda_stream);
+// Per-layer variant: float-only, non-template, declared here without a body.
+CUDA_LIB_EXPORT void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min,
+                                       float *output_max, const int size, const float ema_decay, const bool ema,
+                                       cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu
similarity index 79%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu
index 65e88876fe6..ad5170ac1e1 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu
@@ -16,7 +16,8 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdint.h>
-#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 // check for existence in current padded array on X and Y dims
 __inline__ __device__ bool range_check(int x, int y, int padded_width, int padded_height) {
@@ -252,27 +253,34 @@ void CalMirrorPadGrad(const size_t dx_size, const size_t interim_dy_size, T *dy,
     mode, dx);
 }
 
-template void CalMirrorPad<float>(const size_t size, const float *input, const int old_batch, const int old_channel,
-                                  const int old_height, const int old_width, const int padded_height,
-                                  const int padded_width, int padd_num, const int64_t *paddings, int mode,
-                                  float *output, cudaStream_t cuda_stream);
-template void CalMirrorPad<half>(const size_t size, const half *input, const int old_batch, const int old_channel,
-                                 const int old_height, const int old_width, const int padded_height,
-                                 const int padded_width, int padd_num, const int64_t *paddings, int mode, half *output,
-                                 cudaStream_t cuda_stream);
-template void CalMirrorPad<int>(const size_t size, const int *input, const int old_batch, const int old_channel,
-                                const int old_height, const int old_width, const int padded_height,
-                                const int padded_width, int padd_num, const int64_t *paddings, int mode, int *output,
-                                cudaStream_t cuda_stream);
-template void CalMirrorPadGrad<float>(const size_t dx_size, const size_t dy_size, float *dy, float *interim_dy,
-                                      const int dx_batches, const int dx_channels, const int dx_height,
-                                      const int dx_width, const int dy_height, const int dy_width, const int padd_dim,
-                                      const int64_t *paddings, int mode, float *dx, cudaStream_t cuda_stream);
-template void CalMirrorPadGrad<half>(const size_t dx_size, const size_t dy_size, half *dy, half *interim_dy,
-                                     const int dx_batches, const int dx_channels, const int dx_height,
-                                     const int dx_width, const int dy_height, const int dy_width, const int padd_dim,
-                                     const int64_t *paddings, int mode, half *dx, cudaStream_t cuda_stream);
-template void CalMirrorPadGrad<int>(const size_t dx_size, const size_t dy_size, int *dy, int *interim_dy,
-                                    const int dx_batches, const int dx_channels, const int dx_height,
-                                    const int dx_width, const int dy_height, const int dy_width, const int padd_dim,
-                                    const int64_t *paddings, int mode, int *dx, cudaStream_t cuda_stream);
+// Explicit instantiations of CalMirrorPad and CalMirrorPadGrad for float, half and int.
+template CUDA_LIB_EXPORT void CalMirrorPad<float>(const size_t size, const float *input, const int old_batch,
+                                                  const int old_channel, const int old_height, const int old_width,
+                                                  const int padded_height, const int padded_width, int padd_num,
+                                                  const int64_t *paddings, int mode, float *output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMirrorPad<half>(const size_t size, const half *input, const int old_batch,
+                                                 const int old_channel, const int old_height, const int old_width,
+                                                 const int padded_height, const int padded_width, int padd_num,
+                                                 const int64_t *paddings, int mode, half *output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMirrorPad<int>(const size_t size, const int *input, const int old_batch,
+                                                const int old_channel, const int old_height, const int old_width,
+                                                const int padded_height, const int padded_width, int padd_num,
+                                                const int64_t *paddings, int mode, int *output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMirrorPadGrad<float>(const size_t dx_size, const size_t dy_size, float *dy,
+                                                      float *interim_dy, const int dx_batches, const int dx_channels,
+                                                      const int dx_height, const int dx_width, const int dy_height,
+                                                      const int dy_width, const int padd_dim, const int64_t *paddings,
+                                                      int mode, float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMirrorPadGrad<half>(const size_t dx_size, const size_t dy_size, half *dy,
+                                                     half *interim_dy, const int dx_batches, const int dx_channels,
+                                                     const int dx_height, const int dx_width, const int dy_height,
+                                                     const int dy_width, const int padd_dim, const int64_t *paddings,
+                                                     int mode, half *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalMirrorPadGrad<int>(const size_t dx_size, const size_t dy_size, int *dy,
+                                                    int *interim_dy, const int dx_batches, const int dx_channels,
+                                                    const int dx_height, const int dx_width, const int dy_height,
+                                                    const int dy_width, const int padd_dim, const int64_t *paddings,
+                                                    int mode, int *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh
new file mode 100755
index 00000000000..6a4e705f97a
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// Preset sizes for the paddings array.
+// NOTE(review): presumably MAX_PADDINGS dimensions with PADDING_SIZE entries each - confirm against the kernels.
+#define MAX_PADDINGS 4
+#define PADDING_SIZE 2
+
+// Constants for kernel indexing. BATCH..WIDTH are multiples of PADDING_SIZE and TOP/BOTTOM/LEFT/RIGHT are 0/1.
+// Each product is parenthesized so the macro expands to a single value inside larger expressions
+// (for example `n / HEIGHT` divides by 4 instead of evaluating `n / 2 * PADDING_SIZE`).
+#define BATCH (0 * PADDING_SIZE)
+#define CHANNEL (1 * PADDING_SIZE)
+#define HEIGHT (2 * PADDING_SIZE)
+#define WIDTH (3 * PADDING_SIZE)
+#define TOP 0
+#define BOTTOM 1
+#define LEFT 0
+#define RIGHT 1
+
+// CalMirrorPad: declared here and defined in mirror_pad_impl.cu, which exports explicit instantiations for
+// float, half and int.
+// NOTE(review): pointer arguments are presumably device buffers and work is queued on cuda_stream - confirm.
+template <typename T>
+CUDA_LIB_EXPORT void CalMirrorPad(const size_t size, const T *input, const int old_batch, const int old_channel,
+                                  const int old_height, const int old_width, const int padded_height,
+                                  const int padded_width, int padd_num, const int64_t *paddings, int mode, T *output,
+                                  cudaStream_t cuda_stream);
+// CalMirrorPadGrad: declared here and defined in mirror_pad_impl.cu, which exports explicit instantiations for
+// float, half and int.
+// NOTE(review): the definition names its second parameter interim_dy_size rather than dy_size - confirm the
+// intended meaning.
+template <typename T>
+CUDA_LIB_EXPORT void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, T *dy, T *interim,
+                                      const int output_batch, const int output_channel, const int output_height,
+                                      const int output_width, const int input_height, const int input_width,
+                                      const int padd_dim, const int64_t *paddings, int mode, T *dx,
+                                      cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu
index 8c4934a32b4..b7d2d3db7f3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu
@@ -15,6 +15,7 @@
  */
 
 #include "momentum_impl.cuh"
+#include "include/cuda_fp16.h"
 template <typename T, typename S, typename G>
 __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
                                              const G *gradient, const S *momentum, bool use_nesterov) {
@@ -175,52 +176,66 @@ void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, co
     num, element_num, weight_decay, scale, variable, accumulation, learning_rate, gradient, momentum);
 }
 // end CombineFusedWeightDecayScaleMomentum
-template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
-                                                          const float *learning_rate, const float *gradient,
-                                                          const float *momentum, bool use_nesterov,
-                                                          cudaStream_t cuda_stream);
-template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
-                                                       const half *learning_rate, const half *gradient,
-                                                       const half *momentum, bool use_nesterov,
-                                                       cudaStream_t cuda_stream);
-template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
-                                                        const float *learning_rate, const half *gradient,
-                                                        const float *momentum, bool use_nesterov,
-                                                        cudaStream_t cuda_stream);
-template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
-                                                         const float *learning_rate, const half *gradient,
-                                                         const float *momentum, bool use_nesterov,
-                                                         cudaStream_t cuda_stream);
+// Explicit instantiations of MomentumUpdateVariable<T, S, G>.
+template CUDA_LIB_EXPORT void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable,
+                                                                          float *accumulation,
+                                                                          const float *learning_rate,
+                                                                          const float *gradient, const float *momentum,
+                                                                          bool use_nesterov, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable,
+                                                                       half *accumulation, const half *learning_rate,
+                                                                       const half *gradient, const half *momentum,
+                                                                       bool use_nesterov, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable,
+                                                                        half *accumulation, const float *learning_rate,
+                                                                        const half *gradient, const float *momentum,
+                                                                        bool use_nesterov, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable,
+                                                                         float *accumulation,
+                                                                         const float *learning_rate,
+                                                                         const half *gradient, const float *momentum,
+                                                                         bool use_nesterov, cudaStream_t cuda_stream);
 
-template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
-                                            float *variable, float *accumulation, const float *learning_rate,
-                                            const float *gradient, const float *momentum, cudaStream_t cuda_stream);
-template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
-                                            float *variable, float *accumulation, const float *learning_rate,
-                                            const half *gradient, const float *momentum, cudaStream_t cuda_stream);
-template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
-                                       float *accumulation, const float *learning_rate, const float *gradient,
-                                       const float *momentum, cudaStream_t cuda_stream);
-template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
-                                       float *accumulation, const float *learning_rate, const half *gradient,
-                                       const float *momentum, cudaStream_t cuda_stream);
-template void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, float *accumulation,
-                                 const float *learning_rate, const float *gradient, const float *momentum,
-                                 cudaStream_t cuda_stream);
-template void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, float *accumulation,
-                                 const float *learning_rate, const half *gradient, const float *momentum,
-                                 cudaStream_t cuda_stream);
-template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements,
-                                                   float **weight_decay, float **scale, float **variable,
-                                                   float **accumulation, float **learning_rate, float **gradient,
-                                                   float **momentum, cudaStream_t cuda_stream);
-template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements,
-                                                   float **weight_decay, float **scale, float **variable,
-                                                   float **accumulation, float **learning_rate, half **gradient,
-                                                   float **momentum, cudaStream_t cuda_stream);
-template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale,
-                                        float **variable, float **accumulation, float **learning_rate, float **gradient,
-                                        float **momentum, cudaStream_t cuda_stream);
-template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale,
-                                        float **variable, float **accumulation, float **learning_rate, half **gradient,
-                                        float **momentum, cudaStream_t cuda_stream);
+// Explicit instantiations of the fused variants; template arguments are deduced from the parameter types
+// (T = float, S = float or half).
+template CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
+                                                            float *variable, float *accumulation,
+                                                            const float *learning_rate, const float *gradient,
+                                                            const float *momentum, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
+                                                            float *variable, float *accumulation,
+                                                            const float *learning_rate, const half *gradient,
+                                                            const float *momentum, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
+                                                       float *accumulation, const float *learning_rate,
+                                                       const float *gradient, const float *momentum,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
+                                                       float *accumulation, const float *learning_rate,
+                                                       const half *gradient, const float *momentum,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, float *scale, float *variable,
+                                                 float *accumulation, const float *learning_rate, const float *gradient,
+                                                 const float *momentum, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, float *scale, float *variable,
+                                                 float *accumulation, const float *learning_rate, const half *gradient,
+                                                 const float *momentum, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num,
+                                                                   const size_t *elements, float **weight_decay,
+                                                                   float **scale, float **variable,
+                                                                   float **accumulation, float **learning_rate,
+                                                                   float **gradient, float **momentum,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num,
+                                                                   const size_t *elements, float **weight_decay,
+                                                                   float **scale, float **variable,
+                                                                   float **accumulation, float **learning_rate,
+                                                                   half **gradient, float **momentum,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements,
+                                                        float **scale, float **variable, float **accumulation,
+                                                        float **learning_rate, float **gradient, float **momentum,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements,
+                                                        float **scale, float **variable, float **accumulation,
+                                                        float **learning_rate, half **gradient, float **momentum,
+                                                        cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh
new file mode 100644
index 00000000000..df9d40393c8
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations <T, S, G>:
+// <float, float, float>, <half, half, half>, <half, float, half> and <float, float, half>.
+template <typename T, typename S, typename G>
+CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate,
+                                            const G *gradient, const S *momentum, bool use_nesterov,
+                                            cudaStream_t cuda_stream);
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations: T = float with S = float or half.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable,
+                                                   T *accumulation, const T *learning_rate, const S *gradient,
+                                                   const T *momentum, cudaStream_t cuda_stream);
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations: T = float with S = float or half.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void FusedWeightDecayMomentum(const size_t element_num, T *weight_decay, T *variable, T *accumulation,
+                                              const T *learning_rate, const S *gradient, const T *momentum,
+                                              cudaStream_t cuda_stream);
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations: T = float with S = float or half.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation,
+                                        const T *learning_rate, const S *gradient, const T *momentum,
+                                        cudaStream_t cuda_stream);
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations: T = float with S = float or half.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *element,
+                                                          T **weight_decay, T **scale, T **variable, T **accumulation,
+                                                          T **learning_rate, S **gradient, T **momentum,
+                                                          cudaStream_t cuda_stream);
+// Declared here, defined in momentum_impl.cu. Exported explicit instantiations: T = float with S = float or half.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *element, T **scale,
+                                               T **variable, T **accumulation, T **learning_rate, S **gradient,
+                                               T **momentum, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu
similarity index 86%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu
index f69cbb3d653..7ad4ebc91c1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu
@@ -136,8 +136,10 @@ void Multinomial(int row, int col, T *probs, curandState *state, int64_t *num_sa
   MultinomialKernel<<<grid_dim, block_dim, shm_size, stream>>>(row, col, probs, state, num_sample, output);
 }
 
-template void Multinomial<float>(int row, int col, float *probs, curandState *state, int64_t *num_sample, int *output,
-                                 cudaStream_t stream);
-template void CheckNonNeg<float>(const size_t size, const float *input, float *output, cudaStream_t cuda_stream);
-template void CheckZero<float>(const size_t distributions, const size_t categories, const float *input, float *output,
-                               cudaStream_t cuda_stream);
+// Explicit instantiations of Multinomial, CheckNonNeg and CheckZero for float.
+template CUDA_LIB_EXPORT void Multinomial<float>(int row, int col, float *probs, curandState *state,
+                                                 int64_t *num_sample, int *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CheckNonNeg<float>(const size_t size, const float *input, float *output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CheckZero<float>(const size_t distributions, const size_t categories, const float *input,
+                                               float *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh
new file mode 100644
index 00000000000..bfd82d2392d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_
+#include <curand_kernel.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// NOTE(review): presumably initializes `num` curandState entries from `seed` on `stream` - confirm in the .cu file.
+CUDA_LIB_EXPORT void InitRandState(int seed, int num, curandState *state, cudaStream_t stream);
+// Launches MultinomialKernel on `stream` (see multinomial_impl.cu); an explicit instantiation is exported for
+// T = float.
+template <typename T>
+CUDA_LIB_EXPORT void Multinomial(int row, int col, T *probs, curandState *rand_state, int64_t *num_sample, int *output,
+                                 cudaStream_t stream);
+// Defined in multinomial_impl.cu; an explicit instantiation is exported for T = float.
+template <typename T>
+CUDA_LIB_EXPORT void CheckNonNeg(const size_t size, const T *input, T *output, cudaStream_t stream);
+// Defined in multinomial_impl.cu; an explicit instantiation is exported for T = float.
+template <typename T>
+CUDA_LIB_EXPORT void CheckZero(const size_t distributions, const size_t categories, const T *input, T *output,
+                               cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu
similarity index 92%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu
index 413c5f0d4a9..08d9044852d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu
@@ -198,11 +198,13 @@ void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, in
   ReducePass<<<1, GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask);
 }
 
-template void CalSort<float>(const int &inner, float *data_in, float *data_out, int *index_buff, float *data_buff,
-                             int box_size, cudaStream_t stream);
+// Explicit instantiations of CalSort, CalPreprocess and CalNms for float.
+template CUDA_LIB_EXPORT void CalSort<float>(const int &inner, float *data_in, float *data_out, int *index_buff,
+                                             float *data_buff, int box_size, cudaStream_t stream);
 
-template void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *input, float *output,
-                                   int *index_buff, int box_size, bool *row_mask, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *input,
+                                                   float *output, int *index_buff, int box_size, bool *row_mask,
+                                                   cudaStream_t cuda_stream);
 
-template void CalNms<float>(const int num, const float IOU_value, float *output, bool *sel_boxes, int box_size,
-                            bool *row_mask, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNms<float>(const int num, const float IOU_value, float *output, bool *sel_boxes,
+                                            int box_size, bool *row_mask, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh
new file mode 100644
index 00000000000..066a4207b98
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+// CalSort: declared here; defined in nms_with_mask_impl.cu, which exports an explicit instantiation for float.
+template <typename T>
+CUDA_LIB_EXPORT void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size,
+                             cudaStream_t stream);
+
+// CalPreprocess: declared here; defined in nms_with_mask_impl.cu, which exports an explicit instantiation for float.
+template <typename T>
+CUDA_LIB_EXPORT void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff,
+                                   int box_size, bool *row_mask, cudaStream_t cuda_stream);
+
+// CalNms: declared here; defined in nms_with_mask_impl.cu, which exports an explicit instantiation for float.
+template <typename T>
+CUDA_LIB_EXPORT void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size,
+                            bool *row_mask, cudaStream_t cuda_stream);
+
+// NOTE(review): by its name, presumably rounds v up to a power of two - the definition is not visible here; confirm.
+CUDA_LIB_EXPORT int NmsRoundUpPower2(int v);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu
similarity index 61%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu
index 9aada57e6c4..1336cf1231e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "one_hot_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 template <typename T, typename S>
 __global__ void OneHotKernel(size_t size, const S *indices, size_t depth, const T *on_value, const T *off_value,
                              size_t left_dim_size, size_t right_dim_size, T *output) {
@@ -45,13 +45,15 @@ void OneHot(const S *indices, size_t depth, const T *on_value, const T *off_valu
                                                                   left_dim_size, right_dim_size, output);
   return;
 }
-template void OneHot<float, int>(const int *indices, size_t depth, const float *on_value, const float *off_value,
-                                 size_t left_dim_size, size_t right_dim_size, float *output, cudaStream_t cuda_stream);
-template void OneHot<half, int>(const int *indices, size_t depth, const half *on_value, const half *off_value,
-                                size_t left_dim_size, size_t right_dim_size, half *output, cudaStream_t cuda_stream);
-template void OneHot<float, int64_t>(const int64_t *indices, size_t depth, const float *on_value,
-                                     const float *off_value, size_t left_dim_size, size_t right_dim_size, float *output,
-                                     cudaStream_t cuda_stream);
-template void OneHot<half, int64_t>(const int64_t *indices, size_t depth, const half *on_value, const half *off_value,
-                                    size_t left_dim_size, size_t right_dim_size, half *output,
-                                    cudaStream_t cuda_stream);
+// Explicit instantiations: T (on/off value and output type) in {float, half}, S (index type) in {int, int64_t}.
+template CUDA_LIB_EXPORT void OneHot<float, int>(const int *indices, size_t depth, const float *on_value,
+                                                 const float *off_value, size_t left_dim_size, size_t right_dim_size,
+                                                 float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void OneHot<half, int>(const int *indices, size_t depth, const half *on_value,
+                                                const half *off_value, size_t left_dim_size, size_t right_dim_size,
+                                                half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void OneHot<float, int64_t>(const int64_t *indices, size_t depth, const float *on_value,
+                                                     const float *off_value, size_t left_dim_size,
+                                                     size_t right_dim_size, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void OneHot<half, int64_t>(const int64_t *indices, size_t depth, const half *on_value,
+                                                    const half *off_value, size_t left_dim_size, size_t right_dim_size,
+                                                    half *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh
index 5b5991256ec..65eaaa4d46e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh
@@ -14,10 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void OneHot(const S *indices, size_t depth_, const T *on_value, const T *off_value, size_t left_dim_size,
-            size_t right_dim_size, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void OneHot(const S *indices, size_t depth, const T *on_value, const T *off_value,  // T value, S index
+                            size_t left_dim_size, size_t right_dim_size, T *output, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu
new file mode 100644
index 00000000000..ad510fac826
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include "oneslike_impl.cuh"
+#include "include/cuda_fp16.h"
+// Kernel: grid-stride loop that stores T(1) into output[0, size). `input` is part of the signature but is never read.
+template <typename T>
+__global__ void OnesLike(const size_t size, const T* input, T* output) {
+  const T one_value = static_cast<T>(1);
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += step) {
+    output[idx] = one_value;
+  }
+}
+// Host launcher: enqueues OnesLike<T> on cuda_stream with GET_BLOCKS(size) blocks of GET_THREADS threads.
+template <typename T>
+void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream) {
+  OnesLike<T><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output);
+}
+// Explicit instantiations: double, float, half, and signed/unsigned 8-, 16-, 32- and 64-bit integers.
+template CUDA_LIB_EXPORT void CalOnesLike<double>(const size_t size, const double* input, double* output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<float>(const size_t size, const float* input, float* output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<half>(const size_t size, const half* input, half* output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<int8_t>(const size_t size, const int8_t* input, int8_t* output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<int16_t>(const size_t size, const int16_t* input, int16_t* output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<int32_t>(const size_t size, const int32_t* input, int32_t* output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<int64_t>(const size_t size, const int64_t* input, int64_t* output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<uint8_t>(const size_t size, const uint8_t* input, uint8_t* output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<uint16_t>(const size_t size, const uint16_t* input, uint16_t* output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<uint32_t>(const size_t size, const uint32_t* input, uint32_t* output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalOnesLike<uint64_t>(const size_t size, const uint64_t* input, uint64_t* output,
+                                                    cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh
similarity index 59%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh
index 21bd995fb7b..88d62d30a34 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream);  // writes 1s
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu
new file mode 100755
index 00000000000..0cb05efef20
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh"
+#include "include/cuda_fp16.h"
+// Kernel: round-robins over `input_num` inputs, copying `dims_behind_axis` consecutive elements from each in turn.
+template <typename T>
+__global__ void Pack(const size_t size, const size_t input_num, const size_t dims_behind_axis, T** inputs, T* output) {
+  const size_t cycle_len = input_num * dims_behind_axis;  // hoisted; a multiple of dims_behind_axis
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    const size_t cur_input_index = pos / dims_behind_axis % input_num;
+    const size_t local_index = pos / cycle_len * dims_behind_axis + pos % dims_behind_axis;  // no % cycle_len needed
+    output[pos] = inputs[cur_input_index][local_index];
+  }
+}
+
+// Host launcher: enqueues Pack<T> on cuda_stream using GET_BLOCKS(size) blocks of GET_THREADS threads.
+// `inputs` is dereferenced inside the kernel, so the pointer table itself must be readable from the device.
+template <typename T>
+void PackKernel(const size_t size, const size_t input_num, const size_t dims_behind_axis, T** inputs, T* output,
+                cudaStream_t cuda_stream) {
+  Pack<T><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, dims_behind_axis, inputs, output);
+}
+// Explicit instantiations; T is deduced from the pointer arguments rather than spelled as PackKernel<T>.
+// Types: int8_t, int16_t, int, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, half, float, bool (no double).
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, int8_t** inputs, int8_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, int16_t** inputs, int16_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, int** inputs, int* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, int64_t** inputs, int64_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, uint8_t** inputs, uint8_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, uint16_t** inputs, uint16_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, uint32_t** inputs, uint32_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, uint64_t** inputs, uint64_t* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, half** inputs, half* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, float** inputs, float* output,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num,
+                                         const size_t dims_behind_axis, bool** inputs, bool* output,
+                                         cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh
index a74c125d6a9..0838eea47b4 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void PackKernel(const size_t size,
-                const size_t input_num,
-                const size_t dims_behind_axis,
-                T** inputs,
-                T* output,
-                cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_
+CUDA_LIB_EXPORT void PackKernel(const size_t size,              // number of output elements to write
+                                const size_t input_num,         // number of entries in `inputs`
+                                const size_t dims_behind_axis,  // consecutive elements taken from one input at a time
+                                T** inputs,                     // table of input pointers, read on the device
+                                T* output,                      // destination buffer
+                                cudaStream_t cuda_stream);      // stream the kernel is launched on
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu
similarity index 65%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu
index e6ac41ea9c6..32262056c38 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu
@@ -16,7 +16,8 @@
 
 #include <stdio.h>
 #include <stdint.h>
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 // For internal OP use, not user facing
 template <typename T>
@@ -268,72 +269,77 @@ void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int ol
                                                                   pad_head, pad_top, pad_left, dx);
 }
 
-template void CalPad<float>(const size_t size, const float* input, const int num, const int channels,
-                            const int old_height, const int old_width, const int padded_height, const int padded_width,
-                            const int pad_top, const int pad_left, float pad_value, float* output,
-                            cudaStream_t cuda_stream);
-template void CalPadGrad<float>(const size_t size, const float* dy, const int num, const int channels,
-                                const int old_height, const int old_width, const int padded_height,
-                                const int padded_width, const int pad_top, const int pad_left, float* dx,
-                                cudaStream_t cuda_stream);
-template void CalPad<half>(const size_t size, const half* input, const int num, const int channels,
-                           const int old_height, const int old_width, const int padded_height, const int padded_width,
-                           const int pad_top, const int pad_left, float pad_value, half* output,
-                           cudaStream_t cuda_stream);
-template void CalPadGrad<half>(const size_t size, const half* dy, const int num, const int channels,
-                               const int old_height, const int old_width, const int padded_height,
-                               const int padded_width, const int pad_top, const int pad_left, half* dx,
-                               cudaStream_t cuda_stream);
-template void CalPadNHWC<float>(const size_t size, const float* input, const int num, const int old_height,
-                                const int old_width, const int channels, const int padded_height,
-                                const int padded_width, const int pad_top, const int pad_left, float pad_value,
-                                float* output, cudaStream_t cuda_stream);
-template void CalPadNHWC<half>(const size_t size, const half* input, const int num, const int old_height,
-                               const int old_width, const int channels, const int padded_height,
-                               const int padded_width, const int pad_top, const int pad_left, float pad_value,
-                               half* output, cudaStream_t cuda_stream);
-template void CalPadGradNHWC<float>(const size_t size, const float* dy, const int num, const int old_height,
-                                    const int old_width, const int channels, const int padded_height,
-                                    const int padded_width, const int pad_top, const int pad_left, float* dx,
-                                    cudaStream_t cuda_stream);
-template void CalPadGradNHWC<half>(const size_t size, const half* dy, const int num, const int old_height,
-                                   const int old_width, const int channels, const int padded_height,
-                                   const int padded_width, const int pad_top, const int pad_left, half* dx,
-                                   cudaStream_t cuda_stream);
-template void CalPadGeneral<float>(const float *input, float *output, const size_t *input_shape, const size_t *strides,
-                                   const int *paddings, const int input_size, const size_t input_rank,
-                                   cudaStream_t cuda_stream);
-template void CalPadGeneral<half>(const half *input, half *output, const size_t *input_shape, const size_t *strides,
-                                  const int *paddings, const int input_size, const size_t input_rank,
-                                  cudaStream_t cuda_stream);
-template void CalPadGeneral<int>(const int *input, int *output, const size_t *input_shape, const size_t *strides,
-                                 const int *paddings, const int input_size, const size_t input_rank,
-                                 cudaStream_t cuda_stream);
-template void CalPad3d<float>(const size_t size, const float* input, const int num, const int channels,
-                              const int old_depth, const int old_height, const int old_width, const int padded_depth,
-                              const int padded_height, const int padded_width, const int pad_head, const int pad_top,
-                              const int pad_left, const float pad_value, float* output, cudaStream_t cuda_stream);
-template void CalPad3d<half>(const size_t size, const half* input, const int num, const int channels,
-                             const int old_depth, const int old_height, const int old_width, const int padded_depth,
-                             const int padded_height, const int padded_width, const int pad_head, const int pad_top,
-                             const int pad_left, const float pad_value, half* output, cudaStream_t cuda_stream);
-template void CalPadGrad3d<float>(const size_t size, const float* dy, const int num, const int channels,
-                                  const int old_depth, const int old_height, const int old_width,
-                                  const int padded_depth, const int padded_height, const int padded_width,
-                                  const int pad_head, const int pad_top, const int pad_left, float* dx,
-                                  cudaStream_t cuda_stream);
-template void CalPadGrad3d<half>(const size_t size, const half* dy, const int num, const int channels,
-                                 const int old_depth, const int old_height, const int old_width,
-                                 const int padded_depth, const int padded_height, const int padded_width,
-                                 const int pad_head, const int pad_top, const int pad_left, half* dx,
-                                 cudaStream_t cuda_stream);
-template void CalPadGradNDHWC<float>(const size_t size, const float *dy, const int num, const int old_depth,
-                                     const int old_height, const int old_width, const int channels,
-                                     const int padded_depth, const int padded_height, const int padded_width,
-                                     const int pad_head, const int pad_top, const int pad_left, float *dx,
-                                     cudaStream_t cuda_stream);
-template void CalPadGradNDHWC<half>(const size_t size, const half *dy, const int num, const int old_depth,
-                                    const int old_height, const int old_width, const int channels,
-                                    const int padded_depth, const int padded_height, const int padded_width,
-                                    const int pad_head, const int pad_top, const int pad_left, half *dx,
-                                    cudaStream_t cuda_stream);
+// Explicit instantiations: float and half for every entry point below, plus int for CalPadGeneral only.
+// NOTE(review): pad_value stays `float` in the half instantiations of CalPad / CalPadNHWC / CalPad3d.
+template CUDA_LIB_EXPORT void CalPad<float>(const size_t size, const float* input, const int num, const int channels,
+                                            const int old_height, const int old_width, const int padded_height,
+                                            const int padded_width, const int pad_top, const int pad_left,
+                                            float pad_value, float* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGrad<float>(const size_t size, const float* dy, const int num, const int channels,
+                                                const int old_height, const int old_width, const int padded_height,
+                                                const int padded_width, const int pad_top, const int pad_left,
+                                                float* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPad<half>(const size_t size, const half* input, const int num, const int channels,
+                                           const int old_height, const int old_width, const int padded_height,
+                                           const int padded_width, const int pad_top, const int pad_left,
+                                           float pad_value, half* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGrad<half>(const size_t size, const half* dy, const int num, const int channels,
+                                               const int old_height, const int old_width, const int padded_height,
+                                               const int padded_width, const int pad_top, const int pad_left, half* dx,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadNHWC<float>(const size_t size, const float* input, const int num,
+                                                const int old_height, const int old_width, const int channels,
+                                                const int padded_height, const int padded_width, const int pad_top,
+                                                const int pad_left, float pad_value, float* output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadNHWC<half>(const size_t size, const half* input, const int num,
+                                               const int old_height, const int old_width, const int channels,
+                                               const int padded_height, const int padded_width, const int pad_top,
+                                               const int pad_left, float pad_value, half* output,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGradNHWC<float>(const size_t size, const float* dy, const int num,
+                                                    const int old_height, const int old_width, const int channels,
+                                                    const int padded_height, const int padded_width, const int pad_top,
+                                                    const int pad_left, float* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGradNHWC<half>(const size_t size, const half* dy, const int num,
+                                                   const int old_height, const int old_width, const int channels,
+                                                   const int padded_height, const int padded_width, const int pad_top,
+                                                   const int pad_left, half* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGeneral<float>(const float *input, float *output, const size_t *input_shape,
+                                                   const size_t *strides, const int *paddings, const int input_size,
+                                                   const size_t input_rank, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGeneral<half>(const half *input, half *output, const size_t *input_shape,
+                                                  const size_t *strides, const int *paddings, const int input_size,
+                                                  const size_t input_rank, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGeneral<int>(const int *input, int *output, const size_t *input_shape,
+                                                 const size_t *strides, const int *paddings, const int input_size,
+                                                 const size_t input_rank, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPad3d<float>(const size_t size, const float* input, const int num, const int channels,
+                                              const int old_depth, const int old_height, const int old_width,
+                                              const int padded_depth, const int padded_height, const int padded_width,
+                                              const int pad_head, const int pad_top, const int pad_left,
+                                              const float pad_value, float* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPad3d<half>(const size_t size, const half* input, const int num, const int channels,
+                                             const int old_depth, const int old_height, const int old_width,
+                                             const int padded_depth, const int padded_height, const int padded_width,
+                                             const int pad_head, const int pad_top, const int pad_left,
+                                             const float pad_value, half* output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGrad3d<float>(const size_t size, const float* dy, const int num, const int channels,
+                                                  const int old_depth, const int old_height, const int old_width,
+                                                  const int padded_depth, const int padded_height,
+                                                  const int padded_width, const int pad_head, const int pad_top,
+                                                  const int pad_left, float* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGrad3d<half>(const size_t size, const half* dy, const int num, const int channels,
+                                                 const int old_depth, const int old_height, const int old_width,
+                                                 const int padded_depth, const int padded_height,
+                                                 const int padded_width, const int pad_head, const int pad_top,
+                                                 const int pad_left, half* dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGradNDHWC<float>(const size_t size, const float *dy, const int num,
+                                                     const int old_depth, const int old_height, const int old_width,
+                                                     const int channels, const int padded_depth,
+                                                     const int padded_height, const int padded_width,
+                                                     const int pad_head, const int pad_top, const int pad_left,
+                                                     float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalPadGradNDHWC<half>(const size_t size, const half *dy, const int num,
+                                                    const int old_depth, const int old_height, const int old_width,
+                                                    const int channels, const int padded_depth, const int padded_height,
+                                                    const int padded_width, const int pad_head, const int pad_top,
+                                                    const int pad_left, half *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh
new file mode 100644
index 00000000000..081a98c1523
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>  // dims passed as num, channels, old_height, old_width; pad_value is float for every T
+CUDA_LIB_EXPORT void CalPad(const size_t size, const T* input, const int num, const int channels, const int old_height,
+                            const int old_width, const int padded_height, const int padded_width, const int pad_top,
+                            const int pad_left, float pad_value, T* output, cudaStream_t cuda_stream);
+template <typename T>  // CalPad argument list minus pad_value; dy is read-only, dx is the writable buffer
+CUDA_LIB_EXPORT void CalPadGrad(const size_t size, const T* dy, const int num, const int channels, const int old_height,
+                                const int old_width, const int padded_height, const int padded_width, const int pad_top,
+                                const int pad_left, T* dx, cudaStream_t cuda_stream);
+template <typename T>  // same as CalPad except channels is passed after old_width
+CUDA_LIB_EXPORT void CalPadNHWC(const size_t size, const T* input, const int num, const int old_height,
+                                const int old_width, const int channels, const int padded_height,
+                                const int padded_width, const int pad_top, const int pad_left, float pad_value,
+                                T* output, cudaStream_t cuda_stream);
+template <typename T>  // NOTE(review): buffers are named input/output here; the other *Grad declarations use dy/dx
+CUDA_LIB_EXPORT void CalPadGradNHWC(const size_t size, const T* input, const int num, const int old_height,
+                                    const int old_width, const int channels, const int padded_height,
+                                    const int padded_width, const int pad_top, const int pad_left, T* output,
+                                    cudaStream_t cuda_stream);
+template <typename T>  // takes input_shape/strides/paddings pointers plus input_rank instead of per-axis ints
+CUDA_LIB_EXPORT void CalPadGeneral(const T *input, T *output, const size_t *input_shape, const size_t *strides,
+                                   const int *paddings, const int input_size, const size_t input_rank,
+                                   cudaStream_t cuda_stream);
+template <typename T>  // CalPad plus a depth axis: old_depth, padded_depth and pad_head are added
+CUDA_LIB_EXPORT void CalPad3d(const size_t size, const T* input, const int num, const int channels, const int old_depth,
+                              const int old_height, const int old_width, const int padded_depth,
+                              const int padded_height, const int padded_width, const int pad_head, const int pad_top,
+                              const int pad_left, const float pad_value, T* output, cudaStream_t cuda_stream);
+template <typename T>  // CalPad3d argument list minus pad_value; dy is read-only, dx is the writable buffer
+CUDA_LIB_EXPORT void CalPadGrad3d(const size_t size, const T* dy, const int num, const int channels,
+                                  const int old_depth, const int old_height, const int old_width,
+                                  const int padded_depth, const int padded_height, const int padded_width,
+                                  const int pad_head, const int pad_top, const int pad_left, T* dx,
+                                  cudaStream_t cuda_stream);
+template <typename T>  // same as CalPad3d except channels is passed after old_width
+CUDA_LIB_EXPORT void CalPadNDHWC(const size_t size, const T *input, const int num, const int old_depth,
+                                 const int old_height, const int old_width, const int channels, const int padded_depth,
+                                 const int padded_height, const int padded_width, const int pad_head, const int pad_top,
+                                 const int pad_left, const float pad_value, T *output, cudaStream_t cuda_stream);
+template <typename T>  // same as CalPadGrad3d except channels is passed after old_width
+CUDA_LIB_EXPORT void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int old_depth,
+                                     const int old_height, const int old_width, const int channels,
+                                     const int padded_depth, const int padded_height, const int padded_width,
+                                     const int pad_head, const int pad_top, const int pad_left, T *dx,
+                                     cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu
similarity index 83%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu
index 9cfb1948b7c..ed560e6cd06 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu
@@ -14,9 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 __global__ void CalPReLUGradKernel(size_t size, size_t weight_size, size_t per_channel_size,
@@ -62,7 +61,7 @@ void CalPReLUGrad(size_t size, size_t weight_size, size_t per_channel_size,
   return;
 }
 
-template void CalPReLUGrad(size_t, size_t, size_t, const float *, const float *, const float *,
-                           float *, float *, float *, cudaStream_t);
-template void CalPReLUGrad(size_t, size_t, size_t, const half *, const half *, const half *,
-                           half *, half *, float *, cudaStream_t);
+template CUDA_LIB_EXPORT void CalPReLUGrad(size_t, size_t, size_t, const float *, const float *, const float *,
+                                           float *, float *, float *, cudaStream_t);  // T=float
+template CUDA_LIB_EXPORT void CalPReLUGrad(size_t, size_t, size_t, const half *, const half *, const half *,
+                                           half *, half *, float *, cudaStream_t);  // T=half; dw_array stays float *
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh
index 90bbda6bc05..8f31c2ed5f9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size,
-                  const T *dy, const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_
+CUDA_LIB_EXPORT void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size, const T *dy,
+                                  const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu
similarity index 81%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu
index 2b1f687bbfb..62d3759776b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void CalPReLUKernel(size_t size, size_t weight_size, size_t per_channel_size,
@@ -32,5 +33,5 @@ void CalPReLU(size_t size, size_t weight_size, size_t per_channel_size,
                                                                     input, weight, output);
 }
 
-template void CalPReLU(size_t, size_t, size_t, const float *, const float *, float *, cudaStream_t);
-template void CalPReLU(size_t, size_t, size_t, const half *, const half *, half *, cudaStream_t);
+template CUDA_LIB_EXPORT void CalPReLU(size_t, size_t, size_t, const float *, const float *, float *, cudaStream_t);
+template CUDA_LIB_EXPORT void CalPReLU(size_t, size_t, size_t, const half *, const half *, half *, cudaStream_t);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh
index eb8c45486f5..0d74034c63f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size,
-              const T *input, const T *weight, T *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_
+CUDA_LIB_EXPORT void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size,
+                              const T *input, const T *weight, T *output, cudaStream_t cuda_stream);  // T: float, half
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu
similarity index 75%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu
index 77b39e037be..c8790db1dbe 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu
@@ -18,9 +18,8 @@
 #include <math.h>
 #include <float.h>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 __global__ void PSROIPoolInitKernel(size_t size_init, T *input) {
@@ -113,19 +112,21 @@ void PSROIPoolForwardLauncher(
     }
 }
 
-template void PSROIPoolForwardLauncher<float>(
-    const float* input, const float spatial_scale, const int rois_number, const int feature_height,
-    const int feature_width, const int feature_channels, const int pooled_height,
-    const int pooled_width, const float* roi_boxes,
-    const int group_size, const int output_channels,
-    float* output_data, int* mapping_channel, cudaStream_t stream);
+template CUDA_LIB_EXPORT void PSROIPoolForwardLauncher<float>(const float* input, const float spatial_scale,
+                                                              const int rois_number, const int feature_height,
+                                                              const int feature_width, const int feature_channels,
+                                                              const int pooled_height, const int pooled_width,
+                                                              const float* roi_boxes, const int group_size,
+                                                              const int output_channels, float* output_data,
+                                                              int* mapping_channel, cudaStream_t stream);
 
-template void PSROIPoolForwardLauncher<half>(
-    const half *input, const half spatial_scale, const int rois_number, const int feature_height,
-    const int feature_width, const int feature_channels, const int pooled_height,
-    const int pooled_width, const half *roi_boxes,
-    const int group_size, const int output_channels,
-    half *output_data, int* mapping_channel, cudaStream_t stream);
+template CUDA_LIB_EXPORT void PSROIPoolForwardLauncher<half>(const half *input, const half spatial_scale,
+                                                             const int rois_number, const int feature_height,
+                                                             const int feature_width, const int feature_channels,
+                                                             const int pooled_height, const int pooled_width,
+                                                             const half *roi_boxes, const int group_size,
+                                                             const int output_channels, half *output_data,
+                                                             int* mapping_channel, cudaStream_t stream);
 
 template <typename T>
 __global__ void PSROIPoolBackward(const int nthreads, const T* input_diff,
@@ -209,12 +210,18 @@ void PSROIPoolBackwardLauncher(const T* input_diff, const int* mapping_channel,
     }
 }
 
-template void PSROIPoolBackwardLauncher<float>(const float* input_diff, const int* mapping_channel,
-    const int batch_size, const int rois_number, const float spatial_scale, const int feature_channels,
-    const int feature_height, const int feature_width, const int pooled_width, const int pooled_height,
-    const int output_channels, float* output_diff, const float* roi_boxes, cudaStream_t stream);
+template CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher<float>(const float* input_diff, const int* mapping_channel,
+                                                               const int batch_size, const int rois_number,
+                                                               const float spatial_scale, const int feature_channels,
+                                                               const int feature_height, const int feature_width,
+                                                               const int pooled_width, const int pooled_height,
+                                                               const int output_channels, float* output_diff,
+                                                               const float* roi_boxes, cudaStream_t stream);
 
-template void PSROIPoolBackwardLauncher<half>(const half* input_diff, const int* mapping_channel, const int batch_size,
-    const int rois_number, const half spatial_scale, const int feature_channels, const int feature_height,
-    const int feature_width, const int pooled_width, const int pooled_height, const int output_channels,
-    half* output_diff, const half* roi_boxes, cudaStream_t stream);
+template CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher<half>(const half* input_diff, const int* mapping_channel,
+                                                              const int batch_size, const int rois_number,
+                                                              const half spatial_scale, const int feature_channels,
+                                                              const int feature_height, const int feature_width,
+                                                              const int pooled_width, const int pooled_height,
+                                                              const int output_channels, half* output_diff,
+                                                              const half* roi_boxes, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh
new file mode 100644
index 00000000000..170906170b1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>  // NOTE: pooled_height precedes pooled_width here; PSROIPoolBackwardLauncher reverses them
+CUDA_LIB_EXPORT void PSROIPoolForwardLauncher(const T* input, const T spatial_scale, const int rois_number,
+                                              const int feature_height, const int feature_width,
+                                              const int feature_channels, const int pooled_height,
+                                              const int pooled_width, const T* roi_boxes, const int group_size,
+                                              const int output_channels, T* output_data, int* mapping_channel,
+                                              cudaStream_t stream);  // .cu instantiates T=float and T=half
+
+template <typename T>  // NOTE: pooled_width precedes pooled_height here, unlike PSROIPoolForwardLauncher
+CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher(const T* input_diff, const int* mapping_channel, const int batch_size,
+                                               const int rois_number, const T spatial_scale, const int feature_channels,
+                                               const int feature_height, const int feature_width,
+                                               const int pooled_width, const int pooled_height,
+                                               const int output_channels, T* output_diff, const T* roi_boxes,
+                                               cudaStream_t stream);  // .cu instantiates T=float and T=half
+
+#endif   // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu
similarity index 61%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu
index 79dedd603e0..000de8f5efa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename S>
 __global__ void RandomCategorical(const size_t num_samples, double** dev_rand, double** dev_cdf,
@@ -72,19 +73,22 @@ void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_siz
     GetCdf<<<GET_BLOCKS(size_cdf), GET_THREADS, 0, cuda_stream>>>(logits_addr, dev_cdf, batch_size, num_classes);
 }
 
-template void GetCdfKernel<half>(const half *logits_addr, double** dev_cdf, const size_t batch_size,
-        const size_t num_classes, cudaStream_t cuda_stream);
-template void GetCdfKernel<float>(const float *logits_addr, double** dev_cdf, const size_t batch_size,
-        const size_t num_classes, cudaStream_t cuda_stream);
-template void GetCdfKernel<double>(const double *logits_addr, double** dev_cdf, const size_t batch_size,
-        const size_t num_classes, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetCdfKernel<half>(const half *logits_addr, double** dev_cdf, const size_t batch_size,
+                                                 const size_t num_classes, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetCdfKernel<float>(const float *logits_addr, double** dev_cdf, const size_t batch_size,
+                                                  const size_t num_classes, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void GetCdfKernel<double>(const double *logits_addr, double** dev_cdf, const size_t batch_size,
+                                                   const size_t num_classes, cudaStream_t cuda_stream);
 
-template void RandomCategoricalKernel<int16_t>(const size_t num_samples,
-        double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes,
-        int16_t *output_addr, cudaStream_t cuda_stream);
-template void RandomCategoricalKernel<int>(const size_t num_samples,
-        double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes,
-        int *output_addr, cudaStream_t cuda_stream);
-template void RandomCategoricalKernel<int64_t>(const size_t num_samples,
-        double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes,
-        int64_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RandomCategoricalKernel<int16_t>(const size_t num_samples, double** dev_rand,
+                                                               double** dev_cdf, const size_t batch_size,
+                                                               const size_t num_classes, int16_t *output_addr,
+                                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RandomCategoricalKernel<int>(const size_t num_samples, double** dev_rand,
+                                                           double** dev_cdf, const size_t batch_size,
+                                                           const size_t num_classes, int *output_addr,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RandomCategoricalKernel<int64_t>(const size_t num_samples, double** dev_rand,
+                                                               double** dev_cdf, const size_t batch_size,
+                                                               const size_t num_classes, int64_t *output_addr,
+                                                               cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh
new file mode 100644
index 00000000000..fcaacde2018
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // T (logits type): half, float, double are instantiated in the .cu; dev_cdf is always double**
+CUDA_LIB_EXPORT void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_size,
+                                  const size_t num_classes, cudaStream_t cuda_stream);
+template <typename S>  // S (output_addr type): int16_t, int, int64_t are instantiated in the .cu
+CUDA_LIB_EXPORT void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, double** dev_cdf,
+                                             const size_t batch_size, const size_t num_classes, S *output_addr,
+                                             cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu
similarity index 91%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu
index da0f65ba428..a0a9b3c2d72 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh"
 #include <algorithm>
 
 int RcwmRoundUpPower2(int v) {
@@ -257,8 +257,9 @@ void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size,
                                                               index_buff, rank_buff, Tnum_buff);
 }
 
-template void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, const int &d2,
-                                      const int &d3, const int &d4, const int &d5, const int &seedc, const int &count,
-                                      const bool *input, int *output_index, bool *output_mask, int *index_buff,
-                                      int *mask_buff, int *rank_buff, int *Tnum_buff, int *tmp_buff,
-                                      curandState *globalState, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1,
+                                                      const int &d2, const int &d3, const int &d4, const int &d5,
+                                                      const int &seedc, const int &count, const bool *input,
+                                                      int *output_index, bool *output_mask, int *index_buff,
+                                                      int *mask_buff, int *rank_buff, int *Tnum_buff, int *tmp_buff,
+                                                      curandState *globalState, cudaStream_t stream);  // T=bool, S=int
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh
new file mode 100644
index 00000000000..e64e60a78e5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define BLOCKSIZE 256  // NOTE(review): unprefixed macro in a header; every includer sees it
+#define MAX_DIMENSION 5  // NOTE(review): same concern; unprefixed and header-wide
+
+template <typename T, typename S, typename K>  // T is absent from the parameter list, so it cannot be deduced
+CUDA_LIB_EXPORT void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index,
+                                                  K *output_mask, cudaStream_t stream);
+
+template <typename T, typename S>  // the .cu instantiates T=bool, S=int; scalar ints are passed by const reference
+CUDA_LIB_EXPORT void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1,
+                                             const int &d2, const int &d3, const int &d4, const int &d5,
+                                             const int &seedc, const int &count, const T *input, S *output_index,
+                                             T *output_mask, S *index_buff, S *mask_buff, S *rank_buff,
+                                             S *Tnum_buff, S *tmp_buff, curandState *globalState, cudaStream_t stream);
+
+CUDA_LIB_EXPORT int RcwmRoundUpPower2(int v);  // non-template; defined in random_choice_with_mask_impl.cu
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu
similarity index 71%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu
index 2d1998f7dd6..e6c5a33b0e0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu
@@ -100,17 +100,17 @@ void UniformReal(int seed, int seed2, curandState *globalState, T *output, size_
   return;
 }
 
-template void StandardNormal<float>(int seed, int seed2, curandState *globalState,
-                                    float *output, size_t count, cudaStream_t cuda_stream);
-template void StandardNormal<int>(int seed, int seed2, curandState *globalState,
-                                  int *output, size_t count, cudaStream_t cuda_stream);
-template bool UniformInt<float>(int seed, int seed2, curandState *globalState, float *input1, size_t input_size_1,
-                                float *input2, size_t input_size_2, float *output, size_t count,
-                              cudaStream_t cuda_stream);
-template bool UniformInt<int>(int seed, int seed2, curandState *globalState, int *input1, size_t input_size_1,
-                              int *input2, size_t input_size_2, int *output, size_t count,
-                              cudaStream_t cuda_stream);
-template void UniformReal<float>(int seed, int seed2, curandState *globalState,
-                                 float *output, size_t count, cudaStream_t cuda_stream);
-template void UniformReal<int>(int seed, int seed2, curandState *globalState,
-                               int *output, size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StandardNormal<float>(int seed, int seed2, curandState *globalState,
+                                                    float *output, size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StandardNormal<int>(int seed, int seed2, curandState *globalState,
+                                                  int *output, size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT bool UniformInt<float>(int seed, int seed2, curandState *globalState, float *input1,
+                                                size_t input_size_1, float *input2, size_t input_size_2, float *output,
+                                                size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT bool UniformInt<int>(int seed, int seed2, curandState *globalState, int *input1,
+                                              size_t input_size_1, int *input2, size_t input_size_2, int *output,
+                                              size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UniformReal<float>(int seed, int seed2, curandState *globalState,
+                                                 float *output, size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UniformReal<int>(int seed, int seed2, curandState *globalState,
+                                               int *output, size_t count, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh
new file mode 100644
index 00000000000..00c4ba3656b
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_
+#include <curand_kernel.h>
+#include <random>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>
+CUDA_LIB_EXPORT void StandardNormal(int seed, int seed2, curandState *globalState,
+                                    T *output, size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT bool UniformInt(int seed, int seed2, curandState *globalState,
+                                T *input1, size_t input_size_1, T *input2, size_t input_size_2,
+                                T *output, size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void UniformReal(int seed, int seed2, curandState *globalState,
+                                 T *output, size_t count, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu
index afc3a50a8aa..997dbaaf3c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu
@@ -16,7 +16,6 @@
 
 #include <cuda_runtime.h>
 #include "range_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void Range(const int size, const float start, const float limit, const float delta, const T *input,
@@ -32,8 +31,8 @@ void CalRange(const int size, const float start, const float limit, const float
   Range<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, start, limit, delta, input, output);
   return;
 }
-template void CalRange<float>(const int size, const float start, const float limit, const float delta,
-                              const float *input, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalRange<float>(const int size, const float start, const float limit, const float delta,
+                                              const float *input, float *output, cudaStream_t cuda_stream);
 
-template void CalRange<int>(const int size, const float start, const float limit, const float delta, const int *input,
-                            int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalRange<int>(const int size, const float start, const float limit, const float delta,
+                                            const int *input, int *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh
index d0fdbc5948b..c1aa34f1842 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh
@@ -1,24 +1,23 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_variance, T *dx, float *bn_scale,
-                      float *bn_bias, double epsilon, int N, int C, int H, int W, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void CalRange(const int size, const float start, const float limit, const float delta, const T *input,
+                              T *output, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu
similarity index 91%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu
index d992020489b..57468823855 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh"
 
 // Kernel started from here
 #define L2_RCWM_HELPER(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, IS_DESCEND)                      \
@@ -148,5 +148,6 @@ void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input
   RCWMScaleK<T, S, K>(seedc, input_size, input, count, output_index, output_mask, stream);
 }
 
-template void CalRandomChoiceWithMaskSmall<float, int, bool>(int input_size, int seedc, int count, bool *input,
-                                                          int *output_index, bool *output_mask, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalRandomChoiceWithMaskSmall<float, int, bool>(int input_size, int seedc, int count,
+                                                                             bool *input, int *output_index,
+                                                                             bool *output_mask, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu
similarity index 62%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu
index 9fbc8013dbd..a8c0831acc6 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu
@@ -15,9 +15,7 @@
  */
 
 #include <cuda_runtime.h>
-
 #include "real_to_complex_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 template <typename T>
 __global__ void ToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream) {
@@ -33,8 +31,11 @@ void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cu
   ToComplex<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output, cuda_stream);
 }
 
-template void RealToComplex<double>(const size_t size, const double *input, double *output, cudaStream_t cuda_stream);
-template void RealToComplex<float>(const size_t size, const float *input, float *output, cudaStream_t cuda_stream);
-template void RealToComplex<int>(const size_t size, const int *input, int *output, cudaStream_t cuda_stream);
-template void RealToComplex<int64_t>(const size_t size, const int64_t *input, int64_t *output,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RealToComplex<double>(const size_t size, const double *input, double *output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RealToComplex<float>(const size_t size, const float *input, float *output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RealToComplex<int>(const size_t size, const int *input, int *output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RealToComplex<int64_t>(const size_t size, const int64_t *input, int64_t *output,
+                                                     cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh
old mode 100644
new mode 100755
similarity index 60%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh
index 2d0aabc5d44..7a5daa94a8b
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalRange(const int size, const float start, const float limit, const float delta, const T *input, T *output,
-              cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH
+CUDA_LIB_EXPORT void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu
index 29084bf0f84..926e2b31b22 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void CalReLUGradKernel(int size, T *dy, T *y, T *dx) {
@@ -30,11 +30,11 @@ void CalReLUGrad(int size, T *dy, T *y, T *dx, cudaStream_t cuda_stream) {
   return;
 }
 
-template void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, int8_t *dy, int8_t *y, int8_t *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t cuda_stream);
-template void CalReLUGrad(int size, uint8_t *dy, uint8_t *y, uint8_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, int8_t *dy, int8_t *y, int8_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLUGrad(int size, uint8_t *dy, uint8_t *y, uint8_t *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh
similarity index 59%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh
index 9a392aef20d..b465b918301 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
+CUDA_LIB_EXPORT void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu
new file mode 100644
index 00000000000..a4cef64d7f5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+template <typename T>
+__global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) {
+  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0);
+  }
+}
+
+template <typename T>
+void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
+  CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
+}
+
+template CUDA_LIB_EXPORT void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, int64_t *input_addr, int64_t *output_addr, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReLU(int size, uint8_t *input_addr, uint8_t *output_addr, cudaStream_t cuda_stream);
+
+template <typename T>
+__global__ void ReluV2Kernel(const size_t num, const T *x, T *y, uint32_t *mask) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) {
+    T v = x[i];
+    bool p = v > static_cast<T>(0);
+    y[i] = p ? v : static_cast<T>(0);
+
+    auto warp_predict = BallotSync(p, __activemask());
+    if (LaneId() == 0) {
+      mask[WarpId(i)] = warp_predict;
+    }
+  }
+}
+
+template <typename T>
+void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream) {
+  ReluV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, x, y, mask);
+}
+
+template <typename T>  // dx[i] = dy[i] when the bit for this lane in mask[WarpId(i)] is set, otherwise 0.
+__global__ void ReluGradV2Kernel(const size_t num, const T *dy, const uint32_t *mask, T *dx) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) {
+    bool p = mask[WarpId(i)] & (1u << LaneId());  // unsigned 1: never shifts into int's sign bit
+    dx[i] = p ? dy[i] : static_cast<T>(0);
+  }
+}
+
+template <typename T>
+void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream) {
+  ReluGradV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, dy, mask, dx);
+}
+
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const double *x, double *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const float *x, float *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const half *x, half *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int8_t *x, int8_t *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int16_t *x, int16_t *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int32_t *x, int32_t *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int64_t *x, int64_t *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluV2(const size_t num, const uint8_t *x, uint8_t *y, uint32_t *mask,
+                                     cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const double *dy, const uint32_t *mask, double *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const float *dy, const uint32_t *mask, float *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const half *dy, const uint32_t *mask, half *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int8_t *dy, const uint32_t *mask, int8_t *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int16_t *dy, const uint32_t *mask, int16_t *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int32_t *dy, const uint32_t *mask, int32_t *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *mask, int64_t *dx,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx,
+                                         cudaStream_t cuda_stream);
+
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh
index 134aed477d7..9b68cd2857b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalReLU(int input_size, T *input_addr, T *output_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalReLU(int input_size, T *input_addr, T *output_addr, cudaStream_t cuda_stream);
 
 template <typename T>
-void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream);
 template <typename T>
-void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_
+CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu
similarity index 88%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu
index 3c508bee832..131d6d15e8b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "include/cuda_fp16.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 template <typename T>
 __global__ void ResizeBilinear(const T *input, const int n, const int c, const int input_h, const int input_w,
   const int output_h, const int output_w, const int nchw, const int chw, const int hw, const float h_scale,
@@ -157,9 +155,11 @@ void CalResizeBilinearGrad(const float *input, const int n, const int c, const i
   return;
 }
 
-template void CalResizeBilinear<float>(const float *input, const int n, const int c, const int input_h,
-  const int input_w, const int output_h, const int output_w, const float h_scale, const float w_scale, float *output,
-  cudaStream_t cuda_stream);
-template void CalResizeBilinear<half>(const half *input, const int n, const int c, const int input_h,
-  const int input_w, const int output_h, const int output_w, const float h_scale, const float w_scale, half *output,
-  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeBilinear<float>(const float *input, const int n, const int c, const int input_h,
+                                                       const int input_w, const int output_h, const int output_w,
+                                                       const float h_scale, const float w_scale, float *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeBilinear<half>(const half *input, const int n, const int c, const int input_h,
+                                                      const int input_w, const int output_h, const int output_w,
+                                                      const float h_scale, const float w_scale, half *output,
+                                                      cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh
new file mode 100644
index 00000000000..710b5d03886
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "include/cuda_fp16.h"
+template <typename T>
+CUDA_LIB_EXPORT void CalResizeBilinear(const T *input, const int n_, const int c_, const int input_h_,
+                                       const int input_w_, const int output_h_, const int output_w_,
+                                       const float h_scale, const float w_scale, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalResizeBilinearGrad(const half *input, const int n_, const int c_, const int input_h_,
+                                           const int input_w_, const int output_h_, const int output_w_,
+                                           const float h_scale, const float w_scale, half *output, float *interim,
+                                           cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalResizeBilinearGrad(const float *input, const int n_, const int c_, const int input_h_,
+                                           const int input_w_, const int output_h_, const int output_w_,
+                                           const float h_scale, const float w_scale, float *output, float *interim,
+                                           cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu
index e2b8209d1cf..0ea9ae0dc38 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu
@@ -18,8 +18,8 @@
 #include <stdint.h>
 #include <math.h>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh"
 
 template <typename T>
 __global__ void InitZero(T *output, const int output_size) {
@@ -76,15 +76,19 @@ void CalResizeNearestNeighborGrad(const int input_size, const T *input, const in
   return;
 }
 
-template void CalResizeNearestNeighborGrad<float>(const int input_size, const float *input, const int s1, const int s2,
-                                                  const int s3, const int s4, float *output, const int d1, const int d2,
-                                                  const int d3, const int d4, bool align_corners, float h_scale,
-                                                  float w_scale, cudaStream_t cuda_stream);
-template void CalResizeNearestNeighborGrad<half>(const int input_size, const half *input, const int s1, const int s2,
-                                                 const int s3, const int s4, half *output, const int d1, const int d2,
-                                                 const int d3, const int d4, bool align_corners, float h_scale,
-                                                 float w_scale, cudaStream_t cuda_stream);
-template void CalResizeNearestNeighborGrad<int>(const int input_size, const int *input, const int s1, const int s2,
-                                                const int s3, const int s4, int *output, const int d1, const int d2,
-                                                const int d3, const int d4, bool align_corners, float h_scale,
-                                                float w_scale, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad<float>(const int input_size, const float *input,
+                                                                  const int s1, const int s2, const int s3,
+                                                                  const int s4, float *output, const int d1,
+                                                                  const int d2, const int d3, const int d4,
+                                                                  bool align_corners, float h_scale, float w_scale,
+                                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad<half>(const int input_size, const half *input, const int s1,
+                                                                 const int s2, const int s3, const int s4, half *output,
+                                                                 const int d1, const int d2, const int d3, const int d4,
+                                                                 bool align_corners, float h_scale, float w_scale,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad<int>(const int input_size, const int *input, const int s1,
+                                                                const int s2, const int s3, const int s4, int *output,
+                                                                const int d1, const int d2, const int d3, const int d4,
+                                                                bool align_corners, float h_scale, float w_scale,
+                                                                cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh
new file mode 100644
index 00000000000..ec156628e7c
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define RESIZENEARESTNEIGHBORGRAD_DIMENSION 4
+
+template <typename T>  // instantiated for float, half and int in resize_nearest_neighbor_grad_impl.cu
+CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad(const int input_size, const T *input, const int s1, const int s2,
+                                                  const int s3, const int s4, T *output, const int d1, const int d2,
+                                                  const int d3, const int d4, bool align_corners, float h_scale,
+                                                  float w_scale, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu
similarity index 63%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu
index 3186b1c4566..ac80d937697 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu
@@ -18,7 +18,8 @@
 #include <stdint.h>
 #include <math.h>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void ResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3,
@@ -65,15 +66,18 @@ void CalResizeNearestNeighbor(const int size, const T *input, const int s1, cons
   return;
 }
 
-template void CalResizeNearestNeighbor<float>(const int size, const float *input, const int s1, const int s2,
-                                              const int s3, const int s4, float *output, const int d1, const int d2,
-                                              const int d3, const int d4, bool align_corners, float h_scale,
-                                              float w_scale, cudaStream_t cuda_stream);
-template void CalResizeNearestNeighbor<half>(const int size, const half *input, const int s1, const int s2,
-                                             const int s3, const int s4, half *output, const int d1, const int d2,
-                                             const int d3, const int d4, bool align_corners, float h_scale,
-                                             float w_scale, cudaStream_t cuda_stream);
-template void CalResizeNearestNeighbor<int>(const int size, const int *input, const int s1, const int s2, const int s3,
-                                            const int s4, int *output, const int d1, const int d2, const int d3,
-                                            const int d4, bool align_corners, float h_scale, float w_scale,
-                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighbor<float>(const int size, const float *input, const int s1,
+                                                              const int s2, const int s3, const int s4, float *output,
+                                                              const int d1, const int d2, const int d3, const int d4,
+                                                              bool align_corners, float h_scale, float w_scale,
+                                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighbor<half>(const int size, const half *input, const int s1,
+                                                             const int s2, const int s3, const int s4, half *output,
+                                                             const int d1, const int d2, const int d3, const int d4,
+                                                             bool align_corners, float h_scale, float w_scale,
+                                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalResizeNearestNeighbor<int>(const int size, const int *input, const int s1,
+                                                            const int s2, const int s3, const int s4, int *output,
+                                                            const int d1, const int d2, const int d3, const int d4,
+                                                            bool align_corners, float h_scale, float w_scale,
+                                                            cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh
new file mode 100644
index 00000000000..b2651e1252d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define RESIZENEARESTNEIGHBOR_DIMENSION 4
+
+template <typename T>  // instantiated for float, half and int in resize_nearest_neighbor_impl.cu
+CUDA_LIB_EXPORT void CalResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3,
+                                              const int s4, T *output, const int d1, const int d2, const int d3,
+                                              const int d4, bool align_corners, float h_scale, float w_scale,
+                                              cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu
new file mode 100644
index 00000000000..4e3b69d95ec
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu
@@ -0,0 +1,189 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Converts the flat element index `idx` into an N-D position, one coordinate per dimension.
+// Coordinates are written to pos[cur_thread_idx .. cur_thread_idx + shape_size - 1]; cum_shape[i] is the
+// number of elements spanned by one step along dimension i (see ComputeCumShape).
+__inline__ __device__ void IdxToPos(size_t idx, size_t *pos, size_t cur_thread_idx, size_t *cum_shape,
+                                    size_t shape_size) {
+  size_t rem_val = idx;
+  for (size_t i = 0; i < shape_size; i++) {  // size_t index: same type as shape_size, no signed/unsigned mix
+    pos[cur_thread_idx + i] = rem_val / cum_shape[i];
+    rem_val = rem_val % cum_shape[i];
+  }
+}
+
+// Inverse of IdxToPos: folds the N-D position stored at pos[cur_thread_idx ..] back into a flat 1D index.
+__inline__ __device__ size_t PosToIdx(size_t *pos, size_t cur_thread_idx, size_t *cum_shape, size_t shape_size) {
+  size_t idx = 0;
+  for (size_t i = 0; i < shape_size; i++) {  // size_t index avoids the signed/unsigned comparison
+    idx += pos[cur_thread_idx + i] * cum_shape[i];  // coordinate times the element stride of dimension i
+  }
+  return idx;
+}
+
+// Builds the cumulative shape, e.g. shape (2,2,5) => cum shape (10,5,1): entry i is the number of
+// elements spanned by a step of 1 along dimension i (in the example, +1 in dim 0 skips 10 values).
+// Required for converting a 1D index to a positional vector and back.
+__global__ void ComputeCumShape(const size_t *input_shape_ptr, size_t *input_shape_cum_ptr, size_t shape_size) {
+  // Accumulate in size_t: the running product of the dimensions can exceed INT_MAX for large tensors,
+  // which an int accumulator would truncate before it is stored.
+  size_t cur_val = 1;
+  for (size_t i = shape_size; i > 0; i--) {
+    // walk the dimensions in reverse and cumulatively build the shape
+    input_shape_cum_ptr[i - 1] = cur_val;
+    cur_val = cur_val * input_shape_ptr[i - 1];
+  }
+}
+template <typename T, typename S>  // T: element type of input/output; S: type of the seq_len entries
+__global__ void ReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
+                                const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                size_t *input_shape_cum_ptr, size_t shape_size, T *output) {
+  // base offset of this thread's scratch region in cur_pos_arr: each thread uses shape_size consecutive slots
+  size_t cur_thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  cur_thread_idx = cur_thread_idx * shape_size;
+  size_t cur_slice = 0;          // coordinate of the current element along batch_dim
+  size_t cur_slice_seq_len = 0;  // number of leading elements along seq_dim to reverse for this slice
+  size_t new_idx = 0;            // flat index of the mirrored source element in input
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    IdxToPos(idx, cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size);
+    cur_slice = cur_pos_arr[cur_thread_idx + batch_dim];  // all accesses to cur_pos_arr are offset per thread
+    cur_slice_seq_len = seq_len[cur_slice];
+    if (cur_slice_seq_len == 0) {  // treat length 0 as 1: reversing zero or one element leaves the data unchanged
+      cur_slice_seq_len = 1;
+    }
+    if (cur_pos_arr[cur_thread_idx + seq_dim] > (cur_slice_seq_len - 1)) {  // position is past the reversed prefix
+      // outside of the reversal range: copy the value through unchanged
+      output[idx] = input[idx];
+      continue;
+    }
+    // mirror the seq_dim coordinate within [0, cur_slice_seq_len - 1] to find the source element
+    cur_pos_arr[cur_thread_idx + seq_dim] =
+      (cur_slice_seq_len - 1) - cur_pos_arr[cur_thread_idx + seq_dim];                 // mirrored coordinate
+    new_idx = PosToIdx(cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size);  // flat index of the source
+    output[idx] = input[new_idx];
+  }
+  return;
+}
+
+// Host entry point: enqueues ComputeCumShape (one thread) and then ReverseSequence on cuda_stream.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
+                                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                        size_t *input_shape_cum_ptr, size_t shape_size, T *output,
+                                        cudaStream_t cuda_stream) {
+  ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size);
+  ReverseSequence<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output);
+}
+// Explicit instantiations of CalReverseSequence for every <T, S> pair listed below.
+template CUDA_LIB_EXPORT void CalReverseSequence<int8_t, int>(const size_t size, const int8_t *input,
+                                                              const int *seq_len, const int64_t batch_dim,
+                                                              const int64_t seq_dim, size_t *cur_pos_arr,
+                                                              const size_t *input_shape_ptr,
+                                                              size_t *input_shape_cum_ptr, size_t shape_size,
+                                                              int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int8_t, int64_t>(const size_t size, const int8_t *input,
+                                                                  const int64_t *seq_len, const int64_t batch_dim,
+                                                                  const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                  const size_t *input_shape_ptr,
+                                                                  size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                  int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int16_t, int>(const size_t size, const int16_t *input,
+                                                               const int *seq_len, const int64_t batch_dim,
+                                                               const int64_t seq_dim, size_t *cur_pos_arr,
+                                                               const size_t *input_shape_ptr,
+                                                               size_t *input_shape_cum_ptr, size_t shape_size,
+                                                               int16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int16_t, int64_t>(const size_t size, const int16_t *input,
+                                                                   const int64_t *seq_len, const int64_t batch_dim,
+                                                                   const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                   const size_t *input_shape_ptr,
+                                                                   size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                   int16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int, int>(const size_t size, const int *input, const int *seq_len,
+                                                           const int64_t batch_dim, const int64_t seq_dim,
+                                                           size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                                           size_t *input_shape_cum_ptr, size_t shape_size, int *output,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int, int64_t>(const size_t size, const int *input,
+                                                               const int64_t *seq_len, const int64_t batch_dim,
+                                                               const int64_t seq_dim, size_t *cur_pos_arr,
+                                                               const size_t *input_shape_ptr,
+                                                               size_t *input_shape_cum_ptr, size_t shape_size,
+                                                               int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int64_t, int>(const size_t size, const int64_t *input,
+                                                               const int *seq_len, const int64_t batch_dim,
+                                                               const int64_t seq_dim, size_t *cur_pos_arr,
+                                                               const size_t *input_shape_ptr,
+                                                               size_t *input_shape_cum_ptr, size_t shape_size,
+                                                               int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<int64_t, int64_t>(const size_t size, const int64_t *input,
+                                                                   const int64_t *seq_len, const int64_t batch_dim,
+                                                                   const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                   const size_t *input_shape_ptr,
+                                                                   size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                   int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<half, int>(const size_t size, const half *input, const int *seq_len,
+                                                            const int64_t batch_dim, const int64_t seq_dim,
+                                                            size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                                            size_t *input_shape_cum_ptr, size_t shape_size,
+                                                            half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<half, int64_t>(const size_t size, const half *input,
+                                                                const int64_t *seq_len, const int64_t batch_dim,
+                                                                const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                const size_t *input_shape_ptr,
+                                                                size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<float, int>(const size_t size, const float *input, const int *seq_len,
+                                                             const int64_t batch_dim, const int64_t seq_dim,
+                                                             size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                                             size_t *input_shape_cum_ptr, size_t shape_size,
+                                                             float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<float, int64_t>(const size_t size, const float *input,
+                                                                 const int64_t *seq_len, const int64_t batch_dim,
+                                                                 const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                 const size_t *input_shape_ptr,
+                                                                 size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                 float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<double, int>(const size_t size, const double *input,
+                                                              const int *seq_len, const int64_t batch_dim,
+                                                              const int64_t seq_dim, size_t *cur_pos_arr,
+                                                              const size_t *input_shape_ptr,
+                                                              size_t *input_shape_cum_ptr, size_t shape_size,
+                                                              double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<double, int64_t>(const size_t size, const double *input,
+                                                                  const int64_t *seq_len, const int64_t batch_dim,
+                                                                  const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                  const size_t *input_shape_ptr,
+                                                                  size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                  double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<bool, int>(const size_t size, const bool *input, const int *seq_len,
+                                                            const int64_t batch_dim, const int64_t seq_dim,
+                                                            size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                                            size_t *input_shape_cum_ptr, size_t shape_size,
+                                                            bool *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalReverseSequence<bool, int64_t>(const size_t size, const bool *input,
+                                                                const int64_t *seq_len, const int64_t batch_dim,
+                                                                const int64_t seq_dim, size_t *cur_pos_arr,
+                                                                const size_t *input_shape_ptr,
+                                                                size_t *input_shape_cum_ptr, size_t shape_size,
+                                                                bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh
new file mode 100644
index 00000000000..5a0f57451bc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: element type of input/output; S: type of the seq_len entries
+CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
+                                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                                        size_t *input_shape_cum_ptr, size_t shape_size, T *output,
+                                        cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu
new file mode 100644
index 00000000000..1b48174e5f4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuda_runtime.h>
+#include "reverse_v2_impl.cuh"
+#include "include/cuda_fp16.h"
+// For every input element, mirrors its coordinate along each listed axis and writes it to that position.
+template <typename T>
+__global__ void ReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides,
+                          const int64_t* axis, size_t input_size, size_t axis_size) {
+  for (int64_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; gt_id < input_size; gt_id += blockDim.x * gridDim.x) {
+    int64_t dst_index = gt_id;
+    for (size_t k = 0; k < axis_size; k++) {
+      const int64_t dim = axis[k];
+      const int64_t coord = (gt_id / strides[dim]) % input_shape[dim];
+      const int64_t mirrored = input_shape[dim] - coord - 1;
+      // shift the destination by the coordinate change times the stride of this axis
+      dst_index += ((mirrored - coord) * strides[dim]);
+    }
+    output[dst_index] = input[gt_id];
+  }
+}
+// Host launcher: enqueues the ReverseV2 kernel for input_size elements on cuda_stream.
+template <typename T>
+void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis,
+                  size_t input_size, size_t axis_size, cudaStream_t cuda_stream) {
+  ReverseV2<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(
+    input, output, input_shape, strides, axis, input_size, axis_size);
+}
+// Explicit instantiations of CalReverseV2: half, float, uint8_t, int16_t, int32_t and int64_t.
+template CUDA_LIB_EXPORT void CalReverseV2<half>(const half* input, half* output, const size_t* input_shape,
+                                                 const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                 size_t axis_size, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalReverseV2<float>(const float* input, float* output, const size_t* input_shape,
+                                                  const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                  size_t axis_size, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalReverseV2<uint8_t>(const uint8_t* input, uint8_t* output, const size_t* input_shape,
+                                                    const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                    size_t axis_size, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalReverseV2<int16_t>(const int16_t* input, int16_t* output, const size_t* input_shape,
+                                                    const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                    size_t axis_size, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalReverseV2<int32_t>(const int32_t* input, int32_t* output, const size_t* input_shape,
+                                                    const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                    size_t axis_size, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalReverseV2<int64_t>(const int64_t* input, int64_t* output, const size_t* input_shape,
+                                                    const int64_t* strides, const int64_t* axis, size_t input_size,
+                                                    size_t axis_size, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh
index e1aa136e5fb..7edfa6c7411 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh
@@ -13,9 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis,
-                  size_t input_size, size_t axis_size, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_
+CUDA_LIB_EXPORT void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides,
+                                  const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu
similarity index 92%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu
index 4a2b68e70bc..ee6f93f0f73 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu
@@ -15,8 +15,7 @@
  */
 
 #include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh"
 
 template <typename T>
 __global__ void RmsPropKernel(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable,
@@ -57,12 +56,12 @@ void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, co
                                                                          moment, gradients, size);
 }
 
-template
+template CUDA_LIB_EXPORT
 void RmsProp(const float* learning_rate, const float decay, const float momentum, const float epsilon,
-            float* variable, float* mean_square, float* moment, float* gradients, const size_t size,
-            cudaStream_t cuda_stream);
+             float* variable, float* mean_square, float* moment, float* gradients, const size_t size,
+             cudaStream_t cuda_stream);
 
-template
+template CUDA_LIB_EXPORT
 void RmsPropCenter(const float* learning_rate, const float* decay, const float* momentum, const float* epsilon,
                    float* variable, float* mean_gradients, float* mean_square, float*moment, float* gradients,
                    const size_t size, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh
new file mode 100644
index 00000000000..e87991023cc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Declaration of RmsProp; rmsprop_impl.cu explicitly instantiates and exports a float version.
+template <typename T>
+CUDA_LIB_EXPORT void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable,
+                             T* mean_square, T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream);
+// Declaration of RmsPropCenter (hyper-parameters passed by pointer); rmsprop_impl.cu exports a float instantiation.
+template <typename T>
+CUDA_LIB_EXPORT void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon,
+                                   T* variable, T* mean_gradients, T* mean_square, T* moment, T* gradients,
+                                   const size_t size, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu
similarity index 85%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu
index 11b783976ac..9c718b99b2a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu
@@ -16,7 +16,6 @@
 
 #include "roi_align_impl.cuh"
 #include "util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 
 inline __device__ int roi_cast_int(float x) { return __float2int_rd(x); }
 inline __device__ int roi_cast_int(half x) { return __half2int_rd(x); }
@@ -182,15 +181,16 @@ void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out
                                                                     height, width, pooled_height, pooled_width);
 }
 
-template void ROIAlign<float>(const float *x, const float *roi_boxes, int roi_rows, int roi_cols, float *out_data,
-                              const float spatial_scale, const int sample_num, int roi_end_mode, const int channels,
-                              const int height, const int width, const int pooled_height, const int pooled_width,
-                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ROIAlign<float>(const float *x, const float *roi_boxes, int roi_rows, int roi_cols,
+                                              float *out_data, const float spatial_scale, const int sample_num,
+                                              int roi_end_mode, const int channels, const int height, const int width,
+                                              const int pooled_height, const int pooled_width,
+                                              cudaStream_t cuda_stream);
 
-template void ROIAlign<half>(const half *x, const half *roi_boxes, int roi_rows, int roi_cols, half *out_data,
-                             const half spatial_scale, const int sample_num, int roi_end_mode, const int channels,
-                             const int height, const int width, const int pooled_height, const int pooled_width,
-                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ROIAlign<half>(const half *x, const half *roi_boxes, int roi_rows, int roi_cols,
+                                             half *out_data, const half spatial_scale, const int sample_num,
+                                             int roi_end_mode, const int channels, const int height, const int width,
+                                             const int pooled_height, const int pooled_width, cudaStream_t cuda_stream);
 
 template <typename T>
 __global__ void ROIAlignGradInitKernel(size_t size_init, T *dx) {
@@ -275,12 +275,14 @@ void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows,
     pooled_width);
 }
 
-template void ROIAlignGrad<float>(const float *dy, const float *roi_boxes, int batch_size, int roi_rows, int roi_cols,
-                                  float *dx, const float spatial_scale, const int sample_num, int roi_end_mode,
-                                  const int channels, const int height, const int width, const int pooled_height,
-                                  const int pooled_width, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ROIAlignGrad<float>(const float *dy, const float *roi_boxes, int batch_size, int roi_rows,
+                                                  int roi_cols, float *dx, const float spatial_scale,
+                                                  const int sample_num, int roi_end_mode, const int channels,
+                                                  const int height, const int width, const int pooled_height,
+                                                  const int pooled_width, cudaStream_t cuda_stream);
 
-template void ROIAlignGrad<half>(const half *dy, const half *roi_boxes, int batch_size, int roi_rows, int roi_cols,
-                                 half *dx, const half spatial_scale, const int sample_num, int roi_end_mode,
-                                 const int channels, const int height, const int width, const int pooled_height,
-                                 const int pooled_width, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ROIAlignGrad<half>(const half *dy, const half *roi_boxes, int batch_size, int roi_rows,
+                                                 int roi_cols, half *dx, const half spatial_scale, const int sample_num,
+                                                 int roi_end_mode, const int channels, const int height,
+                                                 const int width, const int pooled_height, const int pooled_width,
+                                                 cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh
new file mode 100644
index 00000000000..bd091ab4c02
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // roi_align_impl.cu exports explicit instantiations of ROIAlign for float and half
+CUDA_LIB_EXPORT void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out_data,
+                              const T spatial_scale, const int sample_num, int roi_end_mode, const int channels,
+                              const int height, const int width, const int pooled_height, const int pooled_width,
+                              cudaStream_t cuda_stream);
+// Declaration of ROIAlignGrad; roi_align_impl.cu exports explicit instantiations for float and half.
+template <typename T>
+CUDA_LIB_EXPORT void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows, int roi_cols, T *dx,
+                                  const T spatial_scale, const int sample_num, int roi_end_mode, const int channels,
+                                  const int height, const int width, const int pooled_height, const int pooled_width,
+                                  cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu
new file mode 100644
index 00000000000..5de93728879
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh"
+
+// Overwrite scatter: updates[pos] is stored to row indices[pos / inner_size] of input, column pos % inner_size.
+template <typename T, typename S>
+__global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *indices,
+                                    const T *updates, T *input) {
+  const size_t stride = blockDim.x * gridDim.x;
+  for (size_t src = blockIdx.x * blockDim.x + threadIdx.x; src < updates_size; src += stride) {
+    const size_t dst = indices[src / inner_size] * inner_size + src % inner_size;
+    input[dst] = updates[src];
+  }
+}
+
+// Additive scatter: MsAtomicAdd of updates[pos] into input row indices[pos / inner_size], column pos % inner_size.
+template <typename T, typename S>
+__global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
+                                 T *input) {
+  const size_t stride = blockDim.x * gridDim.x;
+  for (size_t src = blockIdx.x * blockDim.x + threadIdx.x; src < updates_size; src += stride) {
+    const size_t dst = indices[src / inner_size] * inner_size + src % inner_size;
+    MsAtomicAdd(&input[dst], updates[src]);
+  }
+}
+
+// Subtractive scatter: MsAtomicAdd of -updates[pos] into input row indices[pos / inner_size], column pos % inner_size.
+template <typename T, typename S>
+__global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
+                                 T *input) {
+  const size_t stride = blockDim.x * gridDim.x;
+  for (size_t src = blockIdx.x * blockDim.x + threadIdx.x; src < updates_size; src += stride) {
+    const size_t dst = indices[src / inner_size] * inner_size + src % inner_size;
+    MsAtomicAdd(&input[dst], -updates[src]);
+  }
+}
+
+// Host-side dispatcher: launches the scatter kernel selected by func_type on cuda_stream, covering
+// inner_size * indices_size elements of updates. A func_type other than SCATTER_FUNC_UPDATE, SCATTER_FUNC_ADD
+// or SCATTER_FUNC_SUB launches nothing.
+template <typename T, typename S>
+void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size,
+                 const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) {
+  const size_t total = inner_size * indices_size;
+
+  // Every kernel gets the same launch geometry, sized from the element count.
+  if (func_type == SCATTER_FUNC_UPDATE) {
+    ScatterUpdateKernel<<<GET_BLOCKS(total), GET_THREADS, 0, cuda_stream>>>(inner_size, total, indices, updates,
+                                                                            input);
+  } else if (func_type == SCATTER_FUNC_ADD) {
+    ScatterAddKernel<<<GET_BLOCKS(total), GET_THREADS, 0, cuda_stream>>>(inner_size, total, indices, updates, input);
+  } else if (func_type == SCATTER_FUNC_SUB) {
+    ScatterSubKernel<<<GET_BLOCKS(total), GET_THREADS, 0, cuda_stream>>>(inner_size, total, indices, updates, input);
+  }
+}
+
+template CUDA_LIB_EXPORT void ScatterFunc<float, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                      const size_t &indices_size, const int *indices,
+                                                      const float *updates, float *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<float, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                          const size_t &indices_size, const int64_t *indices,
+                                                          const float *updates, float *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<half, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                     const size_t &indices_size, const int *indices,
+                                                     const half *updates, half *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<half, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                         const size_t &indices_size, const int64_t *indices,
+                                                         const half *updates, half *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<int, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                    const size_t &indices_size, const int *indices, const int *updates,
+                                                    int *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<int, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                        const size_t &indices_size, const int64_t *indices,
+                                                        const int *updates, int *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<unsigned char, int>(enum ScatterFunctorType func_type,
+                                                              const size_t &inner_size, const size_t &indices_size,
+                                                              const int *indices, const unsigned char *updates,
+                                                              unsigned char *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<unsigned char, int64_t>(enum ScatterFunctorType func_type,
+                                                                  const size_t &inner_size, const size_t &indices_size,
+                                                                  const int64_t *indices, const unsigned char *updates,
+                                                                  unsigned char *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<int8_t, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                       const size_t &indices_size, const int *indices,
+                                                       const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ScatterFunc<int8_t, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                                           const size_t &indices_size, const int64_t *indices,
+                                                           const int8_t *updates, int8_t *input,
+                                                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh
similarity index 55%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh
index 8c264d0fbc8..4b6fb1d6cde 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 enum ScatterFunctorType {
   SCATTER_FUNC_UPDATE = 0,
@@ -27,7 +26,8 @@ enum ScatterFunctorType {
 };
 
 template <typename T, typename S>
-void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size,
-                 const S *indices, const T *updates, T *input, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size,
+                                 const size_t &indices_size, const S *indices, const T *updates, T *input,
+                                 cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu
new file mode 100644
index 00000000000..73da9c2a0f1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+// Adds update[read_index] (via MsAtomicAdd) into output at the offset built from one row of indices
+// (indices_dim_1 entries, each weighted by indices_stride[k]) plus the position inside the block_size-long slice.
+// Elements whose row holds a negative index, an index >= work_shape[k], or whose offset >= output_size are skipped.
+template <typename T, typename S>
+__global__ void ScatterNdKernel(S *indices, T *update, T *output, const size_t block_size, const size_t input_size,
+                                const size_t output_size, const size_t indices_dim_0, const size_t indices_dim_1,
+                                S *indices_stride, S *work_shape) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    // size_t rather than int, so the row/column split cannot truncate for large inputs.
+    const size_t i = read_index / block_size;
+    const size_t j = read_index % block_size;
+    size_t write_index = j;
+    bool out_bound = false;
+
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      // A negative index wraps when widened to size_t and could alias a valid offset, so reject it explicitly.
+      out_bound |= indices_i < 0 || indices_i >= work_shape[k];
+      write_index += indices_i * indices_stride[k];
+    }
+    out_bound |= write_index >= output_size;
+    if (!out_bound) {
+      MsAtomicAdd(&output[write_index], update[read_index]);
+    }
+  }
+}
+
+// Launches ScatterNdKernel on stream. NOTE(review): grid is sized by output_size, kernel loop by input_size.
+template <typename T, typename S>
+void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
+               S *work_shape, cudaStream_t stream) {
+  ScatterNdKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
+    work_shape);
+}
+
+template CUDA_LIB_EXPORT void ScatterNd<double, int>(int *indices, double *update, double *output,
+                                                     const size_t &block_size, const size_t &input_size,
+                                                     const size_t &output_size, const size_t &indices_dim_0,
+                                                     const size_t &indices_dim_1, int *indices_stride, int *work_shape,
+                                                     cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<double, int64_t>(int64_t *indices, double *update, double *output,
+                                                         const size_t &block_size, const size_t &input_size,
+                                                         const size_t &output_size, const size_t &indices_dim_0,
+                                                         const size_t &indices_dim_1, int64_t *indices_stride,
+                                                         int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<float, int>(int *indices, float *update, float *output,
+                                                    const size_t &block_size, const size_t &input_size,
+                                                    const size_t &output_size, const size_t &indices_dim_0,
+                                                    const size_t &indices_dim_1, int *indices_stride, int *work_shape,
+                                                    cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<float, int64_t>(int64_t *indices, float *update, float *output,
+                                                        const size_t &block_size, const size_t &input_size,
+                                                        const size_t &output_size, const size_t &indices_dim_0,
+                                                        const size_t &indices_dim_1, int64_t *indices_stride,
+                                                        int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<half, int>(int *indices, half *update, half *output, const size_t &block_size,
+                                                   const size_t &input_size, const size_t &output_size,
+                                                   const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                   int *indices_stride, int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<half, int64_t>(int64_t *indices, half *update, half *output,
+                                                       const size_t &block_size, const size_t &input_size,
+                                                       const size_t &output_size, const size_t &indices_dim_0,
+                                                       const size_t &indices_dim_1, int64_t *indices_stride,
+                                                       int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<int, int>(int *indices, int *update, int *output, const size_t &block_size,
+                                                  const size_t &input_size, const size_t &output_size,
+                                                  const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                  int *indices_stride, int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<int, int64_t>(int64_t *indices, int *update, int *output,
+                                                      const size_t &block_size, const size_t &input_size,
+                                                      const size_t &output_size, const size_t &indices_dim_0,
+                                                      const size_t &indices_dim_1, int64_t *indices_stride,
+                                                      int64_t *work_shape, cudaStream_t stream);
+// NOLINTNEXTLINE
+template CUDA_LIB_EXPORT void ScatterNd<short, int>(int *indices, short *update, short *output,
+                                                    const size_t &block_size, const size_t &input_size,
+                                                    const size_t &output_size, const size_t &indices_dim_0,
+                                                    const size_t &indices_dim_1, int *indices_stride, int *work_shape,
+                                                    cudaStream_t stream);
+// NOLINTNEXTLINE
+template CUDA_LIB_EXPORT void ScatterNd<short, int64_t>(int64_t *indices, short *update, short *output,
+                                                        const size_t &block_size, const size_t &input_size,
+                                                        const size_t &output_size, const size_t &indices_dim_0,
+                                                        const size_t &indices_dim_1, int64_t *indices_stride,
+                                                        int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<unsigned char, int>(int *indices, unsigned char *update, unsigned char *output,
+                                                            const size_t &block_size, const size_t &input_size,
+                                                            const size_t &output_size, const size_t &indices_dim_0,
+                                                            const size_t &indices_dim_1, int *indices_stride,
+                                                            int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScatterNd<unsigned char, int64_t>(int64_t *indices, unsigned char *update,
+                                                                unsigned char *output, const size_t &block_size,
+                                                                const size_t &input_size, const size_t &output_size,
+                                                                const size_t &indices_dim_0,
+                                                                const size_t &indices_dim_1, int64_t *indices_stride,
+                                                                int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh
new file mode 100644
index 00000000000..5c159e8aecb
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Declaration of ScatterNd (T: value type, S: index type); scatter_nd.cu exports the explicit instantiations.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                               S *indices_stride, S *work_shape, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu
new file mode 100644
index 00000000000..4363c04d5ed
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu
@@ -0,0 +1,209 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh"
+
+// Copies each updates element to the input slot its indices row addresses; negative indices are skipped.
+template <typename T, typename S>
+__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                                const S *out_strides, const S *indices, const T *updates, T *input) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t (not int) so read_index / unit_size cannot be truncated on very large updates.
+    const size_t i = read_index / unit_size;
+    const size_t j = read_index % unit_size;
+    // Widen each factor to size_t before multiplying so a 32-bit S cannot overflow in index * stride.
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;
+      write_index += static_cast<size_t>(indices_i) * static_cast<size_t>(out_strides[k]) * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      input[write_index] = updates[read_index];
+    }
+  }
+}
+
+// Adds each updates element (via MsAtomicAdd) to the input slot its indices row addresses; negative indices skipped.
+template <typename T, typename S>
+__global__ void ScatterNdAdd(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                             const S *out_strides, const S *indices, const T *updates, T *input) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t (not int) so read_index / unit_size cannot be truncated on very large updates.
+    const size_t i = read_index / unit_size;
+    const size_t j = read_index % unit_size;
+    // Widen each factor to size_t before multiplying so a 32-bit S cannot overflow in index * stride.
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;
+      write_index += static_cast<size_t>(indices_i) * static_cast<size_t>(out_strides[k]) * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      MsAtomicAdd(&input[write_index], updates[read_index]);
+    }
+  }
+}
+
+// Subtracts each updates element (MsAtomicAdd of its negation) from the input slot its indices row addresses.
+template <typename T, typename S>
+__global__ void ScatterNdSub(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                             const S *out_strides, const S *indices, const T *updates, T *input) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t (not int) so read_index / unit_size cannot be truncated on very large updates.
+    const size_t i = read_index / unit_size;
+    const size_t j = read_index % unit_size;
+    // Widen each factor to size_t before multiplying so a 32-bit S cannot overflow in index * stride.
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;  // any negative component suppresses the write below
+      write_index += static_cast<size_t>(indices_i) * static_cast<size_t>(out_strides[k]) * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      MsAtomicAdd(&input[write_index], -updates[read_index]);
+    }
+  }
+}
+
+// Host-side dispatcher: launches the scatter kernel selected by func_type on cuda_stream.
+template <typename T, typename S>
+void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units,
+                         const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input,
+                         cudaStream_t cuda_stream) {
+  const size_t updates_size = unit_size * num_units;
+  if (func_type == SCATTER_ND_FUNC_UPDATE) {
+    ScatterNdUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+      unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+  } else if (func_type == SCATTER_ND_FUNC_ADD) {
+    ScatterNdAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+      unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+  } else if (func_type == SCATTER_ND_FUNC_SUB) {
+    ScatterNdSub<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+      unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+  }
+  // Any other func_type value falls through without launching a kernel.
+}
+
+// Explicit CalScatterNdFunctor instantiations: each value type T is paired with int64_t and int32_t index types S.
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<double, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                   const size_t &unit_size, const size_t &num_units,
+                                                                   const size_t &index_depth,
+                                                                   const int64_t *out_strides, const int64_t *indices,
+                                                                   const double *updates, double *input,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<double, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                   const size_t &unit_size, const size_t &num_units,
+                                                                   const size_t &index_depth,
+                                                                   const int32_t *out_strides, const int32_t *indices,
+                                                                   const double *updates, double *input,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<float, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                  const size_t &unit_size, const size_t &num_units,
+                                                                  const size_t &index_depth,
+                                                                  const int64_t *out_strides, const int64_t *indices,
+                                                                  const float *updates, float *input,
+                                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<float, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                  const size_t &unit_size, const size_t &num_units,
+                                                                  const size_t &index_depth,
+                                                                  const int32_t *out_strides, const int32_t *indices,
+                                                                  const float *updates, float *input,
+                                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<half, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                 const size_t &unit_size, const size_t &num_units,
+                                                                 const size_t &index_depth, const int64_t *out_strides,
+                                                                 const int64_t *indices, const half *updates,
+                                                                 half *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<half, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                 const size_t &unit_size, const size_t &num_units,
+                                                                 const size_t &index_depth, const int32_t *out_strides,
+                                                                 const int32_t *indices, const half *updates,
+                                                                 half *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int32_t, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int64_t *out_strides, const int64_t *indices,
+                                                                    const int32_t *updates, int32_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int32_t, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int32_t *out_strides, const int32_t *indices,
+                                                                    const int32_t *updates, int32_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int16_t, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int64_t *out_strides, const int64_t *indices,
+                                                                    const int16_t *updates, int16_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int16_t, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int32_t *out_strides, const int32_t *indices,
+                                                                    const int16_t *updates, int16_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<uint8_t, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int64_t *out_strides, const int64_t *indices,
+                                                                    const uint8_t *updates, uint8_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<uint8_t, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                    const size_t &unit_size, const size_t &num_units,
+                                                                    const size_t &index_depth,
+                                                                    const int32_t *out_strides, const int32_t *indices,
+                                                                    const uint8_t *updates, uint8_t *input,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int8_t, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                   const size_t &unit_size, const size_t &num_units,
+                                                                   const size_t &index_depth,
+                                                                   const int64_t *out_strides, const int64_t *indices,
+                                                                   const int8_t *updates, int8_t *input,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<int8_t, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                   const size_t &unit_size, const size_t &num_units,
+                                                                   const size_t &index_depth,
+                                                                   const int32_t *out_strides, const int32_t *indices,
+                                                                   const int8_t *updates, int8_t *input,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<bool, int64_t>(enum ScatterNdFunctorType func_type,
+                                                                 const size_t &unit_size, const size_t &num_units,
+                                                                 const size_t &index_depth, const int64_t *out_strides,
+                                                                 const int64_t *indices, const bool *updates,
+                                                                 bool *input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalScatterNdFunctor<bool, int32_t>(enum ScatterNdFunctorType func_type,
+                                                                 const size_t &unit_size, const size_t &num_units,
+                                                                 const size_t &index_depth, const int32_t *out_strides,
+                                                                 const int32_t *indices, const bool *updates,
+                                                                 bool *input, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh
index faf08587d49..c2c380597d9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 enum ScatterNdFunctorType {
   SCATTER_ND_FUNC_UPDATE = 0,
@@ -27,8 +26,8 @@ enum ScatterNdFunctorType {
 };
 
 template <typename T, typename S>
-void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units,
-                         const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input,
-                         cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                         const size_t &num_units, const size_t &index_depth, const S *out_strides,
+                                         const S *indices, const T *updates, T *input, cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu
new file mode 100644
index 00000000000..a7619b7fab0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <include/cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh"
+#include "include/cuda_fp16.h"
+// Element-wise select: output[i] is input_x[i] where cond[i] is true and input_y[i] otherwise, for i in [0, size).
+template <typename T>
+__global__ void Select(const size_t size, const bool *cond, const T *input_x, const T *input_y, T *output) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    const T *chosen = cond[idx] ? input_x : input_y;
+    output[idx] = chosen[idx];
+  }
+}
+
+// Host launcher: enqueues the Select kernel over `size` elements on cuda_stream.
+template <typename T>
+void CalSelect(const size_t size, const bool *cond, const T *input_x, const T *input_y, T *output,
+               cudaStream_t cuda_stream) {
+  Select<T><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, cond, input_x, input_y, output);
+}
+// Explicit CalSelect instantiations for double, float, int, half, int64_t and bool.
+template CUDA_LIB_EXPORT void CalSelect<double>(const size_t size, const bool *cond, const double *input_x,
+                                                const double *input_y, double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSelect<float>(const size_t size, const bool *cond, const float *input_x,
+                                               const float *input_y, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSelect<int>(const size_t size, const bool *cond, const int *input_x,
+                                             const int *input_y, int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSelect<half>(const size_t size, const bool *cond, const half *input_x,
+                                              const half *input_y, half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSelect<int64_t>(const size_t size, const bool *cond, const int64_t *input_x,
+                                                 const int64_t *input_y, int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSelect<bool>(const size_t size, const bool *cond, const bool *input_x,
+                                              const bool *input_y, bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh
index 3ee876061b3..f465fa0f4ea 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output,
-               cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_
+CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output,
+                               cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu
similarity index 81%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu
index 6677c2676d2..f5540d04755 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh"
 
 template <typename T>
 __global__ void SGDKernel(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *grad,
@@ -52,6 +52,6 @@ void SGD(const int size, const T dampening, const T weight_decay, const bool nes
                                                                lr, param, accum, stat);
 }
 
-template void SGD(const int size, const float dampening, const float weight_decay, const bool nesterov, const float *lr,
-                  const float *momentum, const float *grad, float *param, float *accum, float *stat,
-                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SGD(const int size, const float dampening, const float weight_decay, const bool nesterov,
+                                  const float *lr, const float *momentum, const float *grad, float *param, float *accum,
+                                  float *stat, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh
index 487f88c128c..cc091404674 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr, const T *momentum,
-         const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
+CUDA_LIB_EXPORT void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr,
+                         const T *momentum, const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu
index 1119bb138b3..39623c84f76 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T, typename S>
 __global__ void SigmoidCrossEntropyWithLogitsGradKernel(const size_t size, const T *logits, const S *labels,
@@ -49,14 +50,23 @@ void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const
                                                                                              dout_addr, outputs);
 }
 
-template void SigmoidCrossEntropyWithLogitsGrad<half, half>(const size_t size, const half *logits,
-                                                              const half *labels, const half *dout_addr,
-                                                              half *outputs, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad<half, half>(const size_t size,
+                                                                            const half *logits,
+                                                                            const half *labels,
+                                                                            const half *dout_addr,
+                                                                            half *outputs,
+                                                                            cudaStream_t cuda_stream);
 
-template void SigmoidCrossEntropyWithLogitsGrad<float, float>(const size_t size, const float *logits,
-                                                              const float *labels, const float *dout_addr,
-                                                              float *outputs, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad<float, float>(const size_t size,
+                                                                              const float *logits,
+                                                                              const float *labels,
+                                                                              const float *dout_addr,
+                                                                              float *outputs,
+                                                                              cudaStream_t cuda_stream);
 
-template void SigmoidCrossEntropyWithLogitsGrad<double, double>(const size_t size, const double *logits,
-                                                                const double *labels, const double *dout_addr,
-                                                                double *outputs, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad<double, double>(const size_t size,
+                                                                                const double *logits,
+                                                                                const double *labels,
+                                                                                const double *dout_addr,
+                                                                                double *outputs,
+                                                                                cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh
new file mode 100644
index 00000000000..cdcff8a755a
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: logits, dout_addr and outputs element type; S: labels element type
+CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels,
+                                                       const T *dout_addr, T *outputs, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu
index 73b16c6a123..ec99b655623 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T, typename S>
 __global__ void SigmoidCrossEntropyWithLogitsKernel(const size_t size, const T *logits, const S *labels, T *outputs) {
@@ -41,10 +42,12 @@ void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *
   SigmoidCrossEntropyWithLogitsKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, logits, labels, outputs);
 }
 
-template void SigmoidCrossEntropyWithLogits<half, half>(const size_t size, const half *logits, const half *labels,
-                                                          half *outputs, cudaStream_t cuda_stream);
-template void SigmoidCrossEntropyWithLogits<float, float>(const size_t size, const float *logits, const float *labels,
-                                                          float *outputs, cudaStream_t cuda_stream);
-template void SigmoidCrossEntropyWithLogits<double, double>(const size_t size, const double *logits,
-                                                            const double *labels, double *outputs,
-                                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits<half, half>(const size_t size, const half *logits,
+                                                                        const half *labels, half *outputs,
+                                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits<float, float>(const size_t size, const float *logits,
+                                                                          const float *labels, float *outputs,
+                                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits<double, double>(const size_t size, const double *logits,
+                                                                            const double *labels, double *outputs,
+                                                                            cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh
index dbae8bc4b59..f98055cd75e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S>
-void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels, const T *dout_addr,
-                                       T *outputs, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs,
+                                                   cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu
new file mode 100644
index 00000000000..9844c5753cc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu
@@ -0,0 +1,133 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <algorithm>
+#include <numeric>
+#include <functional>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh"
+#include "include/cuda_fp16.h"
+
+namespace {
+constexpr size_t kMaxDim = 8;
+}
+
+template <typename T, size_t N>
+class VectorWrapper {  // Fixed-size array of N elements of T that is filled from a std::vector.
+ public:
+  explicit VectorWrapper(const std::vector<T> &v) { std::copy(v.begin(), v.end(), data); }  // v.size() is unchecked
+  ~VectorWrapper() {}
+  __device__ T& operator[](size_t index) { return data[index]; }  // device-only accessor, index is not range-checked
+
+ private:
+  T data[N];  // entries past v.size() are not written by the constructor
+};
+
+template <typename T>
+__global__ void CopySlicesKernel(VectorWrapper<int64_t, kMaxDim> begins, VectorWrapper<int64_t, kMaxDim> stride,
+                                 VectorWrapper<size_t, kMaxDim> u, VectorWrapper<size_t, kMaxDim> u_offset,
+                                 VectorWrapper<size_t, kMaxDim> o_offset, const T *update_addr, T *output_addr) {
+  size_t update_num = u[0] * u_offset[0];
+  // Copy update_addr[pos] into output_addr for every pos < update_num; i..p are the 8 coordinates of pos within u.
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < update_num; pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (u_offset[0]) % u[0];
+    size_t j = pos / (u_offset[1]) % u[1];
+    size_t k = pos / (u_offset[2]) % u[2];
+    size_t l = pos / (u_offset[3]) % u[3];
+    size_t m = pos / (u_offset[4]) % u[4];
+    size_t n = pos / (u_offset[5]) % u[5];
+    size_t o = pos / (u[7]) % u[6];
+    size_t p = pos % u[7];
+    // Per dimension the output coordinate is coord * stride + begins, weighted by o_offset (last dimension: 1).
+    size_t output_idx = (i * stride[0] + begins[0]) * o_offset[0] + (j * stride[1] + begins[1]) * o_offset[1] +
+                        (k * stride[2] + begins[2]) * o_offset[2] + (l * stride[3] + begins[3]) * o_offset[3] +
+                        (m * stride[4] + begins[4]) * o_offset[4] + (n * stride[5] + begins[5]) * o_offset[5] +
+                        (o * stride[6] + begins[6]) * o_offset[6] + (p * stride[7] + begins[7]);
+    output_addr[output_idx] = update_addr[pos];
+  }
+}
+
+// Returns kMaxDim offsets for `shape`: the last is 1 and offset[d - 1] = offset[d] * shape[d] for every d >= 1.
+std::vector<size_t> CalculateOffset(const std::vector<size_t> &shape) {
+  std::vector<size_t> strides(kMaxDim);
+  size_t running = 1;
+  for (size_t dim = kMaxDim; dim-- > 0;) {
+    strides[dim] = running;
+    if (dim > 0) {
+      running *= shape[dim];  // shape[0] is never read
+    }
+  }
+  return strides;
+}
+
+template <typename T>
+void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape, const T *update, T *output,
+                cudaStream_t cuda_stream) {
+  // Element count of `update`. The size_t seed makes std::accumulate carry the product in size_t instead of int.
+  size_t size = std::accumulate(update_shape.begin(), update_shape.end(), size_t{1}, std::multiplies<size_t>());
+  // Offset tables and the begin/stride/shape vectors are packed into VectorWrapper objects passed to the kernel by value.
+  VectorWrapper<size_t, kMaxDim> o_offset(CalculateOffset(output_shape));
+  VectorWrapper<size_t, kMaxDim> u_offset(CalculateOffset(update_shape));
+  VectorWrapper<int64_t, kMaxDim> begins(begin);
+  VectorWrapper<int64_t, kMaxDim> strides(stride);
+  VectorWrapper<size_t, kMaxDim> update_shapes(update_shape);
+  // Enqueue the copy on cuda_stream; this function neither waits for the kernel nor checks for launch errors.
+  CopySlicesKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(begins, strides, update_shapes, u_offset,
+                                                                      o_offset, update, output);
+}
+
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const bool *update, bool *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const double *update, double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const float *update, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const half *update, half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const int64_t *update, int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const int *update, int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const short *update, short *output, cudaStream_t cuda_stream);  // NOLINT
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const int8_t *update, int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const uint64_t *update, uint64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const uint32_t *update, uint32_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const uint16_t *update, uint16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const unsigned char *update, unsigned char *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                         const char *update, char *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh
index f3c2b1725eb..c72538d9030 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_
 #include <cuda_runtime.h>
 #include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
-void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                  const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape, const T *update,
-                  T *output, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_
+CUDA_LIB_EXPORT void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
+                                const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
+                                const T *update, T *output, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu
new file mode 100644
index 00000000000..54a46ed30c1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu
@@ -0,0 +1,670 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <algorithm>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
+#include "include/cuda_fp16.h"
+
+template <typename T>  // output[n] = input[s1 + n] for every n in [0, l1); d1 is accepted but never read.
+__global__ void Slice1D(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output) {
+  for (size_t n = blockIdx.x * blockDim.x + threadIdx.x; n < l1; n += blockDim.x * gridDim.x) {
+    output[n] = input[s1 + n];
+  }
+}
+
+template <typename T>
+__global__ void Slice2D(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
+                        const size_t d2, const T *input, T *output) {
+  // Gathers the l1 x l2 window whose first element is (s1, s2) from a row-major input with d2 columns.
+  for (size_t dst = blockIdx.x * blockDim.x + threadIdx.x; dst < l1 * l2; dst += blockDim.x * gridDim.x) {
+    const size_t row = dst / l2 % l1;
+    const size_t col = dst % l2;
+    const size_t src = (row + s1) * d2 + (col + s2);
+    output[dst] = input[src];
+  }
+}
+
+template <typename T>
+__global__ void Slice3D(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
+                        const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input, T *output) {
+  // Gathers the l1 x l2 x l3 window starting at (s1, s2, s3) from a row-major input with trailing extents d2, d3.
+  for (size_t dst = blockIdx.x * blockDim.x + threadIdx.x; dst < l1 * l2 * l3; dst += blockDim.x * gridDim.x) {
+    const size_t a = dst / (l2 * l3) % l1;
+    const size_t b = dst / l3 % l2;
+    const size_t c = dst % l3;
+    const size_t src = (a + s1) * (d2 * d3) + (b + s2) * d3 + (c + s3);
+    output[dst] = input[src];
+  }
+}
+
+template <typename T>  // Copies the l1 x l2 x l3 x l4 window of input that starts at (s1, s2, s3, s4) into output.
+__global__ void Slice4D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
+                        const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
+                        const size_t d3, const size_t d4, const T *input, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4; pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (l2 * l3 * l4) % l1;
+    size_t j = pos / (l3 * l4) % l2;
+    size_t k = pos / l4 % l3;
+    size_t o = pos % l4;
+    // Row-major input index over extents d2, d3, d4 with each coordinate shifted by its start; d1 is never read.
+    size_t offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
+    output[pos] = input[offset];
+  }
+}
+
+template <typename T>  // Copies the l1 x ... x l5 window of input that starts at (s1, ..., s5) into output.
+__global__ void Slice5D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                        const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
+                        const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
+                        const T *input, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5;
+       pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (l2 * l3 * l4 * l5) % l1;
+    size_t j = pos / (l3 * l4 * l5) % l2;
+    size_t k = pos / (l4 * l5) % l3;
+    size_t o = pos / l5 % l4;
+    size_t q = pos % l5;
+    // Row-major input index over extents d2..d5 with each coordinate shifted by its start; d1 is never read.
+    size_t offset =
+      (i + s1) * (d2 * d3 * d4 * d5) + (j + s2) * (d3 * d4 * d5) + (k + s3) * (d4 * d5) + (o + s4) * d5 + (q + s5);
+    output[pos] = input[offset];
+  }
+}
+
+template <typename T>  // Copies the l1 x ... x l6 window of input that starts at (s1, ..., s6) into output.
+__global__ void Slice6D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                        const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
+                        const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
+                        const size_t d4, const size_t d5, const size_t d6, const T *input, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6;
+       pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (l2 * l3 * l4 * l5 * l6) % l1;
+    size_t j = pos / (l3 * l4 * l5 * l6) % l2;
+    size_t k = pos / (l4 * l5 * l6) % l3;
+    size_t o = pos / (l5 * l6) % l4;
+    size_t q = pos / l6 % l5;
+    size_t r = pos % l6;
+    // Row-major input index over extents d2..d6 with each coordinate shifted by its start; d1 is never read.
+    size_t offset =
+      (i + s1) * (d2 * d3 * d4 * d5 * d6) + (j + s2) * (d3 * d4 * d5 * d6) + (k + s3) * (d4 * d5 * d6) + (o + s4) *
+      (d5 * d6) + (q + s5) * d6 + (r + s6);
+    output[pos] = input[offset];
+  }
+}
+
+template <typename T>  // Copies the l1 x ... x l7 window of input that starts at (s1, ..., s7) into output.
+__global__ void Slice7D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                        const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
+                        const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
+                        const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
+                        const size_t d7, const T *input, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6 * l7;
+       pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (l2 * l3 * l4 * l5 * l6 * l7) % l1;
+    size_t j = pos / (l3 * l4 * l5 * l6 * l7) % l2;
+    size_t k = pos / (l4 * l5 * l6 * l7) % l3;
+    size_t o = pos / (l5 * l6 * l7) % l4;
+    size_t q = pos / (l6 * l7) % l5;
+    size_t r = pos / l7 % l6;
+    size_t s = pos % l7;
+    // Row-major input index over extents d2..d7 with each coordinate shifted by its start; d1 is never read.
+    size_t offset =
+      (i + s1) * (d2 * d3 * d4 * d5 * d6 * d7) + (j + s2) * (d3 * d4 * d5 * d6 * d7) + (k + s3) * (d4 * d5 * d6 * d7)+
+      (o + s4) * (d5 * d6 * d7) + (q + s5) * (d6 * d7) + (r + s6) * d7 + (s + s7);
+    output[pos] = input[offset];
+  }
+}
+
+template <typename T>  // Writes dy (l1 x l2 x l3 x l4 elements) into the window of dx that starts at (s1, s2, s3, s4).
+__global__ void Slice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
+                        const size_t l1, const size_t l2, const size_t l3, const size_t l4,
+                        const size_t d1, const size_t d2, const size_t d3, const size_t d4,
+                        const T *dy, T *dx) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (l2 * l3 * l4) % l1;
+    size_t j = pos / (l3 * l4) % l2;
+    size_t k = pos / l4 % l3;
+    size_t o = pos % l4;
+    size_t input_idx = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
+    dx[input_idx] = dy[pos];  // dy is read linearly; dx elements outside the window are not written here
+  }
+}
+
+template <typename T>
+__global__ void FillArray(T *addr, const size_t len, const float value) {
+  // Stores static_cast<T>(value) into each of addr[0] .. addr[len - 1].
+  const T fill = static_cast<T>(value);
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += blockDim.x * gridDim.x) {
+    addr[idx] = fill;
+  }
+}
+template <typename T>
+void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream) {
+  // Enqueues FillArray on cuda_stream; nothing here waits for the kernel or checks for launch errors.
+  FillArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(addr, input_size, value);
+}
+template <typename T>  // Enqueues Slice1D on `stream`, with the grid sized from the l1 output elements.
+void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t stream) {
+  Slice1D<<<GET_BLOCKS(l1), GET_THREADS, 0, stream>>>(s1, l1, d1, input, output);
+}
+template <typename T>  // Enqueues Slice2D on `stream`, with the grid sized from the l1 * l2 output elements.
+void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2,
+                   const T *input, T *output, cudaStream_t stream) {
+  Slice2D<<<GET_BLOCKS(l1 * l2), GET_THREADS, 0, stream>>>(s1, s2, l1, l2, d1, d2, input, output);
+}
+template <typename T>  // Enqueues Slice3D on `stream`, with the grid sized from the l1 * l2 * l3 output elements.
+void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3,
+                   const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream) {
+  Slice3D<<<GET_BLOCKS(l1 * l2 * l3), GET_THREADS, 0, stream>>>(s1, s2, s3, l1, l2, l3, d1, d2, d3, input, output);
+}
+template <typename T>  // Enqueues Slice4D on `stream`, with the grid sized from the l1 * l2 * l3 * l4 output elements.
+void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2,
+                   const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
+                   const T *input, T *output, cudaStream_t stream) {
+  Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
+                                                                     input, output);
+}
+template <typename T>  // Enqueues Slice5D on `stream`, with the grid sized from the l1 * ... * l5 output elements.
+void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1,
+                   const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2,
+                   const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream) {
+  Slice5D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, l1, l2, l3, l4, l5, d1,
+                                                                          d2, d3, d4, d5, input, output);
+}
+template <typename T>  // Enqueues Slice6D on `stream`, with the grid sized from the l1 * ... * l6 output elements.
+void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
+                   const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t l6,
+                   const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
+                   const T *input, T *output, cudaStream_t stream) {
+  Slice6D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, l1, l2, l3, l4,
+                                                                               l5, l6, d1, d2, d3, d4, d5, d6, input,
+                                                                               output);
+}
+template <typename T>  // Enqueues Slice7D on `stream`, with the grid sized from the l1 * ... * l7 output elements.
+void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
+                   const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
+                   const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
+                   const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream) {
+  Slice7D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6 * l7), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, s7, l1, l2,
+                                                                                    l3, l4, l5, l6, l7, d1, d2, d3, d4,
+                                                                                    d5, d6, d7, input, output);
+}
+template <typename T>  // Enqueues Slice4DGrad on `stream`, with the grid sized from the l1 * l2 * l3 * l4 dy elements.
+void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
+                   const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
+                   const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream) {
+  Slice4DGrad<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
+                                                                     dy, dx);
+}
+
+template <typename T>  // Gathers o0 * ... * o6 elements of input_addr (extents i0..i6) into output_addr.
+__global__ void StridedSliceKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3, const size_t b4,
+                                   const size_t b5, const size_t b6, const size_t s0, const size_t s1, const size_t s2,
+                                   const size_t s3, const size_t s4, const size_t s5, const size_t s6, const size_t i0,
+                                   const size_t i1, const size_t i2, const size_t i3, const size_t i4, const size_t i5,
+                                   const size_t i6, const size_t o0, const size_t o1, const size_t o2, const size_t o3,
+                                   const size_t o4, const size_t o5, const size_t o6, const T *input_addr,
+                                   T *output_addr) {
+  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
+    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
+    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
+    size_t l = pos / (o4 * o5 * o6) % o3;
+    size_t m = pos / (o5 * o6) % o4;
+    size_t n = pos / (o6) % o5;
+    size_t o = pos % o6;
+    // Input coordinate per dimension is coord * s + b, flattened row-major over i1..i6 (i0 is never read).
+    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
+                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
+                       (n * s5 + b5) * i6 + (o * s6 + b6);
+    output_addr[pos] = input_addr[input_idx];
+  }
+}
+
+template <typename T>  // Enqueues StridedSliceKernel on cuda_stream; reads entries 0..6 of every vector argument.
+void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                  const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape, const T *input,
+                  T *output, cudaStream_t cuda_stream) {
+  size_t size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3] * output_shape[4] *
+                output_shape[5] * output_shape[6];
+  StridedSliceKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
+    strides[3], strides[4], strides[5], strides[6], input_shape[0], input_shape[1], input_shape[2], input_shape[3],
+    input_shape[4], input_shape[5], input_shape[6], output_shape[0], output_shape[1], output_shape[2], output_shape[3],
+    output_shape[4], output_shape[5], output_shape[6], input, output);
+}
+
+// Writes every element of dy (extents o0..o6) into dx (extents i0..i6) at coordinate c * s + b per dimension,
+// where c is that element's coordinate in dy. dx entries that no dy element maps to are not written here.
+template <typename T>
+__global__ void StridedSliceGradKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3,
+                                       const size_t b4, const size_t b5, const size_t b6, const size_t s0,
+                                       const size_t s1, const size_t s2, const size_t s3, const size_t s4,
+                                       const size_t s5, const size_t s6, const size_t i0, const size_t i1,
+                                       const size_t i2, const size_t i3, const size_t i4, const size_t i5,
+                                       const size_t i6, const size_t o0, const size_t o1, const size_t o2,
+                                       const size_t o3, const size_t o4, const size_t o5, const size_t o6, const T *dy,
+                                       T *dx) {
+  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
+    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
+    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
+    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
+    size_t l = pos / (o4 * o5 * o6) % o3;
+    size_t m = pos / (o5 * o6) % o4;
+    size_t n = pos / (o6) % o5;
+    size_t o = pos % o6;
+    // Row-major flat index into dx over extents i1..i6; the leading extent i0 is not needed for the index.
+    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
+                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
+                       (n * s5 + b5) * i6 + (o * s6 + b6);
+    dx[input_idx] = dy[pos];
+  }
+}
+
+template <typename T>  // Enqueues StridedSliceGradKernel on cuda_stream; reads entries 0..6 of every vector argument.
+void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                      const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const T *dy, T *dx,
+                      cudaStream_t cuda_stream) {
+  size_t size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3] * dy_shape[4] * dy_shape[5] * dy_shape[6];
+  StridedSliceGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
+    strides[3], strides[4], strides[5], strides[6], dx_shape[0], dx_shape[1], dx_shape[2], dx_shape[3], dx_shape[4],
+    dx_shape[5], dx_shape[6], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy_shape[4], dy_shape[5], dy_shape[6],
+    dy, dx);
+}
+
+// Explicit instantiations of Slice1DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_1D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, \
+                                              const DTYPE *input, DTYPE *output, cudaStream_t stream)
+INSTANTIATE_SLICE_1D_KERNEL(double);
+INSTANTIATE_SLICE_1D_KERNEL(float);
+INSTANTIATE_SLICE_1D_KERNEL(half);
+INSTANTIATE_SLICE_1D_KERNEL(int);
+INSTANTIATE_SLICE_1D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_1D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_1D_KERNEL(int64_t);
+INSTANTIATE_SLICE_1D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_1D_KERNEL
+
+// Explicit instantiations of Slice2DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_2D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, \
+                                              const size_t d1, const size_t d2, const DTYPE *input, \
+                                              DTYPE *output, cudaStream_t stream)
+INSTANTIATE_SLICE_2D_KERNEL(double);
+INSTANTIATE_SLICE_2D_KERNEL(float);
+INSTANTIATE_SLICE_2D_KERNEL(half);
+INSTANTIATE_SLICE_2D_KERNEL(int);
+INSTANTIATE_SLICE_2D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_2D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_2D_KERNEL(int64_t);
+INSTANTIATE_SLICE_2D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_2D_KERNEL
+
+// Explicit instantiations of Slice3DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_3D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, \
+                                              const size_t l2, const size_t l3, const size_t d1, const size_t d2, \
+                                              const size_t d3, const DTYPE *input, DTYPE *output, \
+                                              cudaStream_t stream)
+INSTANTIATE_SLICE_3D_KERNEL(double);
+INSTANTIATE_SLICE_3D_KERNEL(float);
+INSTANTIATE_SLICE_3D_KERNEL(half);
+INSTANTIATE_SLICE_3D_KERNEL(int);
+INSTANTIATE_SLICE_3D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_3D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_3D_KERNEL(int64_t);
+INSTANTIATE_SLICE_3D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_3D_KERNEL
+
+// Explicit instantiations of Slice4DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_4D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, \
+                                              const size_t l1, const size_t l2, const size_t l3, const size_t l4, \
+                                              const size_t d1, const size_t d2, const size_t d3, const size_t d4, \
+                                              const DTYPE *input, DTYPE *output, cudaStream_t stream)
+INSTANTIATE_SLICE_4D_KERNEL(double);
+INSTANTIATE_SLICE_4D_KERNEL(float);
+INSTANTIATE_SLICE_4D_KERNEL(half);
+INSTANTIATE_SLICE_4D_KERNEL(int);
+INSTANTIATE_SLICE_4D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_4D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_4D_KERNEL(int64_t);
+INSTANTIATE_SLICE_4D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_4D_KERNEL
+
+// Explicit instantiations of Slice5DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_5D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, \
+                                              const size_t s5, const size_t l1, const size_t l2, const size_t l3, \
+                                              const size_t l4, const size_t l5, const size_t d1, const size_t d2, \
+                                              const size_t d3, const size_t d4, const size_t d5, \
+                                              const DTYPE *input, DTYPE *output, cudaStream_t stream)
+INSTANTIATE_SLICE_5D_KERNEL(double);
+INSTANTIATE_SLICE_5D_KERNEL(float);
+INSTANTIATE_SLICE_5D_KERNEL(half);
+INSTANTIATE_SLICE_5D_KERNEL(int64_t);
+INSTANTIATE_SLICE_5D_KERNEL(int);
+INSTANTIATE_SLICE_5D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_5D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_5D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_5D_KERNEL
+
+// Explicit instantiations of Slice6DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_6D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, \
+                                              const size_t s5, const size_t s6, const size_t l1, const size_t l2, \
+                                              const size_t l3, const size_t l4, const size_t l5, const size_t l6, \
+                                              const size_t d1, const size_t d2, const size_t d3, const size_t d4, \
+                                              const size_t d5, const size_t d6, const DTYPE *input, \
+                                              DTYPE *output, cudaStream_t stream)
+INSTANTIATE_SLICE_6D_KERNEL(double);
+INSTANTIATE_SLICE_6D_KERNEL(float);
+INSTANTIATE_SLICE_6D_KERNEL(half);
+INSTANTIATE_SLICE_6D_KERNEL(int64_t);
+INSTANTIATE_SLICE_6D_KERNEL(int);
+INSTANTIATE_SLICE_6D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_6D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_6D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_6D_KERNEL
+
+// Explicit instantiations of Slice7DKernel, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The macro only stamps out the declaration; the template argument is still deduced from the pointer types.
+#define INSTANTIATE_SLICE_7D_KERNEL(DTYPE) \
+  template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, \
+                                              const size_t s5, const size_t s6, const size_t s7, const size_t l1, \
+                                              const size_t l2, const size_t l3, const size_t l4, const size_t l5, \
+                                              const size_t l6, const size_t l7, const size_t d1, const size_t d2, \
+                                              const size_t d3, const size_t d4, const size_t d5, const size_t d6, \
+                                              const size_t d7, const DTYPE *input, DTYPE *output, \
+                                              cudaStream_t stream)
+INSTANTIATE_SLICE_7D_KERNEL(double);
+INSTANTIATE_SLICE_7D_KERNEL(float);
+INSTANTIATE_SLICE_7D_KERNEL(half);
+INSTANTIATE_SLICE_7D_KERNEL(int64_t);
+INSTANTIATE_SLICE_7D_KERNEL(int);
+INSTANTIATE_SLICE_7D_KERNEL(short);  // NOLINT
+INSTANTIATE_SLICE_7D_KERNEL(unsigned char);
+INSTANTIATE_SLICE_7D_KERNEL(bool);
+#undef INSTANTIATE_SLICE_7D_KERNEL
+
+// Explicit instantiations of CalSlice4DGrad<T>, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The template argument is spelled out explicitly, as in the hand-written form this replaces.
+#define INSTANTIATE_CAL_SLICE_4D_GRAD(DTYPE) \
+  template CUDA_LIB_EXPORT void CalSlice4DGrad<DTYPE>( \
+    const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2, \
+    const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4, \
+    const DTYPE *dy, DTYPE *dx, cudaStream_t stream)
+INSTANTIATE_CAL_SLICE_4D_GRAD(double);
+INSTANTIATE_CAL_SLICE_4D_GRAD(float);
+INSTANTIATE_CAL_SLICE_4D_GRAD(half);
+INSTANTIATE_CAL_SLICE_4D_GRAD(int);
+INSTANTIATE_CAL_SLICE_4D_GRAD(short);  // NOLINT
+INSTANTIATE_CAL_SLICE_4D_GRAD(unsigned char);
+INSTANTIATE_CAL_SLICE_4D_GRAD(int64_t);
+INSTANTIATE_CAL_SLICE_4D_GRAD(bool);
+#undef INSTANTIATE_CAL_SLICE_4D_GRAD
+
+// Explicit instantiations of FillDeviceArray<T>, marked CUDA_LIB_EXPORT, for each element type listed below.
+// The fill value parameter is a float in every instantiation, independent of the element type.
+#define INSTANTIATE_FILL_DEVICE_ARRAY(DTYPE) \
+  template CUDA_LIB_EXPORT void FillDeviceArray<DTYPE>(const size_t input_size, DTYPE *addr, const float value, \
+                                                       cudaStream_t cuda_stream)
+INSTANTIATE_FILL_DEVICE_ARRAY(bool);
+INSTANTIATE_FILL_DEVICE_ARRAY(int64_t);
+INSTANTIATE_FILL_DEVICE_ARRAY(int);
+INSTANTIATE_FILL_DEVICE_ARRAY(short);  // NOLINT
+INSTANTIATE_FILL_DEVICE_ARRAY(int8_t);
+INSTANTIATE_FILL_DEVICE_ARRAY(uint64_t);
+INSTANTIATE_FILL_DEVICE_ARRAY(uint32_t);
+INSTANTIATE_FILL_DEVICE_ARRAY(uint16_t);
+INSTANTIATE_FILL_DEVICE_ARRAY(unsigned char);
+INSTANTIATE_FILL_DEVICE_ARRAY(half);
+INSTANTIATE_FILL_DEVICE_ARRAY(float);
+INSTANTIATE_FILL_DEVICE_ARRAY(double);
+#undef INSTANTIATE_FILL_DEVICE_ARRAY
+// StridedSlice: explicit instantiations; T is deduced from the input/output pointer types.
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const bool *input, bool *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const double *input, double *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const float *input, float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const half *input, half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const int64_t *input, int64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const int *input, int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const short *input, short *output, cudaStream_t cuda_stream);  // NOLINT
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const int8_t *input, int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const uint64_t *input, uint64_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const uint32_t *input, uint32_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const uint16_t *input, uint16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                           const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream);
+// StridedSliceGrad: explicit instantiations; T is deduced from the dy/dx pointer types.
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const bool *dy, bool *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const double *dy, double *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const float *dy, float *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const half *dy, half *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const int64_t *dy, int64_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const int *dy, int *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const short *dy, short *dx, cudaStream_t cuda_stream);  // NOLINT
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const int8_t *dy, int8_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const uint64_t *dy, uint64_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const uint32_t *dy, uint32_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const uint16_t *dy, uint16_t *dx, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                               const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh
new file mode 100644
index 00000000000..86fd9238d27
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_
+#include <cuda_runtime.h>
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Variadic slice entry point; how the values in `pack` are interpreted is up to the definition (not shown here).
+template <typename T, typename...S>
+CUDA_LIB_EXPORT void SliceKernel(const T *input, T *output, const size_t output_size, cudaStream_t cuda_stream,
+                                 S...pack);
+// Same s*/l*/d* parameter layout as Slice4DKernel, but taking a const dy and a mutable dx instead of input/output.
+template <typename T>
+CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
+                                    const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
+                                    const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream);
+// Slice1D..7D: one s*/l*/d* triple per axis - presumably start, length and input extent (TODO confirm).
+template <typename T>
+CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output,
+                                   cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
+                                   const size_t d2, const T *input, T *output, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
+                                   const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input,
+                                   T *output, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
+                                   const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
+                                   const size_t d3, const size_t d4, const T *input, T *output, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                                   const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
+                                   const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
+                                   const T *input, T *output, cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                                   const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
+                                   const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
+                                   const size_t d4, const size_t d5, const size_t d6, const T *input, T *output,
+                                   cudaStream_t stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
+                                   const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
+                                   const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
+                                   const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
+                                   const size_t d7, const T *input, T *output, cudaStream_t stream);
+// Strided slice of input (input_shape) into output (output_shape); T: bool, half/float/double, integers.
+template <typename T>
+CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
+                                  const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
+                                  const T *input, T *output, cudaStream_t cuda_stream);
+// StridedSliceGrad maps dy (dy_shape) to dx (dx_shape); instantiated for the same element types as StridedSlice.
+template <typename T>
+CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
+                                      const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
+                                      const T *dy, T *dx, cudaStream_t cuda_stream);
+// `value` is a float for every T; presumably written to all input_size elements at addr (TODO confirm).
+template <typename T>
+CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu
similarity index 67%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu
index 028511192c4..f87f1790c11 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "smooth_l1_loss_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void SmoothL1LossKernel(const int input_size, const float beta, const T *prediction, const T *target,
@@ -87,17 +87,20 @@ void SmoothL1LossGrad(const int &input_size, const float &beta, const T *predict
   SmoothL1LossGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, beta, prediction, target,
                                                                              dloss, dx);
 }
-template void SmoothL1Loss<double>(const int &input_size, const float &beta, const double *prediction,
-                                  const double *target, double *loss, cudaStream_t stream);
-template void SmoothL1LossGrad<double>(const int &input_size, const float &beta, const double *prediction,
-                                      const double *target, const double *dloss, double *dx, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1Loss<double>(const int &input_size, const float &beta, const double *prediction,
+                                                   const double *target, double *loss, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1LossGrad<double>(const int &input_size, const float &beta,
+                                                       const double *prediction, const double *target,
+                                                       const double *dloss, double *dx, cudaStream_t stream);
 
-template void SmoothL1Loss<float>(const int &input_size, const float &beta, const float *prediction,
-                                  const float *target, float *loss, cudaStream_t stream);
-template void SmoothL1LossGrad<float>(const int &input_size, const float &beta, const float *prediction,
-                                      const float *target, const float *dloss, float *dx, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1Loss<float>(const int &input_size, const float &beta, const float *prediction,
+                                                  const float *target, float *loss, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1LossGrad<float>(const int &input_size, const float &beta, const float *prediction,
+                                                      const float *target, const float *dloss, float *dx,
+                                                      cudaStream_t stream);
 
-template void SmoothL1Loss<half>(const int &input_size, const float &beta, const half *prediction,
-                                  const half *target, half *loss, cudaStream_t stream);
-template void SmoothL1LossGrad<half>(const int &input_size, const float &beta, const half *prediction,
-                                      const half *target, const half *dloss, half *dx, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1Loss<half>(const int &input_size, const float &beta, const half *prediction,
+                                                 const half *target, half *loss, cudaStream_t stream);
+template CUDA_LIB_EXPORT void SmoothL1LossGrad<half>(const int &input_size, const float &beta, const half *prediction,
+                                                     const half *target, const half *dloss, half *dx,
+                                                     cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh
new file mode 100644
index 00000000000..d943a5a65c6
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void SmoothL1Loss(const int &input_size, const float &beta, const T *prediction, const T *target,
+                                  T *loss, cudaStream_t stream);  // T = double | float | half
+template <typename T>
+CUDA_LIB_EXPORT void SmoothL1LossGrad(const int &input_size, const float &beta, const T *prediction, const T *target,
+                                      const T *dloss, T *dx, cudaStream_t stream);  // T = double | float | half
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu
similarity index 78%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu
index 17a120c95b6..ebc3dab9593 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) {
@@ -71,9 +71,11 @@ void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, ha
   SoftplusGradKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_addr);
 }
 
-template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t cuda_stream);
-template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_stream);
-template void SoftplusGrad(const size_t size, const float *dy_addr, const float *x_addr, float *dx_addr,
-                           cudaStream_t cuda_stream);
-template void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, half *dx_addr,
-                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Softplus(const size_t size, const float *input_addr, float *output_addr,
+                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Softplus(const size_t size, const half *input_addr, half *output_addr,
+                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const float *dy_addr, const float *x_addr, float *dx_addr,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, half *dx_addr,
+                                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh
similarity index 51%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh
index 4b8fad79441..c5bcd49cd20 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template<typename T>
-void Gelu(size_t input_size, T* input_addr, T* output_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void Softplus(const size_t input_size, const T* input_addr, T* output_addr, cudaStream_t cuda_stream);
 
 template<typename T>
-void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const T* dy_addr, const T* x_addr, T* dx_addr,
+                                  cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu
new file mode 100644
index 00000000000..279681a2f2b
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu
@@ -0,0 +1,140 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuda_runtime.h>
+#include "spacetobatch_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Device kernel behind CalSpaceToBatch: copies each input element to its SpaceToBatch slot in output.
+// Threads cover the flat range [0, size) with a grid-stride loop. Every flat index is decoded into
+// (n, c, h, w) from the extents ic, ih and iw. The padded coordinates (h + pad_up, w + pad_lft) are then
+// split by block_num: their remainders pick the output batch, and the batch index is folded back in to
+// obtain the output row and column. The kernel only writes positions that receive an input element;
+// on, pad_dn and pad_rht are part of the signature but are not read here.
+template <typename T>
+__global__ void SpaceToBatch(const size_t size, const T *input, const size_t in,
+                             const size_t ih, const size_t iw, const size_t ic,
+                             const size_t on, const size_t oh, const size_t ow,
+                             const size_t oc, const size_t pad_up, const size_t pad_dn,
+                             const size_t pad_lft, const size_t pad_rht, const size_t block_num,
+                             T *output) {
+  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t pos = first; pos < size; pos += step) {
+    // Peel the flat index apart; each stride is obtained by dividing the previous one.
+    const size_t chw = ic * ih * iw;
+    const size_t n = pos / chw;
+    const size_t hw = chw / ic;
+    const size_t c = (pos % chw) / hw;
+    const size_t w_stride = hw / ih;
+    const size_t h = (pos % hw) / w_stride;
+    const size_t unit = w_stride / iw;
+    const size_t w = (pos % w_stride) / unit;
+
+    // Coordinates inside the padded input plane.
+    const size_t row = h + pad_up;
+    const size_t col = w + pad_lft;
+
+    // Output batch: position inside the block_num x block_num tile, then the source batch.
+    const size_t batch = ((row % block_num) * block_num + (col % block_num)) * in + n;
+
+    // Undo the in-tile offset encoded in `batch`, then scale down by block_num.
+    const size_t out_row = (row - batch / (in * block_num)) / block_num;
+    const size_t out_col = (col - (batch / in) % block_num) / block_num;
+
+    // Flat offset for extents (oc, oh, ow) below the batch dimension.
+    output[((batch * oc + c) * oh + out_row) * ow + out_col] = input[pos];
+  }
+}
+
+// Host launcher: clears all on * oc * oh * ow output elements, then runs the SpaceToBatch kernel over size inputs.
+// Both operations are issued on cuda_stream, so the fill is stream-ordered ahead of the scatter.
+template <typename T>
+void CalSpaceToBatch(const size_t size, const T *input, const size_t in, const size_t ih, const size_t iw,
+                     const size_t ic, const size_t on, const size_t oh, const size_t ow, const size_t oc,
+                     const size_t pad_up, const size_t pad_dn, const size_t pad_lft, const size_t pad_rht,
+                     const size_t block_num, T *output, cudaStream_t cuda_stream) {
+  // Async memset on cuda_stream: plain cudaMemset targets the default stream and may not be ordered with it.
+  cudaMemsetAsync(output, 0, on * oc * oh * ow * sizeof(T), cuda_stream);
+  SpaceToBatch<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    size, input, in, ih, iw, ic, on, oh, ow, oc, pad_up, pad_dn, pad_lft, pad_rht, block_num, output);
+}
+// CalSpaceToBatch: explicit instantiations for float, half, int, int8/16/64_t and uint8/16/32/64_t.
+template CUDA_LIB_EXPORT void CalSpaceToBatch<float>(const size_t size, const float *input, const size_t in,
+                                                     const size_t ih, const size_t iw, const size_t ic,
+                                                     const size_t on, const size_t oh, const size_t ow,
+                                                     const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                     const size_t pad_lft, const size_t pad_rht, const size_t block_num,
+                                                     float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<half>(const size_t size, const half *input, const size_t in,
+                                                    const size_t ih, const size_t iw, const size_t ic,
+                                                    const size_t on, const size_t oh, const size_t ow,
+                                                    const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                    const size_t pad_lft, const size_t pad_rht, const size_t block_num,
+                                                    half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<int>(const size_t size, const int *input, const size_t in,
+                                                   const size_t ih, const size_t iw, const size_t ic,
+                                                   const size_t on, const size_t oh, const size_t ow,
+                                                   const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                   const size_t pad_lft, const size_t pad_rht, const size_t block_num,
+                                                   int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<int64_t>(const size_t size, const int64_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                       const size_t pad_lft, const size_t pad_rht,
+                                                       const size_t block_num, int64_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<int16_t>(const size_t size, const int16_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                       const size_t pad_lft, const size_t pad_rht,
+                                                       const size_t block_num, int16_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<int8_t>(const size_t size, const int8_t *input, const size_t in,
+                                                      const size_t ih, const size_t iw, const size_t ic,
+                                                      const size_t on, const size_t oh, const size_t ow,
+                                                      const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                      const size_t pad_lft, const size_t pad_rht,
+                                                      const size_t block_num, int8_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
+                                                       const size_t ih, const size_t iw, const size_t ic,
+                                                       const size_t on, const size_t oh, const size_t ow,
+                                                       const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                       const size_t pad_lft, const size_t pad_rht,
+                                                       const size_t block_num, uint8_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                        const size_t pad_lft, const size_t pad_rht,
+                                                        const size_t block_num, uint16_t *output,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                        const size_t pad_lft, const size_t pad_rht,
+                                                        const size_t block_num, uint32_t *output,
+                                                        cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToBatch<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
+                                                        const size_t ih, const size_t iw, const size_t ic,
+                                                        const size_t on, const size_t oh, const size_t ow,
+                                                        const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                                        const size_t pad_lft, const size_t pad_rht,
+                                                        const size_t block_num, uint64_t *output,
+                                                        cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh
new file mode 100644
index 00000000000..c9f20719999
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Declaration only; explicit instantiations are provided by the implementation file.
+CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const T *input, const size_t in,
+                                     const size_t ih, const size_t iw, const size_t ic,  // presumably input dims
+                                     const size_t on, const size_t oh, const size_t ow,  // presumably output dims
+                                     const size_t oc, const size_t pad_up, const size_t pad_dn,
+                                     const size_t pad_lft, const size_t pad_rht, const size_t block_num,
+                                     T *output, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu
new file mode 100644
index 00000000000..562e40e7af7
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu
@@ -0,0 +1,138 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include "spacetodepth_impl.cuh"
+#include "include/cuda_fp16.h"
+
+// Rearranges `input` into `output` by folding every r x r spatial tile into the channel axis.
+// Each loop step copies exactly one element; only flat input offsets below `size` are visited.
+// T is only ever copied, never inspected, so the kernel is independent of the element type.
+// `in` and `on` belong to the signature but are not read by this kernel.
+template <typename T>
+__global__ void SpaceToDepth(const size_t size, const T *input, const size_t in,
+                             const size_t ic, const size_t ih, const size_t iw,
+                             const size_t on, const size_t oc, const size_t oh,
+                             const size_t ow, const size_t r, T *output) {
+  // Strides that split a flat input offset into [n][c][h][w] coordinates.
+  const size_t plane_stride = ih * iw;
+  const size_t batch_stride = ic * plane_stride;
+  // Distance between two consecutive elements handled by the same thread.
+  const size_t step = blockDim.x * gridDim.x;
+
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += step) {
+    // Input coordinates of this element.
+    const size_t n = pos / batch_stride;
+    const size_t c = (pos % batch_stride) / plane_stride;
+    const size_t h = (pos % plane_stride) / iw;
+    const size_t w = pos % iw;
+
+    // (h % r, w % r) is the element's slot inside its r x r tile and selects the output
+    // channel group; (h / r, w / r) is the tile's own position and becomes the output row/column.
+    const size_t tile_slot = r * (h % r) + w % r;
+    const size_t out_c = c + tile_slot * ic;
+    const size_t out_h = h / r;
+    const size_t out_w = w / r;
+
+    // Flatten (n, out_c, out_h, out_w) using the output extents oc, oh and ow.
+    size_t out_pos = n;
+    out_pos = out_pos * oc + out_c;
+    out_pos = out_pos * oh + out_h;
+    out_pos = out_pos * ow + out_w;
+
+    // Pure copy: the value is unchanged, only its location differs.
+    output[out_pos] = input[pos];
+  }
+}
+
+// Host launcher: enqueues the SpaceToDepth kernel on cuda_stream and returns without synchronizing.
+template <typename T>
+void CalSpaceToDepth(const size_t size, const T *input, const size_t in, const size_t ic, const size_t ih,
+                     const size_t iw, const size_t on, const size_t oc, const size_t oh, const size_t ow,
+                     const size_t r, T *output, cudaStream_t cuda_stream) {
+  // Launch geometry comes from GET_BLOCKS/GET_THREADS; no dynamic shared memory is requested.
+  const auto grid = GET_BLOCKS(size);
+  const auto block = GET_THREADS;
+  SpaceToDepth<<<grid, block, 0, cuda_stream>>>(size, input, in, ic, ih, iw, on, oc, oh, ow, r, output);
+}
+
+template CUDA_LIB_EXPORT void CalSpaceToDepth<float>(const size_t size, const float *input,  // Explicit instantiation:
+                                                     const size_t in, const size_t ic,  // the template body is in
+                                                     const size_t ih, const size_t iw,  // this .cu file, so callers
+                                                     const size_t on, const size_t oc,  // can only link the element
+                                                     const size_t oh, const size_t ow,  // types instantiated here.
+                                                     const size_t r, float *output,
+                                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<half>(const size_t size, const half *input,
+                                                    const size_t in, const size_t ic,
+                                                    const size_t ih, const size_t iw,
+                                                    const size_t on, const size_t oc,
+                                                    const size_t oh, const size_t ow,
+                                                    const size_t r, half *output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<int>(const size_t size, const int *input,
+                                                   const size_t in, const size_t ic,
+                                                   const size_t ih, const size_t iw,
+                                                   const size_t on, const size_t oc,
+                                                   const size_t oh, const size_t ow,
+                                                   const size_t r, int *output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<int64_t>(const size_t size, const int64_t *input,
+                                                       const size_t in, const size_t ic,
+                                                       const size_t ih, const size_t iw,
+                                                       const size_t on, const size_t oc,
+                                                       const size_t oh, const size_t ow,
+                                                       const size_t r, int64_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<int16_t>(const size_t size, const int16_t *input,
+                                                       const size_t in, const size_t ic,
+                                                       const size_t ih, const size_t iw,
+                                                       const size_t on, const size_t oc,
+                                                       const size_t oh, const size_t ow,
+                                                       const size_t r, int16_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<int8_t>(const size_t size, const int8_t *input,
+                                                      const size_t in, const size_t ic,
+                                                      const size_t ih, const size_t iw,
+                                                      const size_t on, const size_t oc,
+                                                      const size_t oh, const size_t ow,
+                                                      const size_t r, int8_t *output,
+                                                      cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSpaceToDepth<uint8_t>(const size_t size, const uint8_t *input,
+                                                       const size_t in, const size_t ic,
+                                                       const size_t ih, const size_t iw,
+                                                       const size_t on, const size_t oc,
+                                                       const size_t oh, const size_t ow,
+                                                       const size_t r, uint8_t *output,
+                                                       cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalSpaceToDepth<uint16_t>(const size_t size, const uint16_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint16_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalSpaceToDepth<uint32_t>(const size_t size, const uint32_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint32_t *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void
+CalSpaceToDepth<uint64_t>(const size_t size, const uint64_t *input,
+                          const size_t in, const size_t ic, const size_t ih,
+                          const size_t iw, const size_t on, const size_t oc,
+                          const size_t oh, const size_t ow, const size_t r,
+                          uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh
new file mode 100644
index 00000000000..7dfcc853232
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define SPACETODEPTH_BUFFER_DIMENSION 4
+template <typename T>  // Declaration only; the .cu file defines it and instantiates the supported element types.
+CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const T *input, const size_t in,
+                                     const size_t ic, const size_t ih, const size_t iw,  // input extents, dims 1-3
+                                     const size_t on, const size_t oc, const size_t oh,
+                                     const size_t ow, const size_t r, T *output,  // r: edge of the square tile
+                                     cudaStream_t cuda_stream);  // stream the kernel is launched on
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu
similarity index 68%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu
index 41a33427509..01711a947b7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __device__ __forceinline__ bool CompareFunc(T x, T y) {
@@ -90,14 +91,18 @@ void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size,
     variable_out, accumulation_out);
 }
 
-template void CalSparseApplyProximalAdagrad<float>(const size_t size, const size_t indices_size,
-                                                   const float *learning_rate, const float *l1_regularization,
-                                                   const float *l2_regularization, const float *gradient,
-                                                   const int *indices, float *variable, float *accumulation,
-                                                   float *variable_out, float *accumulation_out,
-                                                   cudaStream_t cuda_stream);
-template void CalSparseApplyProximalAdagrad<half>(const size_t size, const size_t indices_size,
-                                                  const half *learning_rate, const half *l1_regularization,
-                                                  const half *l2_regularization, const half *gradient,
-                                                  const int *indices, half *variable, half *accumulation,
-                                                  half *variable_out, half *accumulation_out, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad<float>(const size_t size, const size_t indices_size,
+                                                                   const float *learning_rate,  // T = float
+                                                                   const float *l1_regularization,
+                                                                   const float *l2_regularization,
+                                                                   const float *gradient, const int *indices,
+                                                                   float *variable, float *accumulation,
+                                                                   float *variable_out, float *accumulation_out,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad<half>(const size_t size, const size_t indices_size,
+                                                                  const half *learning_rate,  // T = half
+                                                                  const half *l1_regularization,
+                                                                  const half *l2_regularization, const half *gradient,
+                                                                  const int *indices, half *variable,
+                                                                  half *accumulation, half *variable_out,
+                                                                  half *accumulation_out, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh
new file mode 100644
index 00000000000..c866f966352
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // Declaration only; float and half are instantiated in the matching .cu file.
+CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, const T *learning_rate,
+                                                   const T *l1_regularization, const T *l2_regularization,
+                                                   const T *gradient, const int *indices, T *variable, T *accumulation,
+                                                   T *variable_out, T *accumulation_out, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu
similarity index 72%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu
index b549c5bd4ee..a31c738a318 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu
@@ -16,7 +16,6 @@
 
 #include <stdint.h>
 #include "sparse_cross_entropy_cuda_impl.cuh"
-#include "include/cuda_runtime.h"
 
 template <typename T>
 __global__ void CalCrossEntropyKernel(const float *logits, T *labels, const int batch_size, const int class_num,
@@ -67,11 +66,11 @@ void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, c
   return;
 }
 
-template void CalCrossEntropy<int>(const float *logits, int *labels, const int batch_size, const int class_num,
-                                   float *loss, cudaStream_t cuda_stream);
-template void CalCrossEntropy<uint64_t>(const float *logits, uint64_t *labels, const int batch_size,
-                                        const int class_num, float *loss, cudaStream_t cuda_stream);
-template void CalCrossEntropyGrad<int>(const float *logits, int *labels, const int batch_size, const int class_num,
-                                       float *grad, cudaStream_t cuda_stream);
-template void CalCrossEntropyGrad<uint64_t>(const float *logits, uint64_t *labels, const int batch_size,
-                                            const int class_num, float *grad, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCrossEntropy<int>(const float *logits, int *labels, const int batch_size,
+                                                   const int class_num, float *loss, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCrossEntropy<uint64_t>(const float *logits, uint64_t *labels, const int batch_size,
+                                                        const int class_num, float *loss, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCrossEntropyGrad<int>(const float *logits, int *labels, const int batch_size,
+                                                       const int class_num, float *grad, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalCrossEntropyGrad<uint64_t>(const float *logits, uint64_t *labels, const int batch_size,
+                                                            const int class_num, float *grad, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh
new file mode 100755
index 00000000000..5c7d2ee4065
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T>  // T is the label element type; int and uint64_t are instantiated in the .cu file.
+CUDA_LIB_EXPORT void CalCrossEntropy(const float *logits, T *labels, const int batch_size, const int class_num,
+                                     float *loss, cudaStream_t cuda_stream);
+
+template <typename T>  // T is the label element type; int and uint64_t are instantiated in the .cu file.
+CUDA_LIB_EXPORT void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, const int class_num,
+                                         float *grad, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu
index 19172b48023..78b7b6b4b10 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu
@@ -15,7 +15,6 @@
  */
 
 #include "sparse_ftrl_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
 
 template <typename T>
@@ -89,24 +88,35 @@ void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index
     n_stride, learning_rate, l1_regularization, l2_regularization, learning_rate_power, variable, accumulation, linear);
 }
 
-template void CalSparseApplyFtrl<float, int>(const float *gradient, const int *indices, const int num_index,
-                                             const size_t n_stride, const float learning_rate,
-                                             const float l1_regularization, const float l2_regularization,
-                                             const float learning_rate_power, const bool use_locking, float *variable,
-                                             float *accumulation, float *linear, cudaStream_t cuda_stream);
-template void CalSparseApplyFtrl<float, int64_t>(const float *gradient, const int64_t *indices, const int num_index,
-                                             const size_t n_stride, const float learning_rate,
-                                             const float l1_regularization, const float l2_regularization,
-                                             const float learning_rate_power, const bool use_locking, float *variable,
-                                             float *accumulation, float *linear, cudaStream_t cuda_stream);
-template void CalSparseApplyFtrl<half, int>(const half *gradient, const int *indices, const int num_index,
-                                            const size_t n_stride, const float learning_rate,
-                                            const float l1_regularization, const float l2_regularization,
-                                            const float learning_rate_power, const bool use_locking, half *variable,
-                                            half *accumulation, half *linear, cudaStream_t cuda_stream);
-template void CalSparseApplyFtrl<half, int64_t>(const half *gradient, const int64_t *indices, const int num_index,
-                                            const size_t n_stride, const float learning_rate,
-                                            const float l1_regularization, const float l2_regularization,
-                                            const float learning_rate_power, const bool use_locking, half *variable,
-                                            half *accumulation, half *linear, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyFtrl<float, int>(const float *gradient, const int *indices,
+                                                             const int num_index, const size_t n_stride,
+                                                             const float learning_rate, const float l1_regularization,
+                                                             const float l2_regularization,  // T = float, S = int
+                                                             const float learning_rate_power, const bool use_locking,
+                                                             float *variable, float *accumulation, float *linear,
+                                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyFtrl<float, int64_t>(const float *gradient, const int64_t *indices,
+                                                                 const int num_index, const size_t n_stride,
+                                                                 const float learning_rate,  // T = float, S = int64_t
+                                                                 const float l1_regularization,
+                                                                 const float l2_regularization,
+                                                                 const float learning_rate_power,
+                                                                 const bool use_locking, float *variable,
+                                                                 float *accumulation, float *linear,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyFtrl<half, int>(const half *gradient, const int *indices,
+                                                            const int num_index, const size_t n_stride,
+                                                            const float learning_rate, const float l1_regularization,
+                                                            const float l2_regularization,  // T = half, S = int
+                                                            const float learning_rate_power, const bool use_locking,
+                                                            half *variable, half *accumulation, half *linear,
+                                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSparseApplyFtrl<half, int64_t>(const half *gradient, const int64_t *indices,
+                                                                const int num_index, const size_t n_stride,
+                                                                const float learning_rate,  // T = half, S = int64_t
+                                                                const float l1_regularization,
+                                                                const float l2_regularization,
+                                                                const float learning_rate_power, const bool use_locking,
+                                                                half *variable, half *accumulation, half *linear,
+                                                                cudaStream_t cuda_stream);
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh
new file mode 100644
index 00000000000..6020ad2eb27
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: gradient/state element type; S: index element type.
+CUDA_LIB_EXPORT void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index, const size_t n_stride,
+                                        const float learning_rate, const float l1_regularization,
+                                        const float l2_regularization, const float learning_rate_power,
+                                        const bool use_locking, T *variable, T *accumulation, T *linear,
+                                        cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu
new file mode 100755
index 00000000000..ef8916db09f
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh"
+#include "include/cuda_fp16.h"
+template <typename T>
+__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis,
+                      const int all_size_axis, const T* input, T** outputs) {
+  // Scatter: input[pos] is copied to outputs[block][block_pos]; every thread strides over pos by the grid size.
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    int num = pos % all_size_before_axis / all_size_axis;  // slice number inside the current outer chunk
+    int block = num / axis_step;                           // output buffer that receives this slice
+    size_t block_pos = pos / all_size_before_axis * axis_step * all_size_axis +  // size_t: no int truncation
+                       num % axis_step * all_size_axis + pos % all_size_axis;
+    outputs[block][block_pos] = input[pos];
+  }
+}
+
+template <typename T>
+void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                 const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
+  // Host launcher: enqueue the Split kernel on cuda_stream with the GET_BLOCKS(size) / GET_THREADS launch shape.
+  Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    size, axis_step, all_size_before_axis, all_size_axis, input, outputs);
+}
+
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const half* input, half** outputs,
+                                          cudaStream_t cuda_stream);  // T = half
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const float* input, float** outputs,
+                                          cudaStream_t cuda_stream);  // T = float
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const double* input, double** outputs,
+                                          cudaStream_t cuda_stream);  // T = double
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const int* input, int** outputs,
+                                          cudaStream_t cuda_stream);  // T = int
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const uint32_t* input, uint32_t** outputs,
+                                          cudaStream_t cuda_stream);  // T = uint32_t
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const int64_t* input, int64_t** outputs,
+                                          cudaStream_t cuda_stream);  // T = int64_t
+template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                          const int all_size_axis, const bool* input, bool** outputs,
+                                          cudaStream_t cuda_stream);  // T = bool
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh
index 7ca1593be6f..e3d1f9386e4 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh
@@ -14,11 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                 const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
+CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
+                                 const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu
similarity index 82%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu
index 7feb03a78bc..2d511cb56fe 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T>
 __global__ void SquareSumAllKernel(const size_t size, const T* input_addr_0, const T* input_addr_1,
@@ -92,9 +92,9 @@ void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* inpu
   AssignKernel<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, output_addr_0, output_addr_1, ws_addr_0, ws_addr_1);
 }
 
-template void SquareSumAll(const size_t input_size_, const half* input_addr_0, const half* input_addr_1,
-                  half* output_addr_0, half* output_addr_1, float* ws_addr_0, float* ws_addr_1,
-                  cudaStream_t cuda_stream);
-template void SquareSumAll(const size_t input_size_, const float* input_addr_0, const float* input_addr_1,
-                  float* output_addr_0, float* output_addr_1, float* ws_addr_0, float* ws_addr_1,
-                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const half* input_addr_0, const half* input_addr_1,
+                                           half* output_addr_0, half* output_addr_1, float* ws_addr_0, float* ws_addr_1,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const float* input_addr_0,
+                                           const float* input_addr_1, float* output_addr_0, float* output_addr_1,
+                                           float* ws_addr_0, float* ws_addr_1, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh
similarity index 50%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh
index 81e10d1d49e..207840f5121 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh
@@ -14,14 +14,12 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* input_addr_1,
+                                  T* output_addr_0, T* output_addr_1, float* ws_addr_0, float* ws_addr_1,
+                                  cudaStream_t cuda_stream);
 
-template <typename T>
-void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu
new file mode 100644
index 00000000000..ea2f85f6ed4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu
@@ -0,0 +1,219 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh"
+
+const int kWarpSize = 32;  // lanes per warp: divisor for warpId/laneId and start of the shuffle offsets
+const int kNumWarps = 32;  // length of the per-block __shared__ scratch arrays (one slot per warp)
+
+__inline__ __device__ float HalfFloatInputConvert(const half val) { return __half2float(val); }  // half -> float
+__inline__ __device__ float HalfFloatInputConvert(const float val) { return val; }  // float: returned unchanged
+__inline__ __device__ void HalfFloatOutputAssign(const float val, float *arr, int idx) { arr[idx] = val; }  // as-is
+__inline__ __device__ void HalfFloatOutputAssign(const float val, half *arr, int idx) { arr[idx] = __float2half(val); }
+
+// One block per channel: block `plane` accumulates sum(dy) and sum((x - mean) * dy) over that channel's N*H*W
+// elements of the N x C x H x W input and thread 0 stores them in dy_sum_local / dot_p_local. saved_invstd is unused.
+template <typename T, typename G>
+__global__ void SyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy,
+                                     G *saved_mean, G *saved_invstd, float *dy_sum_local, float *dot_p_local) {
+  // per-block scratch: one partial sum per warp
+  __shared__ float shared_dy[kNumWarps];
+  __shared__ float shared_dot_p[kNumWarps];
+  int warpId = threadIdx.x / kWarpSize;  // threads are arranged in warps of 32 executed together
+  int laneId = threadIdx.x % kWarpSize;
+
+  int plane = blockIdx.x;  // this block only works on a single plane (index along C)
+  const size_t plane_size = N * H * W;  // size_t: this extent and the index below can exceed INT_MAX
+  float mean = static_cast<float>(saved_mean[plane]);
+
+  if (threadIdx.x < kNumWarps) {
+    shared_dy[threadIdx.x] = static_cast<float>(0);
+    shared_dot_p[threadIdx.x] = static_cast<float>(0);
+  }
+  __syncthreads();  // ensure all 0 init complete across all values
+
+  float dy_sum = static_cast<float>(0);
+  float dot_p = static_cast<float>(0);
+
+  // individual thread level reduction: thread t handles elements t, t + blockDim.x, ... of the plane
+  for (size_t x = threadIdx.x; x < plane_size; x += blockDim.x) {
+    const size_t index = (x / (H * W) * C * H * W) + (plane * H * W) + (x % (H * W));
+    float input_value = HalfFloatInputConvert(x_input[index]);
+    float dy_value = HalfFloatInputConvert(dy[index]);
+    dy_sum += dy_value;
+    dot_p += (input_value - mean) * dy_value;
+  }
+  __syncthreads();
+  // warp reduce: lane 0 of each warp ends with the warp total (full mask assumes whole warps - TODO confirm)
+  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
+    float other_dy_sum = __shfl_down_sync(0xffffffff, dy_sum, offset);
+    float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset);
+    dy_sum += other_dy_sum;
+    dot_p += other_dot_p;
+  }
+  __syncwarp();
+  if (laneId == 0) {
+    shared_dy[warpId] = dy_sum;
+    shared_dot_p[warpId] = dot_p;
+    // one value per warp now
+  }
+  __syncthreads();
+  if (warpId == 0) {
+    dy_sum = shared_dy[laneId];
+    dot_p = shared_dot_p[laneId];
+    __syncwarp();
+    for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
+      float other_dy = __shfl_down_sync(0xffffffff, dy_sum, offset);
+      float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset);
+      dy_sum += other_dy;
+      dot_p += other_dot_p;
+    }
+    __syncwarp();
+  }
+  if (threadIdx.x == 0) {
+    dy_sum_local[plane] = dy_sum;
+    dot_p_local[plane] = dot_p;
+  }
+}
+
+// Writes dx[pos] = (dy[pos] - (x[pos] - mean) * proj_scale - grad_mean) * invstd * scale from dy_sum_red / dot_p_red.
+// dscale, dbias and epsilon are unused here. NOTE(review): HalfFloatOutputAssign still takes an int index.
+template <typename T, typename S, typename G>
+__global__ void SyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx,
+                                      G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale,
+                                      S *dscale, S *dbias, float epsilon) {
+  const size_t size = N * C * H * W;  // size_t: an int total overflows past INT_MAX and corrupts `pos < size`
+  const size_t plane_size = N * H * W;
+  const float div_factor = 1.0f / static_cast<float>(plane_size);  // loop-invariant 1 / (N * H * W)
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    const size_t block_num = (pos / W) / H;  // which of N * C blocks
+    const size_t plane = block_num % C;
+    float mean = HalfFloatInputConvert(saved_mean[plane]);
+    float invstd = HalfFloatInputConvert(saved_invstd[plane]);
+    float scale_value = HalfFloatInputConvert(scale[plane]);
+    float grad_mean = dy_sum_red[plane] * div_factor;
+    float proj_scale = dot_p_red[plane] * div_factor * invstd * invstd;
+    float grad_scale = invstd * scale_value;
+    float inp = HalfFloatInputConvert(x_input[pos]);
+    float proj = (inp - mean) * proj_scale;
+    HalfFloatOutputAssign((HalfFloatInputConvert(dy[pos]) - proj - grad_mean) * grad_scale, dx, pos);
+  }
+}
+
+template <typename S, typename G>
+__global__ void SyncBatchNormGradPostScaleBias(size_t C, G *saved_invstd, float *dy_sum_red, float *dot_p_red,
+                                               S *dscale, S *dbias) {
+  // Per channel: dscale = dot_p_red * invstd and dbias = dy_sum_red; threads step over channels by the grid size.
+  const size_t stride = blockDim.x * gridDim.x;
+  for (size_t ch = blockIdx.x * blockDim.x + threadIdx.x; ch < C; ch += stride) {
+    const float inv_std = HalfFloatInputConvert(saved_invstd[ch]);
+    dscale[ch] = static_cast<S>(dot_p_red[ch] * inv_std);
+    dbias[ch] = static_cast<S>(dy_sum_red[ch]);
+  }
+}
+
+template <typename T, typename G>
+void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean,
+                             G *saved_invstd, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream) {
+  // Host launcher: C blocks (one per plane) of GET_THREADS threads, enqueued on cuda_stream.
+  SyncBatchNormGradPre<<<C, GET_THREADS, 0, cuda_stream>>>(
+    N, C, H, W, x_input, dy, saved_mean, saved_invstd, dy_sum_local, dot_p_local);
+}
+template <typename T, typename S, typename G>
+void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx,
+                              G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale,
+                              S *dbias, float epsilon, cudaStream_t cuda_stream) {  // both launches use cuda_stream
+  SyncBatchNormGradPost<<<C, GET_THREADS, 0, cuda_stream>>>(N, C, H, W, x_input, dy, dx, saved_mean, saved_invstd,
+                                                            dy_sum_red, dot_p_red, scale, dscale, dbias, epsilon);
+  SyncBatchNormGradPostScaleBias<<<GET_BLOCKS(C), std::min(C, static_cast<size_t>(GET_THREADS)), 0, cuda_stream>>>(
+    C, saved_invstd, dy_sum_red, dot_p_red, dscale, dbias);  // second launch: writes dscale and dbias per channel
+}
+// Explicit instantiations of CalSyncBatchNormGradPre<T, G> for every T, G in {float, half}.
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre<float, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                    const float *x_input, const float *dy,
+                                                                    float *saved_mean, float *saved_invstd,
+                                                                    float *dy_sum_local, float *dot_p_local,
+                                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre<float, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                   const float *x_input, const float *dy,
+                                                                   half *saved_mean, half *saved_invstd,
+                                                                   float *dy_sum_local, float *dot_p_local,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre<half, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                   const half *x_input, const half *dy,
+                                                                   float *saved_mean, float *saved_invstd,
+                                                                   float *dy_sum_local, float *dot_p_local,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre<half, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                  const half *x_input, const half *dy, half *saved_mean,
+                                                                  half *saved_invstd, float *dy_sum_local,
+                                                                  float *dot_p_local, cudaStream_t cuda_stream);
+// Explicit instantiations of CalSyncBatchNormGradPost<T, S, G> for every T, S, G in {float, half}.
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<float, float, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                            const float *x_input, const float *dy,
+                                                                            float *dx, float *saved_mean,
+                                                                            float *saved_invstd, float *dy_sum_red,
+                                                                            float *dot_p_red, float *scale,
+                                                                            float *dscale, float *dbias, float epsilon,
+                                                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<half, float, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                           const half *x_input, const half *dy,
+                                                                           half *dx, float *saved_mean,
+                                                                           float *saved_invstd, float *dy_sum_red,
+                                                                           float *dot_p_red, float *scale,
+                                                                           float *dscale, float *dbias, float epsilon,
+                                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<float, half, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                           const float *x_input, const float *dy,
+                                                                           float *dx, float *saved_mean,
+                                                                           float *saved_invstd, float *dy_sum_red,
+                                                                           float *dot_p_red, half *scale, half *dscale,
+                                                                           half *dbias, float epsilon,
+                                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<half, half, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                          const half *x_input, const half *dy, half *dx,
+                                                                          float *saved_mean, float *saved_invstd,
+                                                                          float *dy_sum_red, float *dot_p_red,
+                                                                          half *scale, half *dscale, half *dbias,
+                                                                          float epsilon, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<float, float, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                           const float *x_input, const float *dy,
+                                                                           float *dx, half *saved_mean,
+                                                                           half *saved_invstd, float *dy_sum_red,
+                                                                           float *dot_p_red, float *scale,
+                                                                           float *dscale, float *dbias, float epsilon,
+                                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<half, float, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                          const half *x_input, const half *dy, half *dx,
+                                                                          half *saved_mean, half *saved_invstd,
+                                                                          float *dy_sum_red, float *dot_p_red,
+                                                                          float *scale, float *dscale, float *dbias,
+                                                                          float epsilon, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<float, half, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                          const float *x_input, const float *dy,
+                                                                          float *dx, half *saved_mean,
+                                                                          half *saved_invstd, float *dy_sum_red,
+                                                                          float *dot_p_red, half *scale, half *dscale,
+                                                                          half *dbias, float epsilon,
+                                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost<half, half, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                         const half *x_input, const half *dy, half *dx,
+                                                                         half *saved_mean, half *saved_invstd,
+                                                                         float *dy_sum_red, float *dot_p_red,
+                                                                         half *scale, half *dscale, half *dbias,
+                                                                         float epsilon, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh
new file mode 100644
index 00000000000..efaa72e91b9
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename G>
+CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy,
+                                             G *saved_mean, G *saved_invstd, float *dy_sum_local, float *dot_p_local,
+                                             cudaStream_t cuda_stream);  // defined in sync_batch_norm_grad_impl.cu
+template <typename T, typename S, typename G>
+CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy,
+                                              T *dx, G *saved_mean, G *saved_invstd, float *dy_sum_red,
+                                              float *dot_p_red, S *scale, S *dscale, S *dbias, float epsilon,
+                                              cudaStream_t cuda_stream);  // defined in sync_batch_norm_grad_impl.cu
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu
index e4126b8bf8c..52e1d11f735 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu
@@ -15,9 +15,8 @@
  */
 
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh"
 
 const int kWarpSize = 32;
 const int kNumWarps = 32;
@@ -199,50 +198,64 @@ void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input
   return;
 }
 
-template void CalSyncBatchNormPre<float>(size_t N, size_t C, size_t H, size_t W, const float *input, int *output_n,
-                                         float *output_mean, float *output_var, float epsilon,
-                                         cudaStream_t cuda_stream);
-template void CalSyncBatchNormPre<half>(size_t N, size_t C, size_t H, size_t W, const half *input, int *output_n,
-                                        float *output_mean, float *output_var, float epsilon, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPre<float>(size_t N, size_t C, size_t H, size_t W, const float *input,
+                                                         int *output_n, float *output_mean, float *output_var,
+                                                         float epsilon, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPre<half>(size_t N, size_t C, size_t H, size_t W, const half *input,
+                                                        int *output_n, float *output_mean, float *output_var,
+                                                        float epsilon, cudaStream_t cuda_stream);
 
-template void CalSyncBatchNormGather<float, float>(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global,
-                                                   float *means_global, float *invstds_global, int *counts_local,
-                                                   float *means_local, float *invstds_local, float *running_mean_output,
-                                                   float *running_var_output, float *running_mean_input,
-                                                   float *running_var_input, float epsilon, float momentum,
-                                                   size_t group_rank, size_t group_size, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGather<float, half>(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global,
-                                                  float *means_global, float *invstds_global, int *counts_local,
-                                                  float *means_local, float *invstds_local, float *running_mean_output,
-                                                  float *running_var_output, half *running_mean_input,
-                                                  half *running_var_input, float epsilon, float momentum,
-                                                  size_t group_rank, size_t group_size, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGather<half, float>(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global,
-                                                  float *means_global, float *invstds_global, int *counts_local,
-                                                  float *means_local, float *invstds_local, half *running_mean_output,
-                                                  half *running_var_output, float *running_mean_input,
-                                                  float *running_var_input, float epsilon, float momentum,
-                                                  size_t group_rank, size_t group_size, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGather<half, half>(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global,
-                                                 float *means_global, float *invstds_global, int *counts_local,
-                                                 float *means_local, float *invstds_local, half *running_mean_output,
-                                                 half *running_var_output, half *running_mean_input,
-                                                 half *running_var_input, float epsilon, float momentum,
-                                                 size_t group_rank, size_t group_size, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGather<float, float>(size_t N_, size_t C_, size_t H_, size_t W_,
+                                                                   int *counts_global, float *means_global,
+                                                                   float *invstds_global, int *counts_local,
+                                                                   float *means_local, float *invstds_local,
+                                                                   float *running_mean_output,
+                                                                   float *running_var_output, float *running_mean_input,
+                                                                   float *running_var_input, float epsilon,
+                                                                   float momentum, size_t group_rank, size_t group_size,
+                                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGather<float, half>(size_t N_, size_t C_, size_t H_, size_t W_,
+                                                                  int *counts_global, float *means_global,
+                                                                  float *invstds_global, int *counts_local,
+                                                                  float *means_local, float *invstds_local,
+                                                                  float *running_mean_output, float *running_var_output,
+                                                                  half *running_mean_input, half *running_var_input,
+                                                                  float epsilon, float momentum, size_t group_rank,
+                                                                  size_t group_size, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGather<half, float>(size_t N_, size_t C_, size_t H_, size_t W_,
+                                                                  int *counts_global, float *means_global,
+                                                                  float *invstds_global, int *counts_local,
+                                                                  float *means_local, float *invstds_local,
+                                                                  half *running_mean_output, half *running_var_output,
+                                                                  float *running_mean_input, float *running_var_input,
+                                                                  float epsilon, float momentum, size_t group_rank,
+                                                                  size_t group_size, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormGather<half, half>(size_t N_, size_t C_, size_t H_, size_t W_,
+                                                                 int *counts_global, float *means_global,
+                                                                 float *invstds_global, int *counts_local,
+                                                                 float *means_local, float *invstds_local,
+                                                                 half *running_mean_output, half *running_var_output,
+                                                                 half *running_mean_input, half *running_var_input,
+                                                                 float epsilon, float momentum, size_t group_rank,
+                                                                 size_t group_size, cudaStream_t cuda_stream);
 
-template void CalSyncBatchNormPost<float, float>(size_t N, size_t C, size_t H, size_t W, const float *input,
-                                                 float *output, float *means_local, float *invstds_local, float *scale,
-                                                 float *bias, float *output_scale, float *output_bias, float epsilon,
-                                                 cudaStream_t cuda_stream);
-template void CalSyncBatchNormPost<float, half>(size_t N, size_t C, size_t H, size_t W, const float *input,
-                                                float *output, float *means_local, float *invstds_local, half *scale,
-                                                half *bias, half *output_scale, half *output_bias, float epsilon,
-                                                cudaStream_t cuda_stream);
-template void CalSyncBatchNormPost<half, float>(size_t N, size_t C, size_t H, size_t W, const half *input, half *output,
-                                                float *means_local, float *invstds_local, float *scale, float *bias,
-                                                float *output_scale, float *output_bias, float epsilon,
-                                                cudaStream_t cuda_stream);
-template void CalSyncBatchNormPost<half, half>(size_t N, size_t C, size_t H, size_t W, const half *input, half *output,
-                                               float *means_local, float *invstds_local, half *scale, half *bias,
-                                               half *output_scale, half *output_bias, float epsilon,
-                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPost<float, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                 const float *input, float *output, float *means_local,
+                                                                 float *invstds_local, float *scale, float *bias,
+                                                                 float *output_scale, float *output_bias, float epsilon,
+                                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPost<float, half>(size_t N, size_t C, size_t H, size_t W,
+                                                                const float *input, float *output, float *means_local,
+                                                                float *invstds_local, half *scale, half *bias,
+                                                                half *output_scale, half *output_bias, float epsilon,
+                                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPost<half, float>(size_t N, size_t C, size_t H, size_t W,
+                                                                const half *input, half *output, float *means_local,
+                                                                float *invstds_local, float *scale, float *bias,
+                                                                float *output_scale, float *output_bias, float epsilon,
+                                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalSyncBatchNormPost<half, half>(size_t N, size_t C, size_t H, size_t W,
+                                                               const half *input, half *output, float *means_local,
+                                                               float *invstds_local, half *scale, half *bias,
+                                                               half *output_scale, half *output_bias, float epsilon,
+                                                               cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh
new file mode 100644
index 00000000000..4bcf420dbfc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>  // T: element type of input; instantiated for float and half in sync_batch_norm_impl.cu.
+CUDA_LIB_EXPORT void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const T *input, int *output_n,
+                                         float *means_local, float *invstds_local, float epsilon,
+                                         cudaStream_t cuda_stream);
+template <typename T, typename G>  // T: running_*_output type; G: running_*_input type; each is float or half.
+CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N, size_t C, size_t H, size_t W, int *counts_global,
+                                            float *means_global, float *invstds_global, int *counts_local,
+                                            float *means_local, float *invstds_local, T *running_mean_output,
+                                            T *running_var_output, G *running_mean_input, G *running_var_input,
+                                            float epsilon, float momentum, size_t group_rank, size_t group_size,
+                                            cudaStream_t cuda_stream);
+template <typename T, typename S>  // T: input/output type; S: scale/bias type; each is float or half.
+CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input, T *output,
+                                          float *means_local, float *invstds_local, S *scale, S *bias, S *output_scale,
+                                          S *output_bias, float epsilon, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu
new file mode 100644
index 00000000000..d5c89fc3c08
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu
@@ -0,0 +1,95 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+// Scatter-add kernel: for each element of `update` (input_size elements), resolve its destination in `output`
+// from the matching row of `indices` and accumulate it with MsAtomicAdd. Out-of-range rows are skipped silently.
+template <typename T, typename S>
+__global__ void TensorScatterAddKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
+                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
+                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    // size_t (not int) so the row/offset split cannot truncate when input_size exceeds INT_MAX.
+    const size_t i = read_index / block_size;
+    const size_t j = read_index % block_size;
+    size_t write_index = 0;
+    bool out_bound = false;
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      // S is signed: a negative coordinate must be rejected here, otherwise it wraps in the size_t arithmetic
+      // below and can alias a valid output slot.
+      out_bound |= indices_i < 0 || indices_i >= work_shape[k];
+      write_index += indices_i * indices_stride[k];
+    }
+    write_index += j;
+    out_bound |= write_index >= output_size;
+    if (!out_bound) {
+      MsAtomicAdd(&output[write_index], update[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                      S *indices_stride, S *work_shape, cudaStream_t stream) {
+  // Host-side launcher: enqueues TensorScatterAddKernel on `stream`; the grid size is GET_BLOCKS(output_size).
+  TensorScatterAddKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    input, indices, update, output, block_size, input_size, output_size,
+    indices_dim_0, indices_dim_1, indices_stride, work_shape);
+}
+// Exported instantiations. int index: half, float, double, char, unsigned char, int. int64_t index: double only.
+template CUDA_LIB_EXPORT void TensorScatterAdd<half, int>(half *input, int *indices, half *update, half *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<float, int>(float *input, int *indices, float *update, float *output,
+                                                           const size_t &block_size, const size_t &input_size,
+                                                           const size_t &output_size, const size_t &indices_dim_0,
+                                                           const size_t &indices_dim_1, int *indices_stride,
+                                                           int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<double, int>(double *input, int *indices, double *update, double *output,
+                                                            const size_t &block_size, const size_t &input_size,
+                                                            const size_t &output_size, const size_t &indices_dim_0,
+                                                            const size_t &indices_dim_1, int *indices_stride,
+                                                            int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<char, int>(char *input, int *indices, char *update, char *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<unsigned char, int>(unsigned char *input, int *indices,
+                                                                   unsigned char *update, unsigned char *output,
+                                                                   const size_t &block_size, const size_t &input_size,
+                                                                   const size_t &output_size,
+                                                                   const size_t &indices_dim_0,
+                                                                   const size_t &indices_dim_1, int *indices_stride,
+                                                                   int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<int, int>(int *input, int *indices, int *update, int *output,
+                                                         const size_t &block_size, const size_t &input_size,
+                                                         const size_t &output_size, const size_t &indices_dim_0,
+                                                         const size_t &indices_dim_1, int *indices_stride,
+                                                         int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterAdd<double, int64_t>(double *input, int64_t *indices, double *update,
+                                                                double *output, const size_t &block_size,
+                                                                const size_t &input_size, const size_t &output_size,
+                                                                const size_t &indices_dim_0,
+                                                                const size_t &indices_dim_1, int64_t *indices_stride,
+                                                                int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh
new file mode 100644
index 00000000000..25c84cabe52
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T, typename S>  // T: element type of input/update/output; S: integer type of the index buffers.
+CUDA_LIB_EXPORT void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size,
+                                      const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
+                                      const size_t &indices_dim_1, S *indices_stride, S *work_shape,
+                                      cudaStream_t stream);  // defined and instantiated in tensor_scatter_add.cu
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu
new file mode 100644
index 00000000000..fe2e52bcb68
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+// Scatter-max kernel: for each element of `update` (input_size elements), resolve its destination in `output`
+// from the matching row of `indices` and combine it with MsAtomicMax. Out-of-range rows are skipped silently.
+template <typename T, typename S>
+__global__ void TensorScatterMaxKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
+                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
+                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    // size_t (not int) so the row/offset split cannot truncate when input_size exceeds INT_MAX.
+    const size_t i = read_index / block_size;
+    const size_t j = read_index % block_size;
+    size_t write_index = 0;
+    bool out_bound = false;
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      // S is signed: a negative coordinate must be rejected here, otherwise it wraps in the size_t arithmetic
+      // below and can alias a valid output slot.
+      out_bound |= indices_i < 0 || indices_i >= work_shape[k];
+      write_index += indices_i * indices_stride[k];
+    }
+    write_index += j;
+    out_bound |= write_index >= output_size;
+    if (!out_bound) {
+      MsAtomicMax(&output[write_index], update[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                      S *indices_stride, S *work_shape, cudaStream_t stream) {
+  // Host-side launcher: enqueues TensorScatterMaxKernel on `stream`; the grid size is GET_BLOCKS(output_size).
+  TensorScatterMaxKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    input, indices, update, output, block_size, input_size, output_size,
+    indices_dim_0, indices_dim_1, indices_stride, work_shape);
+}
+
+// Exported explicit instantiations for int (32-bit) indices.
+template CUDA_LIB_EXPORT void TensorScatterMax<half, int>(half *input, int *indices, half *update, half *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<float, int>(float *input, int *indices, float *update, float *output,
+                                                           const size_t &block_size, const size_t &input_size,
+                                                           const size_t &output_size, const size_t &indices_dim_0,
+                                                           const size_t &indices_dim_1, int *indices_stride,
+                                                           int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<char, int>(char *input, int *indices, char *update, char *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<unsigned char, int>(unsigned char *input, int *indices,
+                                                                   unsigned char *update, unsigned char *output,
+                                                                   const size_t &block_size, const size_t &input_size,
+                                                                   const size_t &output_size,
+                                                                   const size_t &indices_dim_0,
+                                                                   const size_t &indices_dim_1, int *indices_stride,
+                                                                   int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<int, int>(int *input, int *indices, int *update, int *output,
+                                                         const size_t &block_size, const size_t &input_size,
+                                                         const size_t &output_size, const size_t &indices_dim_0,
+                                                         const size_t &indices_dim_1, int *indices_stride,
+                                                         int *work_shape, cudaStream_t stream);
+
+// Exported explicit instantiations for int64_t indices.
+template CUDA_LIB_EXPORT void TensorScatterMax<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<float, int64_t>(float *input, int64_t *indices, float *update,
+                                                               float *output, const size_t &block_size,
+                                                               const size_t &input_size, const size_t &output_size,
+                                                               const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                               int64_t *indices_stride, int64_t *work_shape,
+                                                               cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<unsigned char, int64_t>(unsigned char *input, int64_t *indices,
+                                                                       unsigned char *update, unsigned char *output,
+                                                                       const size_t &block_size,
+                                                                       const size_t &input_size,
+                                                                       const size_t &output_size,
+                                                                       const size_t &indices_dim_0,
+                                                                       const size_t &indices_dim_1,
+                                                                       int64_t *indices_stride, int64_t *work_shape,
+                                                                       cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMax<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int64_t *indices_stride,
+                                                             int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh
new file mode 100644
index 00000000000..f8cff09de25
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Host launcher for the scatter-max op (declaration only; its definition is expected in tensor_scatter_max.cu).
+template <typename T, typename S>
+CUDA_LIB_EXPORT void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size,
+                                      const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
+                                      const size_t &indices_dim_1, S *indices_stride, S *work_shape,
+                                      cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu
new file mode 100644
index 00000000000..222119ba3e9
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+template <typename T, typename S>
+__global__ void TensorScatterMinKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
+                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
+                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
+  // Grid-stride loop over the input_size update elements; each one is folded into output[] with MsAtomicMin.
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t, not int: keeps the quotient/remainder exact when read_index exceeds INT_MAX.
+    const size_t i = read_index / block_size;  // row of `indices` that addresses this element
+    const size_t j = read_index % block_size;  // offset inside the block_size-wide slice being written
+    // Turn the index tuple into a flat offset; a negative or too-large component drops the whole update.
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      out_bound |= (indices_i < 0) || (indices_i >= work_shape[k]);
+      write_index += indices_i * indices_stride[k];
+    }
+    // Second guard on the finished flat offset so nothing is ever written at or past output_size.
+    write_index += j;
+    out_bound |= write_index >= output_size;
+    // Out-of-range updates are skipped silently; no error is reported back to the host.
+    if (!out_bound) {
+      MsAtomicMin(&output[write_index], update[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                      S *indices_stride, S *work_shape, cudaStream_t stream) {
+  // Host-side launcher: queues TensorScatterMinKernel on `stream`; the launch result is not checked here.
+  TensorScatterMinKernel<T, S><<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
+    work_shape);
+}
+
+// Explicit instantiations for int32 (`int`) indices; the template body is only visible in this .cu file.
+template CUDA_LIB_EXPORT void TensorScatterMin<half, int>(half *input, int *indices, half *update, half *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<float, int>(float *input, int *indices, float *update, float *output,
+                                                           const size_t &block_size, const size_t &input_size,
+                                                           const size_t &output_size, const size_t &indices_dim_0,
+                                                           const size_t &indices_dim_1, int *indices_stride,
+                                                           int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<char, int>(char *input, int *indices, char *update, char *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<unsigned char, int>(unsigned char *input, int *indices,
+                                                                   unsigned char *update, unsigned char *output,
+                                                                   const size_t &block_size, const size_t &input_size,
+                                                                   const size_t &output_size,
+                                                                   const size_t &indices_dim_0,
+                                                                   const size_t &indices_dim_1, int *indices_stride,
+                                                                   int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<int, int>(int *input, int *indices, int *update, int *output,
+                                                         const size_t &block_size, const size_t &input_size,
+                                                         const size_t &output_size, const size_t &indices_dim_0,
+                                                         const size_t &indices_dim_1, int *indices_stride,
+                                                         int *work_shape, cudaStream_t stream);
+
+// Explicit instantiations for int64 indices (same value types as the int32 set above).
+template CUDA_LIB_EXPORT void TensorScatterMin<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<float, int64_t>(float *input, int64_t *indices, float *update,
+                                                               float *output, const size_t &block_size,
+                                                               const size_t &input_size, const size_t &output_size,
+                                                               const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                               int64_t *indices_stride, int64_t *work_shape,
+                                                               cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<unsigned char, int64_t>(unsigned char *input, int64_t *indices,
+                                                                       unsigned char *update, unsigned char *output,
+                                                                       const size_t &block_size,
+                                                                       const size_t &input_size,
+                                                                       const size_t &output_size,
+                                                                       const size_t &indices_dim_0,
+                                                                       const size_t &indices_dim_1,
+                                                                       int64_t *indices_stride, int64_t *work_shape,
+                                                                       cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterMin<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int64_t *indices_stride,
+                                                             int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh
new file mode 100644
index 00000000000..0cafc15e10a
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Host launcher for the scatter-min op; defined and explicitly instantiated in tensor_scatter_min.cu.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size,
+                                      const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
+                                      const size_t &indices_dim_1, S *indices_stride, S *work_shape,
+                                      cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu
new file mode 100644
index 00000000000..3d78b6db76e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+template <typename T, typename S>
+__global__ void TensorScatterSubKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
+                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
+                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
+  // Grid-stride loop over the input_size update elements; each one is subtracted from output[] with MsAtomicSub.
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t, not int: keeps the quotient/remainder exact when read_index exceeds INT_MAX.
+    const size_t i = read_index / block_size;  // row of `indices` that addresses this element
+    const size_t j = read_index % block_size;  // offset inside the block_size-wide slice being written
+    // Turn the index tuple into a flat offset; a negative or too-large component drops the whole update.
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      out_bound |= (indices_i < 0) || (indices_i >= work_shape[k]);
+      write_index += indices_i * indices_stride[k];
+    }
+    // Second guard on the finished flat offset so nothing is ever written at or past output_size.
+    write_index += j;
+    out_bound |= write_index >= output_size;
+    // Out-of-range updates are skipped silently; no error is reported back to the host.
+    if (!out_bound) {
+      MsAtomicSub(&output[write_index], update[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                      S *indices_stride, S *work_shape, cudaStream_t stream) {
+  // Host-side launcher: queues TensorScatterSubKernel on `stream`; the launch result is not checked here.
+  TensorScatterSubKernel<T, S><<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
+    work_shape);
+}
+
+// Explicit instantiations for int32 (`int`) indices; the template body is only visible in this .cu file.
+template CUDA_LIB_EXPORT void TensorScatterSub<half, int>(half *input, int *indices, half *update, half *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<float, int>(float *input, int *indices, float *update, float *output,
+                                                           const size_t &block_size, const size_t &input_size,
+                                                           const size_t &output_size, const size_t &indices_dim_0,
+                                                           const size_t &indices_dim_1, int *indices_stride,
+                                                           int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<char, int>(char *input, int *indices, char *update, char *output,
+                                                          const size_t &block_size, const size_t &input_size,
+                                                          const size_t &output_size, const size_t &indices_dim_0,
+                                                          const size_t &indices_dim_1, int *indices_stride,
+                                                          int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<unsigned char, int>(unsigned char *input, int *indices,
+                                                                   unsigned char *update, unsigned char *output,
+                                                                   const size_t &block_size, const size_t &input_size,
+                                                                   const size_t &output_size,
+                                                                   const size_t &indices_dim_0,
+                                                                   const size_t &indices_dim_1, int *indices_stride,
+                                                                   int *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<int, int>(int *input, int *indices, int *update, int *output,
+                                                         const size_t &block_size, const size_t &input_size,
+                                                         const size_t &output_size, const size_t &indices_dim_0,
+                                                         const size_t &indices_dim_1, int *indices_stride,
+                                                         int *work_shape, cudaStream_t stream);
+
+// Explicit instantiations for int64 indices (same value types as the int32 set above).
+template CUDA_LIB_EXPORT void TensorScatterSub<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<float, int64_t>(float *input, int64_t *indices, float *update,
+                                                               float *output, const size_t &block_size,
+                                                               const size_t &input_size, const size_t &output_size,
+                                                               const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                               int64_t *indices_stride, int64_t *work_shape,
+                                                               cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int64_t *indices_stride,
+                                                              int64_t *work_shape, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<unsigned char, int64_t>(unsigned char *input, int64_t *indices,
+                                                                       unsigned char *update, unsigned char *output,
+                                                                       const size_t &block_size,
+                                                                       const size_t &input_size,
+                                                                       const size_t &output_size,
+                                                                       const size_t &indices_dim_0,
+                                                                       const size_t &indices_dim_1,
+                                                                       int64_t *indices_stride, int64_t *work_shape,
+                                                                       cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void TensorScatterSub<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int64_t *indices_stride,
+                                                             int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh
new file mode 100644
index 00000000000..82d85071398
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+// Host launcher for the scatter-sub op; defined and explicitly instantiated in tensor_scatter_sub.cu.
+template <typename T, typename S>
+CUDA_LIB_EXPORT void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size,
+                                      const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
+                                      const size_t &indices_dim_1, S *indices_stride, S *work_shape,
+                                      cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu
new file mode 100644
index 00000000000..78f40034b37
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+template <typename T, typename S>
+__global__ void TensorScatterUpdateKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
+                                          const size_t input_size, const size_t output_size, const size_t indices_dim_0,
+                                          const size_t indices_dim_1, S *indices_stride, S *work_shape) {
+  // Grid-stride loop over the input_size update elements; each one overwrites a single element of output[].
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+    // size_t, not int: keeps the quotient/remainder exact when read_index exceeds INT_MAX.
+    const size_t i = read_index / block_size;  // row of `indices` that addresses this element
+    const size_t j = read_index % block_size;  // offset inside the block_size-wide slice being written
+    // Turn the index tuple into a flat offset; a negative or too-large component drops the whole update.
+    for (size_t k = 0; k < indices_dim_1; k++) {
+      S indices_i = indices[i * indices_dim_1 + k];
+      out_bound |= (indices_i < 0) || (indices_i >= work_shape[k]);
+      write_index += indices_i * indices_stride[k];
+    }
+    // Second guard on the finished flat offset so nothing is ever written at or past output_size.
+    write_index += j;
+    out_bound |= write_index >= output_size;
+    // Plain store, no atomics: when several updates hit the same offset, the one that survives is unspecified.
+    if (!out_bound) {
+      output[write_index] = update[read_index];
+    }
+  }
+}
+
+template <typename T, typename S>
+void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
+                         const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+                         S *indices_stride, S *work_shape, cudaStream_t stream) {
+  // Host-side launcher: queues TensorScatterUpdateKernel on `stream`; the launch result is not checked here.
+  TensorScatterUpdateKernel<T, S><<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
+    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
+    work_shape);
+}
+// Explicit instantiations (int32-index variants first, then int64); no other (T, S) pair is defined in this file.
+template CUDA_LIB_EXPORT void TensorScatterUpdate<half, int>(half *input, int *indices, half *update, half *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int *indices_stride,
+                                                             int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<float, int>(float *input, int *indices, float *update, float *output,
+                                                              const size_t &block_size, const size_t &input_size,
+                                                              const size_t &output_size, const size_t &indices_dim_0,
+                                                              const size_t &indices_dim_1, int *indices_stride,
+                                                              int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<double, int>(double *input, int *indices, double *update,
+                                                               double *output, const size_t &block_size,
+                                                               const size_t &input_size, const size_t &output_size,
+                                                               const size_t &indices_dim_0, const size_t &indices_dim_1,
+                                                               int *indices_stride, int *work_shape,
+                                                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<char, int>(char *input, int *indices, char *update, char *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int *indices_stride,
+                                                             int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<unsigned char, int>(unsigned char *input, int *indices,
+                                                                      unsigned char *update, unsigned char *output,
+                                                                      const size_t &block_size,
+                                                                      const size_t &input_size,
+                                                                      const size_t &output_size,
+                                                                      const size_t &indices_dim_0,
+                                                                      const size_t &indices_dim_1, int *indices_stride,
+                                                                      int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<int, int>(int *input, int *indices, int *update, int *output,
+                                                            const size_t &block_size, const size_t &input_size,
+                                                            const size_t &output_size, const size_t &indices_dim_0,
+                                                            const size_t &indices_dim_1, int *indices_stride,
+                                                            int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<bool, int>(bool *input, int *indices, bool *update, bool *output,
+                                                             const size_t &block_size, const size_t &input_size,
+                                                             const size_t &output_size, const size_t &indices_dim_0,
+                                                             const size_t &indices_dim_1, int *indices_stride,
+                                                             int *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<bool, int64_t>(bool *input, int64_t *indices, bool *update,
+                                                                 bool *output, const size_t &block_size,
+                                                                 const size_t &input_size, const size_t &output_size,
+                                                                 const size_t &indices_dim_0,
+                                                                 const size_t &indices_dim_1, int64_t *indices_stride,
+                                                                 int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<float, int64_t>(float *input, int64_t *indices, float *update,
+                                                                  float *output, const size_t &block_size,
+                                                                  const size_t &input_size, const size_t &output_size,
+                                                                  const size_t &indices_dim_0,
+                                                                  const size_t &indices_dim_1, int64_t *indices_stride,
+                                                                  int64_t *work_shape, cudaStream_t stream);
+template CUDA_LIB_EXPORT void TensorScatterUpdate<double, int64_t>(double *input, int64_t *indices, double *update,
+                                                                   double *output, const size_t &block_size,
+                                                                   const size_t &input_size, const size_t &output_size,
+                                                                   const size_t &indices_dim_0,
+                                                                   const size_t &indices_dim_1, int64_t *indices_stride,
+                                                                   int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh
new file mode 100644
index 00000000000..4c216307d30
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename T, typename S>
+CUDA_LIB_EXPORT void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size,
+                                         const size_t &input_size, const size_t &output_size,
+                                         const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
+                                         S *work_shape, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu
new file mode 100644
index 00000000000..25c84ca0027
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu
@@ -0,0 +1,81 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh"
+#include "include/cuda_fp16.h"
+
+template <typename T>
+__global__ void Tile(const size_t output_size, const size_t input_size, const size_t shape_size,
+                     const size_t *input_shape, const size_t *output_shape, const T *input, T *output) {
+  // Every output element copies the input element whose coordinate on each axis is the output
+  // coordinate modulo that axis' input extent. shape_size is the number of axes; input_shape and
+  // output_shape are each indexed from 0 to shape_size - 1. A flat output index decomposes as, for example 4-D:
+  //   pos = coord[0] * output_shape[1] * output_shape[2] * output_shape[3] +
+  //         coord[1] * output_shape[2] * output_shape[3] +
+  //         coord[2] * output_shape[3] +
+  //         coord[3]
+  // Each coordinate is folded into the input offset as soon as it is decoded, so the kernel keeps no
+  // per-thread coordinate array and its rank is not limited by TILE_MAX_DIMENSION.
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) {
+    size_t remaining = pos;              // part of pos not yet assigned to an axis
+    size_t output_stride = output_size;  // becomes the flat step of the current output axis
+    size_t input_stride = input_size;    // becomes the flat step of the current input axis
+    size_t input_pos = 0;
+    // Walk the axes outermost first, shrinking both strides by the extent of the axis just entered.
+    for (size_t i = 0; i < shape_size; i++) {
+      output_stride /= output_shape[i];
+      input_stride /= input_shape[i];
+      const size_t coord = remaining / output_stride;
+      remaining -= coord * output_stride;
+      // Wrap the output coordinate into the input extent before accumulating the input offset.
+      input_pos += (coord % input_shape[i]) * input_stride;
+    }
+    output[pos] = input[input_pos];
+  }
+}
+
+template <typename T>
+void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape,
+             const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream) {
+  // Enqueue the Tile kernel on cuda_stream; the kernel itself indexes input_shape and output_shape.
+  Tile<<<GET_BLOCKS(output_size), GET_THREADS, 0, cuda_stream>>>(output_size, input_size, shape_size, input_shape,
+                                                                 output_shape, input, output);
+}
+
+template CUDA_LIB_EXPORT void CalTile<double>(const size_t output_size, const size_t input_size,
+                                              const size_t shape_size, const size_t *input_shape,
+                                              const size_t *output_shape, const double *input, double *output,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<float>(const size_t output_size, const size_t input_size, const size_t shape_size,
+                                             const size_t *input_shape, const size_t *output_shape, const float *input,
+                                             float *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<half>(const size_t output_size, const size_t input_size, const size_t shape_size,
+                                            const size_t *input_shape, const size_t *output_shape, const half *input,
+                                            half *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<int16_t>(const size_t output_size, const size_t input_size,
+                                               const size_t shape_size, const size_t *input_shape,
+                                               const size_t *output_shape, const int16_t *input, int16_t *output,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<int>(const size_t output_size, const size_t input_size, const size_t shape_size,
+                                           const size_t *input_shape, const size_t *output_shape, const int *input,
+                                           int *output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<int64_t>(const size_t output_size, const size_t input_size,
+                                               const size_t shape_size, const size_t *input_shape,
+                                               const size_t *output_shape, const int64_t *input, int64_t *output,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTile<bool>(const size_t output_size, const size_t input_size, const size_t shape_size,
+                                            const size_t *input_shape, const size_t *output_shape, const bool *input,
+                                            bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh
index 3709254aaf5..6816f0e21d4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
-
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define TILE_MAX_DIMENSION 100
 
 template <typename T>
-void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta, T *output,
+CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size,
+                             const size_t *input_shape, const size_t *output_shape, const T *input, T *output,
                              cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu
similarity index 93%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu
index 6e21b3918fb..e56af86ba13 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu
@@ -14,10 +14,11 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh"
 #include <limits>
 #include <algorithm>
+#include "include/cuda_fp16.h"
 
 const int kMaxQueue = 128;
 
@@ -221,7 +222,7 @@ void FastTopK(const int outer_size, const int inner_size, const T *input, S k_cu
   }
 }
 
-template void FastTopK(const int outer_size, const int inner_size, const half *input, int k_cut, half *output,
-                       int *output_index, const half init_K, cudaStream_t stream);
-template void FastTopK(const int outer_size, const int inner_size, const float *input, int k_cut, float *output,
-                       int *output_index, const float init_K, cudaStream_t stream);
+template CUDA_LIB_EXPORT void FastTopK(const int outer_size, const int inner_size, const half *input, int k_cut,
+                                       half *output, int *output_index, const half init_K, cudaStream_t stream);
+template CUDA_LIB_EXPORT void FastTopK(const int outer_size, const int inner_size, const float *input, int k_cut,
+                                       float *output, int *output_index, const float init_K, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh
index a1ca24d09cb..894beb8196b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh
@@ -14,14 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T, typename S>
-void FastTopK(const int outer, const int inner, const T *input_addr, S k_cut, T *output, S *indices, const T initK,
-              cudaStream_t stream);
+CUDA_LIB_EXPORT void FastTopK(const int outer, const int inner, const T *input_addr, S k_cut, T *output, S *indices,
+                              const T initK, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh
similarity index 100%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu
new file mode 100755
index 00000000000..a2de4534940
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include "include/cuda_fp16.h"
+#include "transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+
+template <typename T>
+using Complex = mindspore::utils::Complex<T>;
+
+template <typename T>
+__global__ void Transpose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,
+                          const size_t shape_size, T *output) {
+  // Scatter each input element to its permuted position. A flat input index decomposes as, for example 4-D:
+  //   pos = coord[0] * input_shape[1] * input_shape[2] * input_shape[3] +
+  //         coord[1] * input_shape[2] * input_shape[3] +
+  //         coord[2] * input_shape[3] +
+  //         coord[3]
+  // Output axis k is input axis input_axis[k], hence its extent is input_shape[input_axis[k]].
+  size_t coord[TRANSPOSE_MAX_DIMENSION];
+
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    // Peel the per-axis coordinates off pos, outermost axis first.
+    size_t remainder = pos;
+    size_t stride = size / input_shape[0];
+    coord[0] = remainder / stride;
+    for (size_t axis = 1; axis < shape_size; axis++) {
+      remainder -= coord[axis - 1] * stride;
+      stride /= input_shape[axis];
+      coord[axis] = remainder / stride;
+    }
+
+    // Re-encode with the permuted axes, starting from the innermost output axis.
+    size_t out_pos = coord[input_axis[shape_size - 1]];
+    size_t out_stride = 1;
+    for (int64_t j = shape_size - 2; j >= 0; j--) {
+      out_stride *= input_shape[input_axis[j + 1]];
+      out_pos += coord[input_axis[j]] * out_stride;
+    }
+    output[out_pos] = input[pos];
+  }
+}
+template <typename T>
+void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,
+                  const size_t shape_size, T *output, cudaStream_t cuda_stream) {
+  // Enqueue the Transpose kernel on cuda_stream; the kernel itself indexes input_shape and input_axis.
+  Transpose<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, input_shape, input_axis, shape_size,
+                                                               output);
+}
+
+template CUDA_LIB_EXPORT void CalTranspose<double>(const size_t size, const double *input, const size_t *input_shape,
+                                                   const size_t *input_axis, const size_t shape_size, double *output,
+                                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<float>(const size_t size, const float *input, const size_t *input_shape,
+                                                  const size_t *input_axis, const size_t shape_size, float *output,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<half>(const size_t size, const half *input, const size_t *input_shape,
+                                                 const size_t *input_axis, const size_t shape_size, half *output,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<int>(const size_t size, const int *input, const size_t *input_shape,
+                                                const size_t *input_axis, const size_t shape_size, int *output,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<int64_t>(const size_t size, const int64_t *input, const size_t *input_shape,
+                                                    const size_t *input_axis, const size_t shape_size, int64_t *output,
+                                                    cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<Complex<float>>(const size_t size, const Complex<float> *input,
+                                                           const size_t *input_shape, const size_t *input_axis,
+                                                           const size_t shape_size, Complex<float> *output,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalTranspose<Complex<double>>(const size_t size, const Complex<double> *input,
+                                                            const size_t *input_shape, const size_t *input_axis,
+                                                            const size_t shape_size, Complex<double> *output,
+                                                            cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh
index c1aec6e0486..8c1d9360229 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh
@@ -14,12 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #define TRANSPOSE_MAX_DIMENSION 100
 template <typename T>
-void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,
-                  const size_t shape_size, T *output, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CalTranspose(const size_t size, const T *input, const size_t *input_shape,
+                                  const size_t *input_axis, const size_t shape_size, T *output,
+                                  cudaStream_t cuda_stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu
similarity index 71%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu
index fd0b1b6203a..2c303bcbc38 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu
@@ -21,7 +21,7 @@
 #include <utility>
 #include <algorithm>
 #include "transpose_impl_opt.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "include/cuda_fp16.h"
 
 // Optimize nchw2nhwc && nhwc2nchw with tiling and shared memory.
 // Firstly, combined 2 dims hw together, treat input and output as 3D tensor.
@@ -255,44 +255,52 @@ void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T *
                        d_output, cuda_stream);
 }
 
-template void CalNHWC2NCHWInterface<double>(const size_t size, const size_t shape_size, const double *d_input,
-                                           const size_t *input_shape, const size_t *input_axis,
-                                           const size_t *d_input_shape, const size_t *d_input_axis, double *d_output,
-                                           cudaStream_t cuda_stream);
-template void CalNHWC2NCHWInterface<float>(const size_t size, const size_t shape_size, const float *d_input,
-                                           const size_t *input_shape, const size_t *input_axis,
-                                           const size_t *d_input_shape, const size_t *d_input_axis, float *d_output,
-                                           cudaStream_t cuda_stream);
-template void CalNHWC2NCHWInterface<half>(const size_t size, const size_t shape_size, const half *d_input,
-                                          const size_t *input_shape, const size_t *input_axis,
-                                          const size_t *d_input_shape, const size_t *d_input_axis, half *d_output,
-                                          cudaStream_t cuda_stream);
-template void CalNHWC2NCHWInterface<int>(const size_t size, const size_t shape_size, const int *d_input,
-                                         const size_t *input_shape, const size_t *input_axis,
-                                         const size_t *d_input_shape, const size_t *d_input_axis, int *d_output,
-                                         cudaStream_t cuda_stream);
-template void CalNHWC2NCHWInterface<int64_t>(const size_t size, const size_t shape_size, const int64_t *d_input,
-                                             const size_t *input_shape, const size_t *input_axis,
-                                             const size_t *d_input_shape, const size_t *d_input_axis, int64_t *d_output,
-                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface<double>(const size_t size, const size_t shape_size,
+                                                            const double *d_input, const size_t *input_shape,
+                                                            const size_t *input_axis, const size_t *d_input_shape,
+                                                            const size_t *d_input_axis, double *d_output,
+                                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface<float>(const size_t size, const size_t shape_size,
+                                                           const float *d_input, const size_t *input_shape,
+                                                           const size_t *input_axis, const size_t *d_input_shape,
+                                                           const size_t *d_input_axis, float *d_output,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface<half>(const size_t size, const size_t shape_size,
+                                                          const half *d_input, const size_t *input_shape,
+                                                          const size_t *input_axis, const size_t *d_input_shape,
+                                                          const size_t *d_input_axis, half *d_output,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface<int>(const size_t size, const size_t shape_size, const int *d_input,
+                                                         const size_t *input_shape, const size_t *input_axis,
+                                                         const size_t *d_input_shape, const size_t *d_input_axis,
+                                                         int *d_output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface<int64_t>(const size_t size, const size_t shape_size,
+                                                             const int64_t *d_input, const size_t *input_shape,
+                                                             const size_t *input_axis, const size_t *d_input_shape,
+                                                             const size_t *d_input_axis, int64_t *d_output,
+                                                             cudaStream_t cuda_stream);
 
-template void CalNCHW2NHWCInterface<double>(const size_t size, const size_t shape_size, const double *d_input,
-                                           const size_t *input_shape, const size_t *input_axis,
-                                           const size_t *d_input_shape, const size_t *d_input_axis, double *d_output,
-                                           cudaStream_t cuda_stream);
-template void CalNCHW2NHWCInterface<float>(const size_t size, const size_t shape_size, const float *d_input,
-                                           const size_t *input_shape, const size_t *input_axis,
-                                           const size_t *d_input_shape, const size_t *d_input_axis, float *d_output,
-                                           cudaStream_t cuda_stream);
-template void CalNCHW2NHWCInterface<half>(const size_t size, const size_t shape_size, const half *d_input,
-                                          const size_t *input_shape, const size_t *input_axis,
-                                          const size_t *d_input_shape, const size_t *d_input_axis, half *d_output,
-                                          cudaStream_t cuda_stream);
-template void CalNCHW2NHWCInterface<int>(const size_t size, const size_t shape_size, const int *d_input,
-                                         const size_t *input_shape, const size_t *input_axis,
-                                         const size_t *d_input_shape, const size_t *d_input_axis, int *d_output,
-                                         cudaStream_t cuda_stream);
-template void CalNCHW2NHWCInterface<int64_t>(const size_t size, const size_t shape_size, const int64_t *d_input,
-                                             const size_t *input_shape, const size_t *input_axis,
-                                             const size_t *d_input_shape, const size_t *d_input_axis, int64_t *d_output,
-                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface<double>(const size_t size, const size_t shape_size,
+                                                            const double *d_input, const size_t *input_shape,
+                                                            const size_t *input_axis, const size_t *d_input_shape,
+                                                            const size_t *d_input_axis, double *d_output,
+                                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface<float>(const size_t size, const size_t shape_size,
+                                                           const float *d_input, const size_t *input_shape,
+                                                           const size_t *input_axis, const size_t *d_input_shape,
+                                                           const size_t *d_input_axis, float *d_output,
+                                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface<half>(const size_t size, const size_t shape_size,
+                                                          const half *d_input, const size_t *input_shape,
+                                                          const size_t *input_axis, const size_t *d_input_shape,
+                                                          const size_t *d_input_axis, half *d_output,
+                                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface<int>(const size_t size, const size_t shape_size, const int *d_input,
+                                                         const size_t *input_shape, const size_t *input_axis,
+                                                         const size_t *d_input_shape, const size_t *d_input_axis,
+                                                         int *d_output, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface<int64_t>(const size_t size, const size_t shape_size,
+                                                             const int64_t *d_input, const size_t *input_shape,
+                                                             const size_t *input_axis, const size_t *d_input_shape,
+                                                             const size_t *d_input_axis, int64_t *d_output,
+                                                             cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh
new file mode 100644
index 00000000000..87efa082f45
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#define TRANSPOSE_MAX_DIMENSION 100
+template <typename T>
+CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const T *d_input,
+                                           const size_t *input_shape, const size_t *input_axis,
+                                           const size_t *d_input_shape, const size_t *d_input_axis, T *output,
+                                           cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T *d_input,
+                                           const size_t *input_shape, const size_t *input_axis,
+                                           const size_t *d_input_shape, const size_t *d_input_axis, T *output,
+                                           cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu
similarity index 64%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu
index 593fddfbaff..a36cfb788ab 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu
@@ -15,6 +15,7 @@
  */
 
 #include "triangle_matrix_copy_impl.cuh"
+#include "include/cuda_fp16.h"
 template <typename T>
 __global__ void TriangleMatrixCopyKernel(const T *input, T *output, bool clean, cublasFillMode_t uplo,
                                          const size_t count, const size_t ldb, const size_t m) {
@@ -57,14 +58,16 @@ void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t
   return;
 }
 
-template void TriangleMatrixCopy<float>(const float *input, float *output, bool clean, cublasFillMode_t uplo,
-                                        const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream);
-template void TriangleMatrixCopy<half>(const half *input, half *output, bool clean, cublasFillMode_t uplo,
-                                       const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TriangleMatrixCopy<float>(const float *input, float *output, bool clean,
+                                                        cublasFillMode_t uplo, const size_t count, const size_t ldb,
+                                                        const size_t m, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TriangleMatrixCopy<half>(const half *input, half *output, bool clean,
+                                                       cublasFillMode_t uplo, const size_t count, const size_t ldb,
+                                                       const size_t m, cudaStream_t cuda_stream);
 
-template void TriangleMatrixCopy<double>(const double *input, double *output, bool clean, cublasFillMode_t uplo,
-                                         const size_t count, const size_t ldb, const size_t m,
-                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void TriangleMatrixCopy<double>(const double *input, double *output, bool clean,
+                                                         cublasFillMode_t uplo, const size_t count, const size_t ldb,
+                                                         const size_t m, cudaStream_t cuda_stream);
 
 template <typename T>
 void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
@@ -72,6 +75,9 @@ void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda
   return;
 }
 
-template void MatrixCopy<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void MatrixCopy<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void MatrixCopy<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixCopy<float>(const float *input, float *output, const size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixCopy<half>(const half *input, half *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void MatrixCopy<double>(const double *input, double *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh
new file mode 100644
index 00000000000..838bbc0a90d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t uplo,
+                                        const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream);
+
+template <typename T>
+CUDA_LIB_EXPORT void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu
index 4beb1e0c58a..7e51f0f9b19 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu
@@ -15,6 +15,7 @@
  */
 
 #include "unary_op_grad_impl.cuh"
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __global__ void SqrtGradKernel(const T *input, const T *dout, T *output, const size_t count) {
@@ -170,52 +171,52 @@ void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count
   return;
 }
 
-template void SqrtGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void RsqrtGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                                cudaStream_t cuda_stream);
-template void AsinGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void ACosGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void AtanGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void AsinhGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                                cudaStream_t cuda_stream);
-template void AcoshGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                                cudaStream_t cuda_stream);
-template void ReciprocalGrad<double>(const double *input, const double *dout, double *output, const size_t count,
-                                     cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SqrtGrad<double>(const double *input, const double *dout, double *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RsqrtGrad<double>(const double *input, const double *dout, double *output,
+                                                const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinGrad<double>(const double *input, const double *dout, double *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACosGrad<double>(const double *input, const double *dout, double *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AtanGrad<double>(const double *input, const double *dout, double *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinhGrad<double>(const double *input, const double *dout, double *output,
+                                                const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AcoshGrad<double>(const double *input, const double *dout, double *output,
+                                                const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReciprocalGrad<double>(const double *input, const double *dout, double *output,
+                                                     const size_t count, cudaStream_t cuda_stream);
 
-template void SqrtGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void RsqrtGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void AsinGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void ACosGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void AtanGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void AsinhGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void AcoshGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                               cudaStream_t cuda_stream);
-template void ReciprocalGrad<float>(const float *input, const float *dout, float *output, const size_t count,
-                                    cudaStream_t cuda_stream);
-template void SqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                             cudaStream_t cuda_stream);
-template void RsqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void AsinGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                             cudaStream_t cuda_stream);
-template void ACosGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                             cudaStream_t cuda_stream);
-template void AtanGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                             cudaStream_t cuda_stream);
-template void AsinhGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void AcoshGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                              cudaStream_t cuda_stream);
-template void ReciprocalGrad<half>(const half *input, const half *dout, half *output, const size_t count,
-                                   cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SqrtGrad<float>(const float *input, const float *dout, float *output,
+                                              const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RsqrtGrad<float>(const float *input, const float *dout, float *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinGrad<float>(const float *input, const float *dout, float *output,
+                                              const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACosGrad<float>(const float *input, const float *dout, float *output,
+                                              const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AtanGrad<float>(const float *input, const float *dout, float *output,
+                                              const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinhGrad<float>(const float *input, const float *dout, float *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AcoshGrad<float>(const float *input, const float *dout, float *output,
+                                               const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReciprocalGrad<float>(const float *input, const float *dout, float *output,
+                                                    const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void SqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void RsqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACosGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AtanGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AsinhGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void AcoshGrad<half>(const half *input, const half *dout, half *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ReciprocalGrad<half>(const half *input, const half *dout, half *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh
new file mode 100755
index 00000000000..bac30717450
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T>
+CUDA_LIB_EXPORT void SqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void AsinGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void ACosGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void AtanGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void AsinhGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void AcoshGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count,
+                                    cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu
new file mode 100755
index 00000000000..ee324347eba
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu
@@ -0,0 +1,920 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "unary_op_impl.cuh"
+#include "include/cuda_fp16.h"
+// Generic case: each thread stores expf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void ExponentialKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = expf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with exp.
+template <>
+__global__ void ExponentialKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = exp(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hexp.
+template <>
+__global__ void ExponentialKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hexp(input[idx]);
+  }
+}
+// Generic case: each thread stores expm1f(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void Expm1Kernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = expm1f(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with expm1.
+template <>
+__global__ void Expm1Kernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = expm1(input[idx]);
+  }
+}
+// Generic case: each thread stores logf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void LogarithmKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = logf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with log.
+template <>
+__global__ void LogarithmKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = log(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hlog.
+template <>
+__global__ void LogarithmKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hlog(input[idx]);
+  }
+}
+// Generic case: each thread stores log1pf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void Log1pKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = log1pf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with log1p.
+template <>
+__global__ void Log1pKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = log1p(input[idx]);
+  }
+}
+// Generic case: each thread stores erff(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void ErfKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = erff(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with erf.
+template <>
+__global__ void ErfKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = erf(input[idx]);
+  }
+}
+// Generic case: each thread stores erfcf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void ErfcKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = erfcf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with erfc.
+template <>
+__global__ void ErfcKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = erfc(input[idx]);
+  }
+}
+// Negates every element by multiplying it with a T-typed -1 and storing the product in output.
+template <typename T>
+__global__ void NegativeKernel(const T *input, T *output, const size_t count) {
+  const T minus_one = -1;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = minus_one * input[idx];
+  }
+}
+// Stores 1 / input[idx] into output[idx]; the numerator is a T-typed constant built from 1.0.
+template <typename T>
+__global__ void ReciprocalKernel(const T *input, T *output, const size_t count) {
+  const T unit = 1.0;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = unit / input[idx];
+  }
+}
+// Stores input[idx] * input[idx] into output[idx] for each index below count handled by this thread.
+template <typename T>
+__global__ void SquareKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx] * input[idx];
+  }
+}
+// Generic case: each thread stores sqrtf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void SqrtKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = sqrtf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with sqrt.
+template <>
+__global__ void SqrtKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = sqrt(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hsqrt.
+template <>
+__global__ void SqrtKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hsqrt(input[idx]);
+  }
+}
+// Generic case: each thread stores rsqrtf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void RsqrtKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = rsqrtf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with rsqrt.
+template <>
+__global__ void RsqrtKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = rsqrt(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hrsqrt.
+template <>
+__global__ void RsqrtKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hrsqrt(input[idx]);
+  }
+}
+// Generic case: each thread stores sinf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void SinKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = sinf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with sin.
+template <>
+__global__ void SinKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = sin(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hsin.
+template <>
+__global__ void SinKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hsin(input[idx]);
+  }
+}
+// Generic case: each thread stores asinf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void AsinKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = asinf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with asin.
+template <>
+__global__ void AsinKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = asin(input[idx]);
+  }
+}
+// Generic case: each thread stores asinhf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void AsinhKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = asinhf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with asinh.
+template <>
+__global__ void AsinhKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = asinh(input[idx]);
+  }
+}
+// Generic case: each thread stores cosf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void CosKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = cosf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with cos.
+template <>
+__global__ void CosKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = cos(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hcos.
+template <>
+__global__ void CosKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hcos(input[idx]);
+  }
+}
+// Generic case: each thread stores acosf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void ACosKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = acosf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with acos.
+template <>
+__global__ void ACosKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = acos(input[idx]);
+  }
+}
+// Generic case: each thread stores acoshf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void AcoshKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = acoshf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with acosh.
+template <>
+__global__ void AcoshKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = acosh(input[idx]);
+  }
+}
+// Generic case: each thread stores atanf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void AtanKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = atanf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with atan.
+template <>
+__global__ void AtanKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = atan(input[idx]);
+  }
+}
+// Generic case: each thread stores abs(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void AbsKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = abs(input[idx]);
+  }
+}
+// half specialization: negates the element when it compares below zero, otherwise copies it unchanged.
+template <>
+__global__ void AbsKernel(const half *input, half *output, const size_t count) {
+  const half zero = 0.0;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx] < zero ? -input[idx] : input[idx];
+  }
+}
+// Overload taking Complex<T> input and producing T output through abs(input[idx]).
+template <typename T>
+__global__ void AbsKernel(const Complex<T> *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = abs(input[idx]);
+  }
+}
+// Complex<T> overload: stores input[idx].real() into the T-typed output.
+template <typename T>
+__global__ void RealKernel(const Complex<T> *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx].real();
+  }
+}
+// Non-complex overload: copies input[idx] to output[idx] unchanged.
+template <typename T>
+__global__ void RealKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx];
+  }
+}
+// Complex<T> overload: stores input[idx].imag() into the T-typed output.
+template <typename T>
+__global__ void ImagKernel(const Complex<T> *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx].imag();
+  }
+}
+// Non-complex overload: input is never read; every visited output element is set to a T-typed zero.
+template <typename T>
+__global__ void ImagKernel(const T *input, T *output, const size_t count) {
+  const T zero = 0;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = zero;
+  }
+}
+// Complex<T> overload: builds Complex<T>(real, -imag) from each input element and stores it in output.
+template <typename T>
+__global__ void ConjKernel(const Complex<T> *input, Complex<T> *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = Complex<T>(input[idx].real(), -input[idx].imag());
+  }
+}
+// Non-complex overload: copies input[idx] to output[idx] unchanged.
+template <typename T>
+__global__ void ConjKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = input[idx];
+  }
+}
+// Generic case: each thread stores floorf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void FloorKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = floorf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with floor.
+template <>
+__global__ void FloorKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = floor(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hfloor.
+template <>
+__global__ void FloorKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hfloor(input[idx]);
+  }
+}
+// Generic case: each thread stores rintf(input[idx]) into output[idx] for its indices below count.
+template <typename T>
+__global__ void RintKernel(const T *input, T *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = rintf(input[idx]);
+  }
+}
+// double specialization: same traversal, evaluated with rint.
+template <>
+__global__ void RintKernel(const double *input, double *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = rint(input[idx]);
+  }
+}
+// half specialization: same traversal, evaluated with hrint.
+template <>
+__global__ void RintKernel(const half *input, half *output, const size_t count) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += blockDim.x * gridDim.x) {
+    output[idx] = hrint(input[idx]);
+  }
+}
+// Generic round kernel: each element is passed through single-precision nearbyintf() and stored back as T.
+// Each thread starts at its global x index and advances by the total number of x threads until `count`.
+template <typename T>
+__global__ void RoundKernel(const T *input, T *output, const size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = nearbyintf(input[i]);
+  }
+  return;
+}
+// int specialization: an integer is already integral, so rounding is the identity. Routing it through
+// nearbyintf() would convert int -> float -> int, and float cannot represent every 32-bit integer
+// (magnitudes above 2^24 get rounded), so the value is copied directly instead.
+template <>
+__global__ void RoundKernel(const int *input, int *output, const size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = input[i];
+  }
+  return;
+}
+// double specialization of RoundKernel: applies the double-precision nearbyint() to each element.
+template <>
+__global__ void RoundKernel(const double *input, double *output, const size_t count) {
+  const size_t step = blockDim.x * gridDim.x;
+  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (size_t idx = first; idx < count; idx += step) {
+    const double value = input[idx];
+    output[idx] = nearbyint(value);
+  }
+}
+// Sign kernel: writes 1 for elements greater than zero, -1 for elements less than zero, and 0 otherwise
+// (which includes any value that compares neither greater nor less than zero).
+template <typename T>
+__global__ void SignKernel(const T *input, T *output, const size_t count) {
+  const T zero = 0.0;
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += step) {
+    const T value = input[idx];
+    T sign;
+    if (value > zero) {
+      sign = 1;
+    } else if (value < zero) {
+      sign = -1;
+    } else {
+      sign = 0;
+    }
+    output[idx] = sign;
+  }
+}
+// Host launcher: enqueues ExponentialKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ExponentialKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues Expm1Kernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  Expm1Kernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues LogarithmKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  LogarithmKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues Log1pKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  Log1pKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues ErfKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ErfKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues ErfcKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ErfcKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues NegativeKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  NegativeKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues ReciprocalKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ReciprocalKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues SquareKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  SquareKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues PowKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Pow(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  PowKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues SqrtKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  SqrtKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues SinKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  SinKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues CosKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  CosKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues AsinKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AsinKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues ACosKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ACosKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues AtanKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AtanKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues AsinhKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AsinhKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues AcoshKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AcoshKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues RsqrtKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  RsqrtKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (same input/output element type): enqueues AbsKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AbsKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (Complex<T> input, T output): enqueues AbsKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Abs(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  AbsKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (Complex<T> input, T output): enqueues RealKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Real(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  RealKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (same input/output element type): enqueues RealKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  RealKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (Complex<T> input, T output): enqueues ImagKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Imag(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ImagKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (same input/output element type): enqueues ImagKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ImagKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (Complex<T> input and output): enqueues ConjKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Conj(const Complex<T> *input, Complex<T> *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ConjKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher (same input/output element type): enqueues ConjKernel on `cuda_stream` for `count` elements.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  ConjKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues FloorKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  FloorKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues RintKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  RintKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues RoundKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  RoundKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+// Host launcher: enqueues SignKernel on `cuda_stream` for `count` elements of `input`/`output`.
+// Grid and block sizes are taken from the project's GET_BLOCKS / GET_THREADS macros.
+template <typename T>
+void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
+  const auto grid_size = GET_BLOCKS(count);
+  const auto block_size = GET_THREADS;
+  SignKernel<<<grid_size, block_size, 0, cuda_stream>>>(input, output, count);
+}
+
+// double: explicit instantiations of the host launchers defined above, each tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void Exponential<double>(const double *input, double *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<double>(const double *input, double *output, const size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<double>(const double *input, double *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<double>(const double *input, double *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<double>(const double *input, double *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<double>(const double *input, double *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<double>(const double *input, double *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<double>(const double *input, double *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<double>(const double *input, double *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<double>(const double *input, double *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<double>(const double *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+
+
+// float: explicit instantiations of the host launchers defined above, each tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void Exponential<float>(const float *input, float *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<float>(const float *input, float *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<float>(const float *input, float *output, const size_t count,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<float>(const float *input, float *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<float>(const float *input, float *output, const size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<float>(const float *input, float *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<float>(const float *input, float *output, const size_t count,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<float>(const float *input, float *output, const size_t count,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<float>(const float *input, float *output, const size_t count,
+                                         cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<float>(const float *input, float *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<float>(const float *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+
+// half: explicit instantiations of the host launchers defined above, each tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void Exponential<half>(const half *input, half *output, const size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<half>(const half *input, half *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<half>(const half *input, half *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<half>(const half *input, half *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<half>(const half *input, half *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<half>(const half *input, half *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
+
+// int8, instantiated with `char`. NOTE(review): `char` is a distinct type from `signed char`; confirm callers use it.
+template CUDA_LIB_EXPORT void Exponential<char>(const char *input, char *output, const size_t count,
+                                                cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<char>(const char *input, char *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<char>(const char *input, char *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<char>(const char *input, char *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<char>(const char *input, char *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<char>(const char *input, char *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
+
+// uint8: explicit instantiations for `unsigned char` of the host launchers above, each tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void Exponential<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                         const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                       const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                      const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                        const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                    const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                 cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<unsigned char>(const unsigned char *input, unsigned char *output,
+                                                   const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
+                                                  cudaStream_t cuda_stream);
+
+// int32: explicit instantiations for `int` of the host launchers defined above, each tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void Exponential<int>(const int *input, int *output, const size_t count,
+                                               cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Expm1<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Logarithm<int>(const int *input, int *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Log1p<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erf<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Erfc<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Negative<int>(const int *input, int *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Reciprocal<int>(const int *input, int *output, const size_t count,
+                                              cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Square<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sqrt<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sin<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Cos<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asin<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void ACos<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Atan<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Asinh<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Acosh<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rsqrt<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Abs<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Floor<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Rint<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Round<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Sign<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Real<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
+
+// complex64: Real/Imag read Complex<float> and write float; Conj reads and writes Complex<float>.
+template CUDA_LIB_EXPORT void Real<float>(const Complex<float> *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<float>(const Complex<float> *input, float *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<float>(const Complex<float> *input, Complex<float> *output, const size_t count,
+                                          cudaStream_t cuda_stream);
+
+// complex128: Real/Imag read Complex<double> and write double; Conj reads and writes Complex<double>.
+template CUDA_LIB_EXPORT void Real<double>(const Complex<double> *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<double>(const Complex<double> *input, double *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<double>(const Complex<double> *input, Complex<double> *output, const size_t count,
+                                           cudaStream_t cuda_stream);
+
+// bool: same-type (T in, T out) overloads; this group instantiates Real and Imag only, not Conj.
+template CUDA_LIB_EXPORT void Real<bool>(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<bool>(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream);
+
+// int16: same-type (T in, T out) overloads of Real, Imag and Conj.
+template CUDA_LIB_EXPORT void Real<int16_t>(const int16_t *input, int16_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<int16_t>(const int16_t *input, int16_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<int16_t>(const int16_t *input, int16_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+
+// uint16: same-type (T in, T out) overloads of Real, Imag and Conj.
+template CUDA_LIB_EXPORT void Real<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+
+// uint32: same-type (T in, T out) overloads of Real, Imag and Conj.
+template CUDA_LIB_EXPORT void Real<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+
+// int64: same-type (T in, T out) overloads of Real, Imag and Conj.
+template CUDA_LIB_EXPORT void Real<int64_t>(const int64_t *input, int64_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<int64_t>(const int64_t *input, int64_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<int64_t>(const int64_t *input, int64_t *output, const size_t count,
+                                            cudaStream_t cuda_stream);
+
+// uint64: same-type (T in, T out) overloads of Real, Imag and Conj.
+template CUDA_LIB_EXPORT void Real<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Imag<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void Conj<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count,
+                                             cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh
new file mode 100755
index 00000000000..1ff160f77a7
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"  // presumably declares Complex<T> - confirm
+template <typename T>  // Every declaration in this header takes (input, output, count, cuda_stream).
+CUDA_LIB_EXPORT void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>
+CUDA_LIB_EXPORT void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // Complex<T> input, T output
+CUDA_LIB_EXPORT void Real(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // same-type overload: T input, T output
+CUDA_LIB_EXPORT void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // Complex<T> input, T output
+CUDA_LIB_EXPORT void Imag(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // same-type overload: T input, T output
+CUDA_LIB_EXPORT void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // Complex<T> input, Complex<T> output
+CUDA_LIB_EXPORT void Conj(const Complex<T> *input, Complex<T> *output, const size_t count, cudaStream_t cuda_stream);
+template <typename T>  // same-type overload: T input, T output
+CUDA_LIB_EXPORT void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu
similarity index 73%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu
index d020a4fc9fd..be203aeb085 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh"
 
 template <typename S>
 __global__ void AssignToOutput(const int64_t size, const S prob_val, S *output_array) {
@@ -31,6 +31,7 @@ void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampl
                                                                            sampled_expected_count);
 }
 
-template void CalUniformCandidateSampler<float>(const int64_t true_size, const int64_t num_sampled,
-                                                const float prob_val, float *true_expected_count,
-                                                float *sampled_expected_count, cudaStream_t cuda_stream);
+// Explicit instantiation for S = float, tagged CUDA_LIB_EXPORT.
+template CUDA_LIB_EXPORT void CalUniformCandidateSampler<float>(
+    const int64_t true_size, const int64_t num_sampled, const float prob_val, float *true_expected_count,
+    float *sampled_expected_count, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh
new file mode 100644
index 00000000000..575c8258833
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+
+template <typename S>  // S: type of prob_val and of both expected-count buffers
+CUDA_LIB_EXPORT void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, const S prob_val,
+                                                S *true_expected_count, S *sampled_expected_count,
+                                                cudaStream_t cuda_stream);  // defined in the matching .cu file
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu
similarity index 73%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu
index 1ecffbc9b06..e604fc3d8ad 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu
@@ -23,7 +23,6 @@
 #include <thrust/unique.h>
 #include <algorithm>
 #include "unique_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "include/cuda_fp16.h"
 
 template <typename T, typename S>
@@ -66,11 +65,14 @@ int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index,
   return output_size;
 }
 
-template int CalUnique<float, int>(const float *input, int num_elements, int *input_index, int *sorted_index,
-                                    float *output, int *index, cudaStream_t cuda_stream);
-template int CalUnique<half, int>(const half *input, int num_elements, int *input_index, int *sorted_index,
-                                   half *output, int *index, cudaStream_t cuda_stream);
-template int CalUnique<int, int>(const int *input, int num_elements, int *input_index, int *sorted_index,
-                                  int *output, int *index, cudaStream_t cuda_stream);
-template int CalUnique<int64_t, int64_t>(const int64_t *input, int num_elements, int64_t *input_index,
-  int64_t *sorted_index, int64_t *output, int64_t *index, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalUnique<float, int>(const float *input, int num_elements, int *input_index,
+                                                   int *sorted_index, float *output, int *index,
+                                                   cudaStream_t cuda_stream);  // T = float, S = int
+template CUDA_LIB_EXPORT int CalUnique<half, int>(const half *input, int num_elements, int *input_index,
+                                                  int *sorted_index, half *output, int *index,
+                                                  cudaStream_t cuda_stream);  // T = half, S = int
+template CUDA_LIB_EXPORT int CalUnique<int, int>(const int *input, int num_elements, int *input_index,  // T = S = int
+                                                 int *sorted_index, int *output, int *index, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalUnique<int64_t, int64_t>(const int64_t *input, int num_elements, int64_t *input_index,
+                                                         int64_t *sorted_index, int64_t *output, int64_t *index,
+                                                         cudaStream_t cuda_stream);  // T = S = int64_t
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh
similarity index 57%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh
index 2ae09fc58dd..f1cf917f079 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh
@@ -14,12 +14,10 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* input_addr_1,
-                  T* output_addr_0, T* output_addr_1, float* ws_addr_0, float* ws_addr_1, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+template <typename T, typename S>  // T: type of input/output; S: type of input_index, sorted_index and index
+CUDA_LIB_EXPORT int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index, T *output, S *index,
+                              cudaStream_t cuda_stream);  // returns the output_size computed in unique_impl.cu
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu
new file mode 100755
index 00000000000..9cd1baa03d6
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh"
+#include "include/cuda_fp16.h"
+// Scatters input[pos] to outputs[(pos / dims_after_axis) % output_num] for every pos in [0, size).
+template <typename T>
+__global__ void Unpack(const size_t size, const size_t output_num,
+                       const size_t dims_after_axis, T** outputs, const T* input) {
+  const size_t cycle_len = output_num * dims_after_axis;
+  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;  // widened so the product cannot wrap in 32 bits
+  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < size; pos += stride) {
+    const size_t cur_input_index = pos / dims_after_axis % output_num;
+    // cycle_len is a multiple of dims_after_axis, so the extra `% cycle_len` step is not needed.
+    const size_t local_index = pos / cycle_len * dims_after_axis + pos % dims_after_axis;
+    outputs[cur_input_index][local_index] = input[pos];
+  }
+}
+
+// Host launcher for Unpack on cuda_stream; outputs[] is dereferenced inside the kernel.
+template <typename T>
+void UnpackKernel(const size_t size, const size_t output_num, const size_t dims_after_axis, T** outputs,
+                  const T* input, cudaStream_t cuda_stream) {
+  Unpack<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, output_num, dims_after_axis, outputs, input);
+}
+
+template CUDA_LIB_EXPORT void UnpackKernel<int8_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, int8_t** outputs,
+    const int8_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<int16_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, int16_t** outputs,
+    const int16_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<int>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, int** outputs,
+    const int* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<int64_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, int64_t** outputs,
+    const int64_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<uint8_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, uint8_t** outputs,
+    const uint8_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<uint16_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, uint16_t** outputs,
+    const uint16_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<uint32_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, uint32_t** outputs,
+    const uint32_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<uint64_t>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, uint64_t** outputs,
+    const uint64_t* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<half>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, half** outputs,
+    const half* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<float>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, float** outputs,
+    const float* input, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT void UnpackKernel<bool>(
+    const size_t size, const size_t output_num, const size_t dims_after_axis, bool** outputs,
+    const bool* input, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh
similarity index 53%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh
index c4ea6e21026..420f12ff562 100755
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACK_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACK_CUH_
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void UnpackKernel(const size_t size, const size_t output_num,
-                  const size_t dims_after_axis, T** outputs, const T* input,
-                  cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_
+CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num,
+                                  const size_t dims_after_axis, T** outputs, const T* input,
+                                  cudaStream_t cuda_stream);  // defined and instantiated in unpack.cu
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACK_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu
similarity index 52%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu
index e3af209b663..850563e1c2e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh"
 #include <limits>
+#include "include/cuda_fp16.h"
 
 template <typename T, typename S>
 __global__ void UnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments, size_t outer_size,
@@ -71,19 +72,23 @@ void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t n
   return;
 }
 
-template void CalUnsortedSegmentMax<float, int>(const float *input, const int *segment_ids, const int64_t num_segments,
-                                                size_t outer_size, size_t inner_size, float *output,
-                                                cudaStream_t stream);
-template void CalUnsortedSegmentMax<float, int64_t>(const float *input, const int64_t *segment_ids,
-                                                    const int64_t num_segments, size_t outer_size, size_t inner_size,
-                                                    float *output, cudaStream_t stream);
-template void CalUnsortedSegmentMax<half, int>(const half *input, const int *segment_ids, const int64_t num_segments,
-                                               size_t outer_size, size_t inner_size, half *output, cudaStream_t stream);
-template void CalUnsortedSegmentMax<half, int64_t>(const half *input, const int64_t *segment_ids,
-                                                   const int64_t num_segments, size_t outer_size, size_t inner_size,
-                                                   half *output, cudaStream_t stream);
-template void CalUnsortedSegmentMax<int, int>(const int *input, const int *segment_ids, const int64_t num_segments,
-                                              size_t outer_size, size_t inner_size, int *output, cudaStream_t stream);
-template void CalUnsortedSegmentMax<int, int64_t>(const int *input, const int64_t *segment_ids,
-                                                  const int64_t num_segments, size_t outer_size, size_t inner_size,
-                                                  int *output, cudaStream_t stream);
+// Explicit instantiations of CalUnsortedSegmentMax, each tagged CUDA_LIB_EXPORT.
+// T (input/output element) is float, half or int; S (segment_ids element) is int or int64_t.
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<float, int>(
+    const float *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    float *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<float, int64_t>(
+    const float *input, const int64_t *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    float *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<half, int>(
+    const half *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    half *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<half, int64_t>(
+    const half *input, const int64_t *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    half *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<int, int>(
+    const int *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    int *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMax<int, int64_t>(
+    const int *input, const int64_t *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    int *output, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh
similarity index 56%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh
index 4f31f2e3826..86720c235ce 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MAX_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MAX_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 // Setting warp size to sync data across threads
 #define KWARPSIZE 32
 template <typename T, typename S>
-void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments, size_t outer_size,
-                           size_t inner_size, T *output, cudaStream_t stream);
+CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments,
+                                           size_t outer_size, size_t inner_size, T *output, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_MAX_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu
similarity index 71%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu
index ec3141e0397..4e8271fe998 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh"
 #include <limits>
+#include "include/cuda_fp16.h"
 
 template <typename T>
 __device__ __forceinline__ void max_val_init(T *init_val) {
@@ -71,9 +72,12 @@ void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t
   return;
 }
 
-template void CalUnsortedSegmentMin<float>(const float *input, const int *segment_ids, const int64_t num_segments,
-                                           size_t outer_size, size_t inner_size, float *output, cudaStream_t stream);
-template void CalUnsortedSegmentMin<half>(const half *input, const int *segment_ids, const int64_t num_segments,
-                                          size_t outer_size, size_t inner_size, half *output, cudaStream_t stream);
-template void CalUnsortedSegmentMin<int>(const int *input, const int *segment_ids, const int64_t num_segments,
-                                         size_t outer_size, size_t inner_size, int *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMin<float>(
+    const float *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    float *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMin<half>(
+    const half *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    half *output, cudaStream_t stream);
+template CUDA_LIB_EXPORT void CalUnsortedSegmentMin<int>(
+    const int *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, size_t inner_size,
+    int *output, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh
similarity index 55%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh
index 4d8603a6f8c..335147731a4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 // Setting warp size to sync data across threads
 #define KWARPSIZE 32
 template <typename T>
-void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t num_segments, size_t outer_size,
-                           size_t inner_size, T *output, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_MIN_H_
+CUDA_LIB_EXPORT void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t num_segments,
+                                           size_t outer_size, size_t inner_size, T *output, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu
new file mode 100644
index 00000000000..c0ab224eab0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
+
+// Adds input[j][k] to output[ids[j]][k] via MsAtomicAdd, skipping out-of-range ids; size_t indices avoid overflow.
+template<typename T, typename S>
+__global__ void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
+                                   T* input_addr, S* ids_addr, T* output_addr) {
+  const size_t total = input_dim0 * input_dim1;
+  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < total; pos += stride) {
+    const size_t row = pos / input_dim1;
+    const size_t col = pos % input_dim1;
+    const S seg = ids_addr[row];
+    if (seg < 0 || static_cast<size_t>(seg) >= output_dim0) {
+      continue;
+    }
+    MsAtomicAdd(output_addr + static_cast<size_t>(seg) * output_dim1 + col, input_addr[pos]);
+  }
+}
+
+// Host launcher: enqueues the kernel on `stream`; GET_BLOCKS derives the grid size from the int element count.
+template<typename T, typename S>
+void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
+                        T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) {
+  int element_count = input_dim0 * input_dim1;
+  UnsortedSegmentSum<<<GET_BLOCKS(element_count), GET_THREADS, 0, stream>>>(
+    input_dim0, input_dim1, output_dim0, output_dim1, input_addr, ids_addr, output_addr);
+}
+
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<double, int>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, double* input_addr,
+    int* ids_addr, double* output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<double, int64_t>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, double* input_addr,
+    int64_t* ids_addr, double* output_addr, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<float, int>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, float* input_addr,
+    int* ids_addr, float* output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<float, int64_t>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, float* input_addr,
+    int64_t* ids_addr, float* output_addr, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<half, int>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, half* input_addr,
+    int* ids_addr, half* output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<half, int64_t>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, half* input_addr,
+    int64_t* ids_addr, half* output_addr, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<int, int>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, int* input_addr,
+    int* ids_addr, int* output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void UnsortedSegmentSum<int, int64_t>(
+    size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, int* input_addr,
+    int64_t* ids_addr, int* output_addr, cudaStream_t stream);
+
+
+
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh
similarity index 54%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh
index 65a9267ea7e..43a2fe9ba5f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh
@@ -14,14 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_SUM_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_SUM_CUH_
 #include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template<typename T, typename S>
-void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                        T* input_addr, S* ids, T* output_addr, cudaStream_t stream);
+CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
+                                        T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream);
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_SUM_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh
similarity index 98%
rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh
index b5aec5a361a..3dbc20374a8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh
@@ -14,14 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_
-
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_
 #include <cuda_fp16.h>
-
 #include <algorithm>
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 #define kThreadsPerBlock (256)
 #define kBlocksPerGrid(n) ((n + kThreadsPerBlock - 1) / kThreadsPerBlock)
@@ -553,4 +550,4 @@ enum : unsigned { warp_size = 32, log_wap_size = 5 };
 __device__ __forceinline__ unsigned LaneId() { return threadIdx.x & (warp_size - 1); }
 __device__ __forceinline__ unsigned WarpId(const unsigned &tid) { return tid >> log_wap_size; }
 
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu
deleted file mode 100644
index 47f2fe73d85..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cuda_runtime.h>
-#include "depthtospace_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void DepthToSpace(const size_t size, const T *input, const size_t in,
-                             const size_t ic, const size_t ih, const size_t iw,
-                             const size_t on, const size_t oc, const size_t oh,
-                             const size_t ow, const size_t r, T *output) {
-  size_t temp_stride = 0;
-  size_t temp_pos = 0;
-  size_t input_pos = 0;
-  size_t output_pos_array[DEPTHTOSPACE_BUFFER_DIMENSION];
-
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
-       pos += blockDim.x * gridDim.x) {
-    temp_stride = oc * oh * ow;
-    output_pos_array[0] = pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= oc;
-    output_pos_array[1] = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= oh;
-    output_pos_array[2] = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ow;
-    output_pos_array[3] = temp_pos / temp_stride;
-
-    input_pos += output_pos_array[0];
-    input_pos =
-        (input_pos * ic) +
-        (output_pos_array[1] +
-         (r * (output_pos_array[2] % r) + output_pos_array[3] % r) * oc);
-    input_pos = (input_pos * ih) + (output_pos_array[2] / r);
-    input_pos = (input_pos * iw) + (output_pos_array[3] / r);
-
-    output[pos] = input[input_pos];
-    input_pos = 0;
-  }
-  return;
-}
-
-template <typename T>
-void CalDepthToSpace(const size_t size, const T *input, const size_t in,
-                     const size_t ic, const size_t ih, const size_t iw,
-                     const size_t on, const size_t oc, const size_t oh,
-                     const size_t ow, const size_t r, T *output,
-                     cudaStream_t cuda_stream) {
-  DepthToSpace<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-      size, input, in, ic, ih, iw, on, oc, oh, ow, r, output);
-  return;
-}
-
-template void CalDepthToSpace<float>(const size_t size, const float *input,
-                                     const size_t in, const size_t ic,
-                                     const size_t ih, const size_t iw,
-                                     const size_t on, const size_t oc,
-                                     const size_t oh, const size_t ow,
-                                     const size_t r, float *output,
-                                     cudaStream_t cuda_stream);
-template void CalDepthToSpace<half>(const size_t size, const half *input,
-                                    const size_t in, const size_t ic,
-                                    const size_t ih, const size_t iw,
-                                    const size_t on, const size_t oc,
-                                    const size_t oh, const size_t ow,
-                                    const size_t r, half *output,
-                                    cudaStream_t cuda_stream);
-template void CalDepthToSpace<int>(const size_t size, const int *input,
-                                   const size_t in, const size_t ic,
-                                   const size_t ih, const size_t iw,
-                                   const size_t on, const size_t oc,
-                                   const size_t oh, const size_t ow,
-                                   const size_t r, int *output,
-                                   cudaStream_t cuda_stream);
-template void CalDepthToSpace<int64_t>(const size_t size, const int64_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, int64_t *output,
-                                       cudaStream_t cuda_stream);
-template void CalDepthToSpace<int16_t>(const size_t size, const int16_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, int16_t *output,
-                                       cudaStream_t cuda_stream);
-template void CalDepthToSpace<int8_t>(const size_t size, const int8_t *input,
-                                      const size_t in, const size_t ic,
-                                      const size_t ih, const size_t iw,
-                                      const size_t on, const size_t oc,
-                                      const size_t oh, const size_t ow,
-                                      const size_t r, int8_t *output,
-                                      cudaStream_t cuda_stream);
-template void CalDepthToSpace<uint8_t>(const size_t size, const uint8_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, uint8_t *output,
-                                       cudaStream_t cuda_stream);
-template void
-CalDepthToSpace<uint16_t>(const size_t size, const uint16_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint16_t *output, cudaStream_t cuda_stream);
-template void
-CalDepthToSpace<uint32_t>(const size_t size, const uint32_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint32_t *output, cudaStream_t cuda_stream);
-template void
-CalDepthToSpace<uint64_t>(const size_t size, const uint64_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh
deleted file mode 100644
index 289ecc3673a..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_
-
-#define DEPTHTOSPACE_BUFFER_DIMENSION 4
-template <typename T>
-void CalDepthToSpace(const size_t size, const T *input, const size_t in,
-                     const size_t ic, const size_t ih, const size_t iw,
-                     const size_t on, const size_t oc, const size_t oh,
-                     const size_t ow, const size_t r, T *output,
-                     cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh
deleted file mode 100644
index 57d5ce552f7..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count, float keep_prob,
-                    cudaStream_t cuda_stream);
-template <typename T>
-void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float keep_prob, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu
deleted file mode 100644
index b292abe035a..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "dynamic_range_impl.cuh"
-
-#include <cuda_runtime.h>
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void ValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
-                                           int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                                           const int64_t max_output_size) {
-  T start = range_start[0];
-  T end = range_end[0];
-  T delta = range_delta[0];
-  *error_code = DynamicRangeErrorCode::kOk;
-
-  if (delta == 0) {
-    *error_code = DynamicRangeErrorCode::kDeltaIsZero;
-    return;
-  }
-
-  if (start < end && delta < 0) {
-    *error_code = DynamicRangeErrorCode::kInvalidNegativeDelta;
-    return;
-  }
-
-  if (start > end && delta > 0) {
-    *error_code = DynamicRangeErrorCode::kInvalidPositiveDelta;
-    return;
-  }
-
-  if (*error_code == DynamicRangeErrorCode::kOk) {
-    int64_t real_output_shape = static_cast<int64_t>(ceil(static_cast<double>(end - start) / delta));
-
-    // verification in case of precision error during calculation of real_output_shape. one multiplication followed by
-    // one addition is much more precise than the division that occurs when calculating real_output_shape.
-    double last_value = start + (delta * (real_output_shape - 1));
-    double epsilon = 1e-6;
-    if ((end > start && last_value > end) || (start > end && last_value < end) || fabsf(last_value - end) < epsilon) {
-      real_output_shape--;
-    }
-
-    if (real_output_shape > max_output_size) {
-        *error_code = DynamicRangeErrorCode::kMaxSizeExceeded;
-    }
-    *output_shape = real_output_shape;
-  }
-}
-
-template <typename T>
-__global__ void Range(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape,
-                      const int64_t max_output_size) {
-  T start = range_start[0];
-  T delta = range_delta[0];
-
-  size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; gt_id < *output_shape; gt_id += blockDim.x * gridDim.x) {
-    output[gt_id] = gt_id * delta + start;
-  }
-}
-
-template <typename T>
-void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
-                                    int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                                    const int64_t max_output_size, cudaStream_t cuda_stream) {
-  ValidateInputAndInferShape<<<1, 1, 0, cuda_stream>>>(range_start, range_end, range_delta, output_shape, error_code,
-                                                       max_output_size);
-}
-
-template <typename T>
-void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape,
-              DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream) {
-  Range<<<GET_BLOCKS(max_output_size), GET_THREADS, 0, cuda_stream>>>(range_start, range_end, range_delta,
-                                                                             output, output_shape, max_output_size);
-}
-
-template void CudaValidateInputAndInferShape<int>(const int *range_start, const int *range_end, const int *range_delta,
-                                                  int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                                                  const int64_t max_output_size, cudaStream_t cuda_stream);
-template void CudaValidateInputAndInferShape<int64_t>(const int64_t *range_start, const int64_t *range_end,
-                                                      const int64_t *range_delta, int64_t *output_shape,
-                                                      DynamicRangeErrorCode *error_code, const int64_t max_output_size,
-                                                      cudaStream_t cuda_stream);
-template void CudaValidateInputAndInferShape<float>(const float *range_start, const float *range_end,
-                                                    const float *range_delta, int64_t *output_shape,
-                                                    DynamicRangeErrorCode *error_code, const int64_t max_output_size,
-                                                    cudaStream_t cuda_stream);
-template void CudaValidateInputAndInferShape<double>(const double *range_start, const double *range_end,
-                                                     const double *range_delta, int64_t *output_shape,
-                                                     DynamicRangeErrorCode *error_code, const int64_t max_output_size,
-                                                     cudaStream_t cuda_stream);
-
-template void CalRange<int>(const int *range_start, const int *range_end, const int *range_delta, int *output,
-                            int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size,
-                            cudaStream_t cuda_stream);
-template void CalRange<int64_t>(const int64_t *range_start, const int64_t *range_end, const int64_t *range_delta,
-                                int64_t *output, int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                                const int64_t max_output_size, cudaStream_t cuda_stream);
-template void CalRange<float>(const float *range_start, const float *range_end, const float *range_delta, float *output,
-                              int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size,
-                              cudaStream_t cuda_stream);
-template void CalRange<double>(const double *range_start, const double *range_end, const double *range_delta,
-                               double *output, int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                               const int64_t max_output_size, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh
deleted file mode 100644
index 535e3443031..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_
-
-#include <cuda_runtime.h>
-
-enum class DynamicRangeErrorCode {
-  kOk = 0,
-  kDeltaIsZero,
-  kInvalidPositiveDelta,
-  kInvalidNegativeDelta,
-  kMaxSizeExceeded
-};
-
-template <typename T>
-void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta,
-                                    int64_t *output_shape, DynamicRangeErrorCode *error_code,
-                                    const int64_t max_output_size, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape,
-              DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh
deleted file mode 100644
index f0876f283eb..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Copyright 2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#define EINSUM_MAX_DIMENSION 20
-template <typename T>
-struct DynamicSharedMem;
-template <>
-struct DynamicSharedMem<double> {
-  __device__ double *addr() {
-    extern __shared__ double addr_double[];
-    return addr_double;
-  }
-};
-template <>
-struct DynamicSharedMem<float> {
-  __device__ float *addr() {
-    extern __shared__ float addr_float[];
-    return addr_float;
-  }
-};
-template <>
-struct DynamicSharedMem<half> {
-  __device__ half *addr() {
-    extern __shared__ half addr_half[];
-    return addr_half;
-  }
-};
-template <typename T>
-void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, const size_t shape_size,
-                 const size_t left_dim, const size_t right_dim, T *output, cudaStream_t cuda_stream);
-template <typename T>
-void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_shape, const size_t shape_size,
-                     const size_t left_dim, const size_t right_dim, T *d_inp, cudaStream_t cuda_stream);
-template <typename T>
-void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStream_t cuda_stream);
-template <typename T>
-void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a, cudaStream_t cuda_stream);
-template <typename T>
-void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, const size_t lft_num,
-            const size_t *rht_shape, const size_t rht_num, const size_t *out_shape, const size_t out_num, const T *x0,
-            const T *x1, T *y, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu
deleted file mode 100644
index 740716adcba..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void SubOffset(T *indices, size_t size, int64_t offset) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    indices[pos] -= static_cast<T>(offset);
-  }
-  return;
-}
-
-template <typename T, typename S>
-void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
-                        size_t input_dim1, int64_t offset, cudaStream_t stream) {
-  size_t size = output_dim0 * output_dim1 * output_dim2;
-  SubOffset<<<GET_BLOCKS(output_dim1), GET_THREADS, 0, stream>>>(indices, output_dim1, offset);
-  GatherV2Kernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
-                                                               output_dim2, input_dim1);
-  // restore indices
-  SubOffset<<<GET_BLOCKS(output_dim1), GET_THREADS, 0, stream>>>(indices, output_dim1, -offset);
-  return;
-}
-
-template void CalEmbeddingLookup<float, int>(float *input, int *indices, float *output, size_t output_dim0,
-                                             size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                             cudaStream_t stream);
-template void CalEmbeddingLookup<float, int64_t>(float *input, int64_t *indices, float *output, size_t output_dim0,
-                                                 size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                                 int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<half, int>(half *input, int *indices, half *output, size_t output_dim0,
-                                            size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                            cudaStream_t stream);
-template void CalEmbeddingLookup<half, int64_t>(half *input, int64_t *indices, half *output, size_t output_dim0,
-                                                size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                                int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<double, int>(double *input, int *indices, double *output, size_t output_dim0,
-                                              size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                              cudaStream_t stream);
-template void CalEmbeddingLookup<double, int64_t>(double *input, int64_t *indices, double *output, size_t output_dim0,
-                                                  size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                                  int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<int, int>(int *input, int *indices, int *output, size_t output_dim0,
-                                           size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                           cudaStream_t stream);
-template void CalEmbeddingLookup<int, int64_t>(int *input, int64_t *indices, int *output, size_t output_dim0,
-                                               size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                               int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<int16_t, int>(int16_t *input, int *indices, int16_t *output, size_t output_dim0,
-                                               size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                               int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<int16_t, int64_t>(int16_t *input, int64_t *indices, int16_t *output,
-                                                   size_t output_dim0, size_t output_dim1, size_t output_dim2,
-                                                   size_t input_dim1, int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<int8_t, int>(int8_t *input, int *indices, int8_t *output, size_t output_dim0,
-                                              size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                              cudaStream_t stream);
-template void CalEmbeddingLookup<int8_t, int64_t>(int8_t *input, int64_t *indices, int8_t *output, size_t output_dim0,
-                                                  size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                                  int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<uint8_t, int>(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0,
-                                               size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                               int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<uint8_t, int64_t>(uint8_t *input, int64_t *indices, uint8_t *output,
-                                                   size_t output_dim0, size_t output_dim1, size_t output_dim2,
-                                                   size_t input_dim1, int64_t offset, cudaStream_t stream);
-template void CalEmbeddingLookup<bool, int>(bool *input, int *indices, bool *output, size_t output_dim0,
-                                            size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset,
-                                            cudaStream_t stream);
-template void CalEmbeddingLookup<bool, int64_t>(bool *input, int64_t *indices, bool *output, size_t output_dim0,
-                                                size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                                int64_t offset, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu
deleted file mode 100644
index ae843420306..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh"
-
-template <typename T>
-__global__ void ExtractImagePatches(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row,
-                                    int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride,
-                                    int64_t patch_stride, int64_t other_stride, int64_t input_row_size,
-                                    int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left,
-                                    int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride,
-                                    int64_t output_depth, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) {
-    const int64_t batch_index = need_batch ? (static_cast<int64_t>(pos) / other_stride) : 0;
-    const int64_t inner_index =
-      need_batch ? (static_cast<int64_t>(pos) - batch_index * other_stride) : static_cast<int64_t>(pos);
-    // inner index
-    const int64_t patch_index = inner_index / patch_stride;
-    const int64_t patch_offset = (inner_index - patch_index * patch_stride) / output_depth;
-    // row
-    const int64_t row_index = patch_index / output_cols;
-    const int64_t row_offset = patch_offset / row_stride;
-    const int64_t input_row = row_index * stride_row + row_offset * rate_row - row_padding_top;
-    if (input_row < 0 || input_row >= input_row_size) {
-      output[pos] = static_cast<T>(0);
-      continue;
-    }
-    // col
-    const int64_t col_index = patch_index - row_index * output_cols;
-    const int64_t col_offset = patch_offset - row_offset * row_stride;
-    const int64_t input_col = col_index * stride_col + col_offset * rate_col - col_padding_left;
-    if (input_col < 0 || input_col >= input_col_size) {
-      output[pos] = static_cast<T>(0);
-      continue;
-    }
-    // depth
-    const int64_t depth = inner_index - (inner_index / output_depth) * output_depth;
-    // input index
-    const int64_t input_index =
-      depth + input_col * col_input_stride + input_row * row_input_stride + batch_index * patch_input_stride;
-    output[pos] = input[static_cast<size_t>(input_index)];
-  }
-  return;
-}
-
-template <typename T>
-void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row,
-                                int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride,
-                                int64_t patch_stride, int64_t other_stride, int64_t input_row_size,
-                                int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left,
-                                int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride,
-                                int64_t output_depth, const T *input, T *output, cudaStream_t stream) {
-  ExtractImagePatches<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    output_size, stride_row, stride_col, rate_row, rate_col, output_cols, need_batch, row_stride, patch_stride,
-    other_stride, input_row_size, input_col_size, row_padding_top, col_padding_left, col_input_stride, row_input_stride,
-    patch_input_stride, output_depth, input, output);
-}
-
-template void CalExtractImagePatchesNHWC<int>(size_t output_size, int64_t stride_row, int64_t stride_col,
-                                              int64_t rate_row, int64_t rate_col, int64_t output_cols, bool need_batch,
-                                              int64_t row_stride, int64_t patch_stride, int64_t other_stride,
-                                              int64_t input_row_size, int64_t input_col_size, int64_t row_padding_top,
-                                              int64_t col_padding_left, int64_t col_input_stride,
-                                              int64_t row_input_stride, int64_t patch_input_stride,
-                                              int64_t output_depth, const int *input, int *output, cudaStream_t stream);
-template void CalExtractImagePatchesNHWC<float>(size_t output_size, int64_t stride_row, int64_t stride_col,
-                                                int64_t rate_row, int64_t rate_col, int64_t output_cols,
-                                                bool need_batch, int64_t row_stride, int64_t patch_stride,
-                                                int64_t other_stride, int64_t input_row_size, int64_t input_col_size,
-                                                int64_t row_padding_top, int64_t col_padding_left,
-                                                int64_t col_input_stride, int64_t row_input_stride,
-                                                int64_t patch_input_stride, int64_t output_depth, const float *input,
-                                                float *output, cudaStream_t stream);
-template void CalExtractImagePatchesNHWC<half>(size_t output_size, int64_t stride_row, int64_t stride_col,
-                                               int64_t rate_row, int64_t rate_col, int64_t output_cols, bool need_batch,
-                                               int64_t row_stride, int64_t patch_stride, int64_t other_stride,
-                                               int64_t input_row_size, int64_t input_col_size, int64_t row_padding_top,
-                                               int64_t col_padding_left, int64_t col_input_stride,
-                                               int64_t row_input_stride, int64_t patch_input_stride,
-                                               int64_t output_depth, const half *input, half *output,
-                                               cudaStream_t stream);
-template void CalExtractImagePatchesNHWC<double>(size_t output_size, int64_t stride_row, int64_t stride_col,
-                                                 int64_t rate_row, int64_t rate_col, int64_t output_cols,
-                                                 bool need_batch, int64_t row_stride, int64_t patch_stride,
-                                                 int64_t other_stride, int64_t input_row_size, int64_t input_col_size,
-                                                 int64_t row_padding_top, int64_t col_padding_left,
-                                                 int64_t col_input_stride, int64_t row_input_stride,
-                                                 int64_t patch_input_stride, int64_t output_depth, const double *input,
-                                                 double *output, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh
deleted file mode 100644
index baaf80b611a..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
-
-#include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row,
-                                int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride,
-                                int64_t patch_stride, int64_t other_stride, int64_t input_row_size,
-                                int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left,
-                                int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride,
-                                int64_t output_depth, const T *input, T *output, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh
deleted file mode 100644
index 432456116a3..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha, float *input_quant_max,
-                           float *input_div_alpha, float *input_quant, const bool neg_trunc, const int channel_num,
-                           cudaStream_t cuda_stream);
-
-void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, float *input_quant,
-                                        const int channel_num, cudaStream_t cuda_stream);
-
-void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
-                                            const float *input_div_alpha, const float *input_quant,
-                                            const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh
deleted file mode 100644
index 26ca59bddee..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max,
-                         float *input_div_alpha, float *input_quant, const bool neg_trunc, cudaStream_t cuda_stream);
-
-void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, float *input_quant,
-                                      cudaStream_t cuda_stream);
-
-void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
-                                          const float *input_div_alpha, const float *input_quant, const bool neg_trunc,
-                                          cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh
deleted file mode 100644
index 36ca41adc91..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min, const float quant_max,
-                        float *nudge_min, float *nudge_max, float *scale, const int channel_num, const bool symmetric,
-                        cudaStream_t cuda_stream);
-
-void CalFakeQuantPerChannel(const float *input, float *output, const int total_num, const int channel_num,
-                            const float *nudge_min, const float *nudge_max, const float *scale,
-                            cudaStream_t cuda_stream);
-
-void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, const int total_num,
-                                const int channel_num, const float *nudge_min, const float *nudge_max,
-                                cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh
deleted file mode 100644
index 7884c3130b6..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max,
-                      float *nudge_min, float *nudge_max, float *scale, const bool symmetric, cudaStream_t cuda_stream);
-
-void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min,
-                          const float *nudge_max, const float *scale, cudaStream_t cuda_stream);
-
-void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size,
-                              const float *nudge_min, const float *nudge_max, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh
deleted file mode 100644
index bc078cbc681..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalFloatStatus(const size_t size, const T *input, float *output, cudaStream_t stream);
-template <typename T>
-void CalIsNan(const size_t size, const T *input, bool *output, cudaStream_t stream);
-template <typename T>
-void CalIsInf(const size_t size, const T *input, bool *output, cudaStream_t stream);
-template <typename T>
-void CalIsFinite(const size_t size, const T *input, bool *output, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu
deleted file mode 100755
index 3b6e5dd163f..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/gather.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S>
-__global__ void GatherKernel(const T *input, const S *index, T *output, const size_t dim_before_axis,
-                             const size_t dim_at_axis_input, const size_t dim_at_axis_output,
-                             const size_t dim_after_axis) {
-  size_t num = dim_before_axis * dim_at_axis_output * dim_after_axis;
-  size_t i, k;
-  for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num;
-       id += blockDim.x * gridDim.x) {
-    i = id / (dim_at_axis_output * dim_after_axis);
-    k = id % dim_after_axis;
-
-    S j = index[id];
-    if (j < 0) {
-        j += static_cast<S>(dim_at_axis_input);
-    }
-    CUDA_KERNEL_ASSERT(j >= 0);
-    size_t j_read = static_cast<size_t>(j);
-    CUDA_KERNEL_ASSERT(j_read < dim_at_axis_input);
-    size_t read_id = i * dim_at_axis_input * dim_after_axis + j_read * dim_after_axis + k;
-    output[id] = input[read_id];
-  }
-  return;
-}
-template <typename T, typename S>
-void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis,
-            const size_t dim_at_axis_input, const size_t dim_at_axis_output,
-            const size_t dim_after_axis, cudaStream_t stream) {
-  size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis;
-  GatherKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, index, output, dim_before_axis, dim_at_axis_input,
-                                                             dim_at_axis_output, dim_after_axis);
-  return;
-}
-
-template void Gather<double, int>(const double *input, const int *index, double *output,
-                                 const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                 const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                 cudaStream_t stream);
-template void Gather<double, int64_t>(const double *input, const int64_t *index, double *output,
-                                     const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                     cudaStream_t stream);
-template void Gather<float, int>(const float *input, const int *index, float *output,
-                                 const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                 const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                 cudaStream_t stream);
-template void Gather<float, int64_t>(const float *input, const int64_t *index, float *output,
-                                     const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                     cudaStream_t stream);
-template void Gather<half, int>(const half *input, const int *index, half *output,
-                                const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                cudaStream_t stream);
-template void Gather<half, int64_t>(const half *input, const int64_t *index, half *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void Gather<int64_t, int>(const int64_t *input, const int *index, int64_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<int64_t, int64_t>(const int64_t *input, const int64_t *index, int64_t *output,
-                                       const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                       cudaStream_t stream);
-template void Gather<int, int>(const int *input, const int *index, int *output,
-                                const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                cudaStream_t stream);
-template void Gather<int, int64_t>(const int *input, const int64_t *index, int *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void Gather<int16_t, int>(const int16_t *input, const int *index, int16_t *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<int16_t, int64_t>(const int16_t *input, const int64_t *index, int16_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<int8_t, int>(const int8_t *input, const int *index, int8_t *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<int8_t, int64_t>(const int8_t *input, const int64_t *index, int8_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<unsigned char, int>(const unsigned char *input, const int *index, unsigned char *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<unsigned char, int64_t>(const unsigned char *input, const int64_t *index, unsigned char *output,
-                                       const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                       const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                       cudaStream_t stream);
-template void Gather<bool, int>(const bool *input, const int *index, bool *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<bool, int64_t>(const bool *input, const int64_t *index, bool *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<uint16_t, int>(const uint16_t *input, const int *index, uint16_t *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<uint16_t, int64_t>(const uint16_t *input, const int64_t *index, uint16_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<uint32_t, int>(const uint32_t *input, const int *index, uint32_t *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<uint32_t, int64_t>(const uint32_t *input, const int64_t *index, uint32_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
-template void Gather<uint64_t, int>(const uint64_t *input, const int *index, uint64_t *output,
-                               const size_t dim_before_axis, const size_t dim_at_axis_input,
-                               const size_t dim_at_axis_output, const size_t dim_after_axis,
-                               cudaStream_t stream);
-template void Gather<uint64_t, int64_t>(const uint64_t *input, const int64_t *index, uint64_t *output,
-                                   const size_t dim_before_axis, const size_t dim_at_axis_input,
-                                   const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                   cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu
deleted file mode 100755
index 7e0136caa49..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void GatherGradKernel(const size_t num, const T *index, const S *grad, S *output,
-                                 const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                 const size_t dim_at_axis_output, const size_t dim_after_axis) {
-  size_t i, k;
-
-  for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num;
-       id += blockDim.x * gridDim.x) {
-    i = id / (dim_at_axis_index * dim_after_axis);
-    k = id % dim_after_axis;
-
-    T j = index[id];
-    if (j < 0) {
-        j += static_cast<T>(dim_at_axis_output);
-    }
-    CUDA_KERNEL_ASSERT(j >= 0);
-    size_t j_read = static_cast<size_t>(j);
-    CUDA_KERNEL_ASSERT(j_read < dim_at_axis_output);
-    size_t read_id = i * dim_at_axis_output * dim_after_axis + j_read * dim_after_axis + k;
-    MsAtomicAdd(output + read_id, grad[id]);
-  }
-  return;
-}
-
-template <typename S>
-__global__ void InitOutput(const size_t size, S *output) {
-    S zero = 0;
-    for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) {
-        output[id] = zero;
-    }
-    return;
-}
-
-template <typename T, typename S>
-void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis,
-                const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis,
-                cudaStream_t stream) {
-  size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis;
-  InitOutput<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(size, output);
-
-  size = dim_before_axis * dim_at_axis_index * dim_after_axis;
-  GatherGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(size, index, grad, output,
-                                                                 dim_before_axis, dim_at_axis_index,
-                                                                 dim_at_axis_output, dim_after_axis);
-  return;
-}
-
-template void GatherGrad<int, double>(const int *index, const double *grad, double *output,
-                                     const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                     cudaStream_t stream);
-template void GatherGrad<int64_t, double>(const int64_t *index, const double *grad, double *output,
-                                          const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                          const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                          cudaStream_t stream);
-template void GatherGrad<int, float>(const int *index, const float *grad, float *output,
-                                     const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                     const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                     cudaStream_t stream);
-template void GatherGrad<int64_t, float>(const int64_t *index, const float *grad, float *output,
-                                         const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                         const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                         cudaStream_t stream);
-template void GatherGrad<int, half>(const int *index, const half *grad, half *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, half>(const int64_t *index, const half *grad, half *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, int>(const int *index, const int *grad, int *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, int>(const int64_t *index, const int *grad, int *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, int8_t>(const int *index, const int8_t *grad, int8_t *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, int8_t>(const int64_t *index, const int8_t *grad, int8_t *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, int16_t>(const int *index, const int16_t *grad, int16_t *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, int16_t>(const int64_t *index, const int16_t *grad, int16_t *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, int64_t>(const int *index, const int64_t *grad, int64_t *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, int64_t>(const int64_t *index, const int64_t *grad, int64_t *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, unsigned char>(const int *index, const unsigned char *grad, unsigned char *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, unsigned char>(const int64_t *index, const unsigned char *grad, unsigned char *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-template void GatherGrad<int, unsigned int>(const int *index, const unsigned int *grad, unsigned int *output,
-                                             const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                             const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                             cudaStream_t stream);
-template void GatherGrad<int64_t, unsigned int>(const int64_t *index, const unsigned int *grad, unsigned int *output,
-                                                 const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                                 const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                                 cudaStream_t stream);
-template void GatherGrad<int, bool>(const int *index, const bool *grad, bool *output,
-                                    const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                    const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                    cudaStream_t stream);
-template void GatherGrad<int64_t, bool>(const int64_t *index, const bool *grad, bool *output,
-                                        const size_t dim_before_axis, const size_t dim_at_axis_index,
-                                        const size_t dim_at_axis_output, const size_t dim_after_axis,
-                                        cudaStream_t stream);
-
-
-
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu
deleted file mode 100644
index ff9f7033567..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S>
-__global__ void GatherNdKernel(T *input, S *indices, T *output, const size_t output_dim0, const size_t output_dim1,
-                               const size_t indices_dim1, S *batch_indices, S *batch_strides) {
-  int num = output_dim0 * output_dim1;
-  int i, j;
-  for (int write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num;
-       write_index += blockDim.x * gridDim.x) {
-    i = write_index / output_dim1 % output_dim0;
-    j = write_index % output_dim1;
-
-    bool out_of_bound = false;
-    int read_index = 0;
-    int indices_i = 0;
-    for (size_t k = 0; k < indices_dim1; k++) {
-      size_t ind = indices_dim1 * i + k;
-      indices_i = indices[ind];
-      out_of_bound |= !(indices_i < batch_indices[k]);
-      read_index += indices_i * batch_strides[k];
-    }
-    read_index += j;
-
-    if (!out_of_bound) {
-      output[write_index] = input[read_index];
-    } else {
-      output[write_index] = 0;
-    }
-  }
-  return;
-}
-template <typename T, typename S>
-void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
-              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream) {
-  int size = output_dim0 * output_dim1;
-  GatherNdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
-                                                               indices_dim1, batch_indices, batch_strides);
-  return;
-}
-
-template void GatherNd<double, int>(double *input, int *indices, double *output, const size_t &output_dim0,
-                                    const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                    int *batch_strides, cudaStream_t stream);
-template void GatherNd<float, int>(float *input, int *indices, float *output, const size_t &output_dim0,
-                                   const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                   int *batch_strides, cudaStream_t stream);
-template void GatherNd<half, int>(half *input, int *indices, half *output, const size_t &output_dim0,
-                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                  int *batch_strides, cudaStream_t stream);
-template void GatherNd<int, int>(int *input, int *indices, int *output, const size_t &output_dim0,
-                                 const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                 int *batch_strides, cudaStream_t stream);
-template void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0,  // NOLINT
-                                   const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                   int *batch_strides, cudaStream_t stream);
-template void GatherNd<unsigned int, int>(unsigned int *input, int *indices, unsigned int *output,
-                                    const size_t &output_dim0, const size_t &output_dim1,
-                                    const size_t &indices_dim1, int *batch_indices, int *batch_strides,
-                                    cudaStream_t stream);
-template void GatherNd<char, int>(char *input, int *indices, char *output, const size_t &output_dim0,
-                                      const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                      int *batch_strides, cudaStream_t stream);
-template void GatherNd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *output,
-                                           const size_t &output_dim0, const size_t &output_dim1,
-                                           const size_t &indices_dim1, int *batch_indices, int *batch_strides,
-                                           cudaStream_t stream);
-template void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0,
-                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
-                                  int *batch_strides, cudaStream_t stream);
-template void GatherNd<double, int64_t>(double *input, int64_t *indices, double *output, const size_t &output_dim0,
-                                        const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                        int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<float, int64_t>(float *input, int64_t *indices, float *output, const size_t &output_dim0,
-                                       const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                       int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<half, int64_t>(half *input, int64_t *indices, half *output, const size_t &output_dim0,
-                                      const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                      int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<int, int64_t>(int *input, int64_t *indices, int *output, const size_t &output_dim0,
-                                     const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                     int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<short, int64_t>(short *input, int64_t *indices, short *output,  // NOLINT
-                                       const size_t &output_dim0, const size_t &output_dim1, const size_t &indices_dim1,
-                                       int64_t *batch_indices, int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<unsigned int, int64_t>(unsigned int *input, int64_t *indices, unsigned int *output,
-                                        const size_t &output_dim0, const size_t &output_dim1,
-                                        const size_t &indices_dim1, int64_t *batch_indices,
-                                        int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<char, int64_t>(char *input, int64_t *indices, char *output, const size_t &output_dim0,
-                                      const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                      int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<unsigned char, int64_t>(unsigned char *input, int64_t *indices, unsigned char *output,
-                                               const size_t &output_dim0, const size_t &output_dim1,
-                                               const size_t &indices_dim1, int64_t *batch_indices,
-                                               int64_t *batch_strides, cudaStream_t stream);
-template void GatherNd<bool, int64_t>(bool *input, int64_t *indices, bool *output, const size_t &output_dim0,
-                                      const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices,
-                                      int64_t *batch_strides, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu
deleted file mode 100755
index 6e895d5f0f2..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S>
-__global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
-                               size_t output_dim2, size_t input_dim1) {
-  size_t num = output_dim0 * output_dim1 * output_dim2;
-  size_t i, j, k;
-  for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num;
-       write_index += blockDim.x * gridDim.x) {
-    i = write_index / (output_dim1 * output_dim2) % output_dim0;
-    j = write_index / output_dim2 % output_dim1;
-    k = write_index % output_dim2;
-
-    if ((indices[j] >= 0) && (indices[j] < input_dim1)) {
-      size_t read_index = i * input_dim1 * output_dim2 + indices[j] * output_dim2 + k;
-      output[write_index] = input[read_index];
-    } else {
-      output[write_index] = 0;
-    }
-  }
-
-  return;
-}
-template <typename T, typename S>
-void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
-              size_t input_dim1, cudaStream_t stream) {
-  size_t size = output_dim0 * output_dim1 * output_dim2;
-  GatherV2Kernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
-                                                               output_dim2, input_dim1);
-  return;
-}
-
-template void GatherV2<float, int>(float *input, int *indices, float *output, size_t output_dim0, size_t output_dim1,
-                                   size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<float, int64_t>(float *input, int64_t *indices, float *output, size_t output_dim0,
-                                       size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<half, int>(half *input, int *indices, half *output, size_t output_dim0, size_t output_dim1,
-                                  size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<half, int64_t>(half *input, int64_t *indices, half *output, size_t output_dim0,
-                                      size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<double, int>(double *input, int *indices, double *output, size_t output_dim0, size_t output_dim1,
-                                    size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<double, int64_t>(double *input, int64_t *indices, double *output, size_t output_dim0,
-                                        size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<int, int>(int *input, int *indices, int *output, size_t output_dim0, size_t output_dim1,
-                                 size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<int, int64_t>(int *input, int64_t *indices, int *output, size_t output_dim0, size_t output_dim1,
-                                     size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<int16_t, int>(int16_t *input, int *indices, int16_t *output, size_t output_dim0,
-                                     size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<int16_t, int64_t>(int16_t *input, int64_t *indices, int16_t *output, size_t output_dim0,
-                                         size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                         cudaStream_t stream);
-template void GatherV2<int8_t, int>(int8_t *input, int *indices, int8_t *output, size_t output_dim0, size_t output_dim1,
-                                    size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<int8_t, int64_t>(int8_t *input, int64_t *indices, int8_t *output, size_t output_dim0,
-                                        size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<uint32_t, int>(uint32_t *input, int *indices, uint32_t *output, size_t output_dim0,
-                                      size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<uint32_t, int64_t>(uint32_t *input, int64_t *indices, uint32_t *output, size_t output_dim0,
-                                          size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                          cudaStream_t stream);
-template void GatherV2<uint8_t, int>(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0,
-                                     size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<uint8_t, int64_t>(uint8_t *input, int64_t *indices, uint8_t *output, size_t output_dim0,
-                                         size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                         cudaStream_t stream);
-template void GatherV2<bool, int>(bool *input, int *indices, bool *output, size_t output_dim0,
-                                     size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream);
-template void GatherV2<bool, int64_t>(bool *input, int64_t *indices, bool *output, size_t output_dim0,
-                                         size_t output_dim1, size_t output_dim2, size_t input_dim1,
-                                         cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh
deleted file mode 100755
index e13d2dc124b..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_
-#define MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_
-
-template <typename T>
-void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size,
-                   const int hash_dim, cudaStream_t cuda_stream);
-
-template <typename T>
-void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size,
-                  const int hash_dim, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh
deleted file mode 100644
index 22791f91f7e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-void CopyMemDevice2Device(const size_t N, const size_t C,
-                          float *gamma_addr, float *beta_addr, float *runing_mean_addr, float *runnig_variance_addr,
-                          float *ws_gamma, float *ws_beta, float *ws_mean, float *ws_var,
-                          cudaStream_t cuda_stream);
-void ComputeMean(const size_t N, const size_t C, float *dgamma, float *dbeta, const float *ws_dgamma,
-                 const float *ws_dbeta, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh
deleted file mode 100644
index 1f11270e583..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void LayerNormGradGrad(const int& row_dim, const int& col_dim, const int& param_dim, T* global_sum1, T* global_sum2,
-                       const T& epsilon, const T* dy, const T* x, const T* mean, const T* var, const T* gamma,
-                       const T* grad_dx, const T* grad_dg, const T* grad_db, T* d_dy, T* d_x, T* d_gamma,
-                       cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh
deleted file mode 100644
index 0b292600b6c..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* dy,
-                   const T* x, const T* mean, const T* var, const T* gamma, T* dx, T* dg, T* db, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh
deleted file mode 100644
index 7493f70d247..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalLocalResponseNormNHWC(const T *input, const int depth_radius, const float bias, const float alpha,
-  const float beta, const size_t channels, const size_t num_elements, float *scale, T *output,
-  cudaStream_t cuda_stream);
-
-template <typename T>
-void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int depth_radius, const float bias,
-  const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, T *dx,
-  cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh
deleted file mode 100644
index bbb3137e8d7..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH
-
-#include <map>
-#include <string>
-
-enum class ReductionMode { kNone, kMean, kSum };
-
-static std::map<std::string, ReductionMode> kReductionModeMap{
-  {"none", ReductionMode::kNone}, {"mean", ReductionMode::kMean}, {"sum", ReductionMode::kSum}};
-
-template <typename T>
-void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y,
-                            const T *weight, T *loss, T *tmp_loss, cudaStream_t stream);
-template <typename T>
-void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x,
-                                const T *input_y, const T *weight, const T *dloss, T *dx, cudaStream_t stream);
-template <typename T>
-void KLDivLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y, T *loss,
-               T *tmp_loss, cudaStream_t stream);
-template <typename T>
-void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y,
-                   const T *dloss, T *dx, T *dy, cudaStream_t stream);
-template <typename T, typename S>
-void NLLLoss(const int n, const int c, const ReductionMode reduction, const T *input, const int32_t *target,
-             const S *weight, T *loss, S *total_weight, T *tmp_loss, S *tmp_target_weight, cudaStream_t stream);
-template <typename T, typename S>
-void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const T *input, const int32_t *target,
-                 const S *weight, const S *total_weight, const T *dloss, T *dinput, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh
deleted file mode 100644
index f7056fc1885..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width, const size_t dst_width,
-                   const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr,
-                   cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_
-
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh
deleted file mode 100644
index ea49f67e7b1..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, const int64_t l,
-                    const int64_t u, const size_t num_diags, const size_t max_diag_len, const int64_t la,
-                    const int64_t ua, T *padding_value, T *output_addr, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh
deleted file mode 100644
index 19c42579013..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_
-#define MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags,
-                   const int max_diag_len, const int lower_index, const int upper_index,
-                   const bool right_align_super_diagonal, const bool right_align_sub_diagonal,
-                   const bool is_single_diag, const T *diag_addr, T *output_addr, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh
deleted file mode 100644
index a624d03bbaf..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_
-template <typename T, typename S>
-void CalMaxPoolWithArgmaxGrad(const T* dy, const S* index, const int n, const int c, const int xHeight,
-                              const int xWidth, const int dyHeight, const int dyWidth, T* dx, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh
deleted file mode 100644
index 8b088067edc..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_
-template <typename T, typename S>
-void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, const int windowHeight,
-                          const int windowWidth, const int strideHeight, const int strideWidth, const int padTop,
-                          const int padLeft, const int outputHeight, const int outputWidth, T* output, S *index,
-                          cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh
deleted file mode 100644
index bdbc7654c53..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
-                         const int total_num, const int channel_num, const float ema_decay, const bool ema,
-                         cudaStream_t cuda_stream);
-
-void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
-                       const int size, const float ema_decay, const bool ema, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh
deleted file mode 100755
index d85f49fcd96..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-// preset size of paddings
-#define MAX_PADDINGS 4
-#define PADDING_SIZE 2
-
-// define constants for kernel indexing use
-#define BATCH 0 * PADDING_SIZE
-#define CHANNEL 1 * PADDING_SIZE
-#define HEIGHT 2 * PADDING_SIZE
-#define WIDTH 3 * PADDING_SIZE
-#define TOP 0
-#define BOTTOM 1
-#define LEFT 0
-#define RIGHT 1
-
-template <typename T>
-void CalMirrorPad(const size_t size, const T *input, const int old_batch, const int old_channel, const int old_height,
-                  const int old_width, const int padded_height, const int padded_width, int padd_num,
-                  const int64_t *paddings, int mode, T *output, cudaStream_t cuda_stream);
-template <typename T>
-void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, T *dy, T *interim, const int output_batch,
-                      const int output_channel, const int output_height, const int output_width, const int input_height,
-                      const int input_width, const int padd_dim, const int64_t *paddings, int mode, T *dx,
-                      cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh
deleted file mode 100644
index aec380a6c91..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename S, typename G>
-void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
-                            const S *momentum, bool use_nesterov, cudaStream_t cuda_stream);
-template <typename T, typename S>
-void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable, T *accumulation,
-                                   const T *learning_rate, const S *gradient, const T *momentum,
-                                   cudaStream_t cuda_stream);
-template <typename T, typename S>
-void FusedWeightDecayMomentum(const size_t element_num, T *weight_decay, T *variable, T *accumulation,
-                              const T *learning_rate, const S *gradient, const T *momentum, cudaStream_t cuda_stream);
-template <typename T, typename S>
-void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation, const T *learning_rate,
-                        const S *gradient, const T *momentum, cudaStream_t cuda_stream);
-template <typename T, typename S>
-void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *element, T **weight_decay,
-                                          T **scale, T **variable, T **accumulation, T **learning_rate, S **gradient,
-                                          T **momentum, cudaStream_t cuda_stream);
-template <typename T, typename S>
-void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *element, T **scale, T **variable,
-                               T **accumulation, T **learning_rate, S **gradient, T **momentum,
-                               cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh
deleted file mode 100644
index b30b5b43c66..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_
-#include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-void InitRandState(int seed, int num, curandState *state, cudaStream_t stream);
-template <typename T>
-void Multinomial(int row, int col, T *probs, curandState *rand_state, int64_t *num_sample, int *output,
-                 cudaStream_t stream);
-template <typename T>
-void CheckNonNeg(const size_t size, const T *input, T *output, cudaStream_t stream);
-template <typename T>
-void CheckZero(const size_t distributions, const size_t categories, const T *input, T *output, cudaStream_t stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh
deleted file mode 100644
index d3e380b3d3a..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size_,
-             cudaStream_t stream);
-
-template <typename T>
-void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size_,
-                   bool *row_mask, cudaStream_t cuda_stream);
-
-template <typename T>
-void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size_, bool *row_mask,
-            cudaStream_t cuda_stream);
-
-int NmsRoundUpPower2(int v);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu
deleted file mode 100644
index 1c2fe95ce19..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_runtime.h>
-#include "oneslike_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-__global__ void OnesLike(const size_t size, const T* input,  T* output) {
-  int one = 1;
-  T val = static_cast<T>(one);
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    output[pos] = val;
-  }
-  return;
-}
-template <typename T>
-void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream) {
-  OnesLike<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output);
-  return;
-}
-
-template void CalOnesLike<double>(const size_t size, const double* input, double* output, cudaStream_t cuda_stream);
-template void CalOnesLike<float>(const size_t size, const float* input, float* output, cudaStream_t cuda_stream);
-template void CalOnesLike<half>(const size_t size, const half* input, half* output, cudaStream_t cuda_stream);
-template void CalOnesLike<int8_t>(const size_t size, const int8_t* input, int8_t* output, cudaStream_t cuda_stream);
-template void CalOnesLike<int16_t>(const size_t size, const int16_t* input, int16_t* output, cudaStream_t cuda_stream);
-template void CalOnesLike<int32_t>(const size_t size, const int32_t* input, int32_t* output, cudaStream_t cuda_stream);
-template void CalOnesLike<int64_t>(const size_t size, const int64_t* input, int64_t* output, cudaStream_t cuda_stream);
-template void CalOnesLike<uint8_t>(const size_t size, const uint8_t* input, uint8_t* output, cudaStream_t cuda_stream);
-template void CalOnesLike<uint16_t>(const size_t size, const uint16_t* input, uint16_t* output,
-                                    cudaStream_t cuda_stream);
-template void CalOnesLike<uint32_t>(const size_t size, const uint32_t* input, uint32_t* output,
-                                    cudaStream_t cuda_stream);
-template void CalOnesLike<uint64_t>(const size_t size, const uint64_t* input, uint64_t* output,
-                                    cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu
deleted file mode 100755
index d9df908bfdc..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/pack.cuh"
-template <typename T>
-__global__ void Pack(const size_t size, const size_t input_num, const size_t dims_behind_axis, T** inputs, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-      size_t cur_input_index = pos / dims_behind_axis % input_num;
-      size_t cycle_len = input_num * dims_behind_axis;
-      size_t local_index = pos / cycle_len * dims_behind_axis + pos % cycle_len % dims_behind_axis;
-      output[pos] = inputs[cur_input_index][local_index];
-  }
-  return;
-}
-
-template <typename T>
-void PackKernel(const size_t size, const size_t input_num,
-                const size_t dims_behind_axis, T** inputs, T* output,
-                cudaStream_t cuda_stream) {
-  Pack<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, dims_behind_axis, inputs, output);
-  return;
-}
-
-
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, int8_t** inputs, int8_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, int16_t** inputs, int16_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, int** inputs, int* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, int64_t** inputs, int64_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, uint8_t** inputs, uint8_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, uint16_t** inputs, uint16_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, uint32_t** inputs, uint32_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, uint64_t** inputs, uint64_t* output,
-                           cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                         const size_t dims_behind_axis, half** inputs, half* output,
-                         cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                         const size_t dims_behind_axis, float** inputs, float* output,
-                         cudaStream_t cuda_stream);
-template void PackKernel(const size_t size, const size_t input_num,
-                           const size_t dims_behind_axis, bool** inputs, bool* output,
-                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh
deleted file mode 100644
index 647f353013b..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalPad(const size_t size, const T* input, const int num, const int channels, const int old_height,
-            const int old_width, const int padded_height, const int padded_width, const int pad_top, const int pad_left,
-            float pad_value, T* output, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadGrad(const size_t size, const T* dy, const int num, const int channels, const int old_height,
-                const int old_width, const int padded_height, const int padded_width, const int pad_top,
-                const int pad_left, T* dx, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadNHWC(const size_t size, const T* input, const int num, const int old_height, const int old_width,
-             const int channels, const int padded_height, const int padded_width, const int pad_top, const int pad_left,
-            float pad_value, T* output, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadGradNHWC(const size_t size, const T* input, const int num, const int old_height, const int old_width,
-                const int channels, const int padded_height, const int padded_width, const int pad_top,
-                const int pad_left, T* output, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadGeneral(const T *input, T *output, const size_t *input_shape, const size_t *strides,
-                   const int *paddings, const int input_size, const size_t input_rank, cudaStream_t cuda_stream);
-template <typename T>
-void CalPad3d(const size_t size, const T* input, const int num, const int channels, const int old_depth,
-              const int old_height, const int old_width, const int padded_depth, const int padded_height,
-              const int padded_width, const int pad_head, const int pad_top, const int pad_left, const float pad_value,
-              T* output, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadGrad3d(const size_t size, const T* dy, const int num, const int channels, const int old_depth,
-                  const int old_height, const int old_width, const int padded_depth, const int padded_height,
-                  const int padded_width, const int pad_head, const int pad_top, const int pad_left, T* dx,
-                  cudaStream_t cuda_stream);
-template <typename T>
-void CalPadNDHWC(const size_t size, const T *input, const int num, const int old_depth, const int old_height,
-                 const int old_width, const int channels, const int padded_depth, const int padded_height,
-                 const int padded_width, const int pad_head, const int pad_top, const int pad_left,
-                 const float pad_value, T *output, cudaStream_t cuda_stream);
-template <typename T>
-void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int old_depth, const int old_height,
-                     const int old_width, const int channels, const int padded_depth, const int padded_height,
-                     const int padded_width, const int pad_head, const int pad_top, const int pad_left, T *dx,
-                     cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh
deleted file mode 100644
index 9ca45fcfd85..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void PSROIPoolForwardLauncher(
-    const T* input, const T spatial_scale, const int rois_number, const int feature_height,
-    const int feature_width, const int feature_channels, const int pooled_height, const int pooled_width,
-    const T* roi_boxes, const int group_size, const int output_channels, T* output_data,
-    int* mapping_channel, cudaStream_t stream);
-
-template <typename T>
-void PSROIPoolBackwardLauncher(
-    const T* input_diff, const int* mapping_channel, const int batch_size,
-    const int rois_number, const T spatial_scale, const int feature_channels,
-    const int feature_height, const int feature_width, const int pooled_width, const int pooled_height,
-    const int output_channels, T* output_diff, const T* roi_boxes, cudaStream_t stream);
-
-#endif   // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh
deleted file mode 100644
index 9aaa04e22f0..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_size, const size_t num_classes,
-                  cudaStream_t cuda_stream);
-template <typename S>
-void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, double** dev_cdf,
-                             const size_t batch_size, const size_t num_classes, S *output_addr,
-                             cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh
deleted file mode 100644
index f5b1cfd74c5..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
-
-#include <cuda_runtime.h>
-#include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#define BLOCKSIZE 256
-#define MAX_DIMENSION 5
-
-template <typename T, typename S, typename K>
-void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index, K *output_mask,
-                               cudaStream_t stream);
-
-template <typename T, typename S>
-void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, const int &d2,
-                             const int &d3, const int &d4, const int &d5, const int &seedc, const int &count,
-                             const T *input, S *output_index, T *output_mask, S *index_buff, S *mask_buff, S *rank_buff,
-                             S *Tnum_buff, S *tmp_buff, curandState *globalState, cudaStream_t stream);
-
-int RcwmRoundUpPower2(int v);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh
deleted file mode 100644
index 0b558dc01b6..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_
-
-#include <curand_kernel.h>
-#include <random>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void StandardNormal(int seed, int seed2, curandState *globalState,
-                    T *output, size_t count, cudaStream_t cuda_stream);
-template <typename T>
-bool UniformInt(int seed, int seed2, curandState *globalState,
-                T *input1, size_t input_size_1, T *input2, size_t input_size_2,
-                T *output, size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void UniformReal(int seed, int seed2, curandState *globalState,
-                 T *output, size_t count, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh
deleted file mode 100755
index b6aadafd54d..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_
-
-template <typename T>
-void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu
deleted file mode 100644
index c4092a0b88b..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu
+++ /dev/null
@@ -1,97 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) {
-  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0);
-  }
-}
-
-template <typename T>
-void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
-  CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
-}
-
-template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, int64_t *input_addr, int64_t *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, uint8_t *input_addr, uint8_t *output_addr, cudaStream_t cuda_stream);
-
-template <typename T>
-__global__ void ReluV2Kernel(const size_t num, const T *x, T *y, uint32_t *mask) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) {
-    T v = x[i];
-    bool p = v > static_cast<T>(0);
-    y[i] = p ? v : static_cast<T>(0);
-
-    auto warp_predict = BallotSync(p, __activemask());
-    if (LaneId() == 0) {
-      mask[WarpId(i)] = warp_predict;
-    }
-  }
-}
-
-template <typename T>
-void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream) {
-  ReluV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, x, y, mask);
-}
-
-template <typename T>
-__global__ void ReluGradV2Kernel(const size_t num, const T *dy, const uint32_t *mask, T *dx) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) {
-    bool p = mask[WarpId(i)] & (1 << LaneId());
-    dx[i] = p ? dy[i] : static_cast<T>(0);
-  }
-}
-
-template <typename T>
-void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream) {
-  ReluGradV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, dy, mask, dx);
-}
-
-template void ReluV2(const size_t num, const double *x, double *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const float *x, float *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const half *x, half *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const int8_t *x, int8_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const int16_t *x, int16_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const int32_t *x, int32_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const int64_t *x, int64_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const uint8_t *x, uint8_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-
-template void ReluGradV2(const size_t num, const double *dy, const uint32_t *mask, double *dx,
-        cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const float *dy, const uint32_t *mask, float *dx, cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const half *dy, const uint32_t *mask, half *dx, cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const int8_t *dy, const uint32_t *mask, int8_t *dx,
-        cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const int16_t *dy, const uint32_t *mask, int16_t *dx,
-        cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const int32_t *dy, const uint32_t *mask, int32_t *dx,
-        cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *mask, int64_t *dx,
-        cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx,
-        cudaStream_t cuda_stream);
-
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh
deleted file mode 100644
index 4d2a4350232..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalResizeBilinear(const T *input, const int n_, const int c_, const int input_h_, const int input_w_,
-  const int output_h_, const int output_w_, const float h_scale, const float w_scale, T *output,
-  cudaStream_t cuda_stream);
-void CalResizeBilinearGrad(const half *input, const int n_, const int c_, const int input_h_, const int input_w_,
-  const int output_h_, const int output_w_, const float h_scale, const float w_scale, half *output, float *interim,
-  cudaStream_t cuda_stream);
-void CalResizeBilinearGrad(const float *input, const int n_, const int c_, const int input_h_, const int input_w_,
-  const int output_h_, const int output_w_, const float h_scale, const float w_scale, float *output, float *interim,
-  cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh
deleted file mode 100644
index 926b0215f6e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#define RESIZENEARESTNEIGHBORGRAD_DIMENSION 4
-
-template <typename T>
-void CalResizeNearestNeighborGrad(const int input_size, const T *input, const int s1, const int s2, const int s3,
-                                  const int s4, T *output, const int d1, const int d2, const int d3, const int d4,
-                                  bool align_corners, float h_scale, float w_scale, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh
deleted file mode 100644
index fc68a67e117..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#define RESIZENEARESTNEIGHBOR_DIMENSION 4
-
-template <typename T>
-void CalResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3, const int s4,
-                              T *output, const int d1, const int d2, const int d3, const int d4, bool align_corners,
-                              float h_scale, float w_scale, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu
deleted file mode 100644
index 52b98e8c6e0..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu
+++ /dev/null
@@ -1,159 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <assert.h>
-#include <stdio.h>
-#include <stdint.h>
-#include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh"
-
-// Util function to convert a 1D input array index to an N-D positional index
-// Required since GPU iterates over all values in an ND array as a 1D array
-__inline__ __device__ void IdxToPos(size_t idx, size_t *pos, size_t cur_thread_idx, size_t *cum_shape,
-                                    size_t shape_size) {
-  size_t rem_val = idx;
-  for (int i = 0; i < shape_size; i++) {
-    pos[cur_thread_idx + i] = rem_val / cum_shape[i];
-    rem_val = rem_val % cum_shape[i];
-  }
-  return;
-}
-
-// Util function to convert a N-D positonal index to a 1D index
-__inline__ __device__ size_t PosToIdx(size_t *pos, size_t cur_thread_idx, size_t *cum_shape, size_t shape_size) {
-  size_t idx = 0;
-  for (int i = 0; i < shape_size; i++) {
-    idx = idx + (pos[cur_thread_idx + i] * cum_shape[i]);
-  }
-  return idx;
-}
-
-// CumShape takes Shape: (2,2,5) => cumShape (10,5,1) which informs how many values
-// each dimension will represent. Required for converting 1d index to positional vector.
-// In this example 10 in dim 0 means, an increase of 1 in this dim leads to another 10 values
-// in the overall array
-__global__ void ComputeCumShape(const size_t *input_shape_ptr, size_t *input_shape_cum_ptr, size_t shape_size) {
-  int cur_val = 1;
-  for (int i = shape_size - 1; i >= 0; i--) {
-    // iterate list in reverse and cummulatively build shape
-    input_shape_cum_ptr[i] = cur_val;
-    cur_val = cur_val * input_shape_ptr[i];
-  }
-  return;
-}
-template <typename T, typename S>
-__global__ void ReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
-                                const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
-                                size_t *input_shape_cum_ptr, size_t shape_size, T *output) {
-  // calculate which thread this is out of total across all blocks for accessing respective cur_pos_arr memory
-  size_t cur_thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-  cur_thread_idx = cur_thread_idx * shape_size;
-  size_t cur_slice = 0;          // current slice as split by the batch_dim
-  size_t cur_slice_seq_len = 0;  // reverse seq length for this slice as provided by user
-  size_t new_idx = 0;            // calculate corresponding reverse element from input
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
-    IdxToPos(idx, cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size);
-    cur_slice = cur_pos_arr[cur_thread_idx + batch_dim];  // all accesses to cur_pos_arr have to be adjusted per thread
-    cur_slice_seq_len = seq_len[cur_slice];
-    if (cur_slice_seq_len == 0) {  // adjust length to 1 if 0 provided, same result in both cases
-      cur_slice_seq_len = 1;
-    }
-    if (cur_pos_arr[cur_thread_idx + seq_dim] > (cur_slice_seq_len - 1)) {  // check if within range
-      // copy value directly and continue - outside of reversal range
-      output[idx] = input[idx];
-      continue;
-    }
-    // find corresponding reverse element in input
-    cur_pos_arr[cur_thread_idx + seq_dim] =
-      (cur_slice_seq_len - 1) - cur_pos_arr[cur_thread_idx + seq_dim];                 // adjust position to target
-    new_idx = PosToIdx(cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size);  // get the updated index
-    output[idx] = input[new_idx];
-  }
-  return;
-}
-
-template <typename T, typename S>
-void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
-                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
-                        size_t *input_shape_cum_ptr, size_t shape_size, T *output, cudaStream_t cuda_stream) {
-  ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size);
-  ReverseSequence<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output);
-  return;
-}
-
-template void CalReverseSequence<int8_t, int>(const size_t size, const int8_t *input, const int *seq_len,
-                                              const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                              const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                              size_t shape_size, int8_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int8_t, int64_t>(const size_t size, const int8_t *input, const int64_t *seq_len,
-                                                  const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                  const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                  size_t shape_size, int8_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int16_t, int>(const size_t size, const int16_t *input, const int *seq_len,
-                                               const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                               const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                               size_t shape_size, int16_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int16_t, int64_t>(const size_t size, const int16_t *input, const int64_t *seq_len,
-                                                   const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                   const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                   size_t shape_size, int16_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int, int>(const size_t size, const int *input, const int *seq_len,
-                                           const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                           const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                           size_t shape_size, int *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int, int64_t>(const size_t size, const int *input, const int64_t *seq_len,
-                                               const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                               const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                               size_t shape_size, int *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int64_t, int>(const size_t size, const int64_t *input, const int *seq_len,
-                                               const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                               const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                               size_t shape_size, int64_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<int64_t, int64_t>(const size_t size, const int64_t *input, const int64_t *seq_len,
-                                                   const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                   const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                   size_t shape_size, int64_t *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<half, int>(const size_t size, const half *input, const int *seq_len,
-                                            const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                            const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                            size_t shape_size, half *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<half, int64_t>(const size_t size, const half *input, const int64_t *seq_len,
-                                                const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                size_t shape_size, half *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<float, int>(const size_t size, const float *input, const int *seq_len,
-                                             const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                             const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                             size_t shape_size, float *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<float, int64_t>(const size_t size, const float *input, const int64_t *seq_len,
-                                                 const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                 const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                 size_t shape_size, float *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<double, int>(const size_t size, const double *input, const int *seq_len,
-                                              const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                              const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                              size_t shape_size, double *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<double, int64_t>(const size_t size, const double *input, const int64_t *seq_len,
-                                                  const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                  const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                  size_t shape_size, double *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<bool, int>(const size_t size, const bool *input, const int *seq_len,
-                                            const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                            const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                            size_t shape_size, bool *output, cudaStream_t cuda_stream);
-template void CalReverseSequence<bool, int64_t>(const size_t size, const bool *input, const int64_t *seq_len,
-                                                const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr,
-                                                const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr,
-                                                size_t shape_size, bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh
deleted file mode 100644
index 57b940bce21..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
-                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
-                        size_t *intput_shape_cum_ptr, size_t shape_size, T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu
deleted file mode 100644
index 81a9e0aa56b..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cuda_runtime.h>
-#include "reverse_v2_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-__global__ void ReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides,
-                          const int64_t* axis, size_t input_size, size_t axis_size) {
-  for (int64_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; gt_id < input_size; gt_id += blockDim.x * gridDim.x) {
-    int64_t intermediate_index = gt_id;
-    for (size_t i = 0; i < axis_size; i++) {
-      int64_t d = axis[i];
-      int64_t pre_reverse_position = (gt_id / strides[d]) % input_shape[d];
-      int64_t reversed_position = input_shape[d] - pre_reverse_position - 1;
-      intermediate_index += ((reversed_position - pre_reverse_position) * strides[d]);
-    }
-
-    output[intermediate_index] = input[gt_id];
-  }
-  return;
-}
-template <typename T>
-void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis,
-                  size_t input_size, size_t axis_size, cudaStream_t cuda_stream) {
-  ReverseV2<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input, output, input_shape, strides, axis,
-                                                                     input_size, axis_size);
-  return;
-}
-
-template void CalReverseV2<half>(const half* input, half* output, const size_t* input_shape, const int64_t* strides,
-                                 const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream);
-
-template void CalReverseV2<float>(const float* input, float* output, const size_t* input_shape, const int64_t* strides,
-                                  const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream);
-
-template void CalReverseV2<uint8_t>(const uint8_t* input, uint8_t* output, const size_t* input_shape,
-                                    const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size,
-                                    cudaStream_t cuda_stream);
-
-template void CalReverseV2<int16_t>(const int16_t* input, int16_t* output, const size_t* input_shape,
-                                    const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size,
-                                    cudaStream_t cuda_stream);
-
-template void CalReverseV2<int32_t>(const int32_t* input, int32_t* output, const size_t* input_shape,
-                                    const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size,
-                                    cudaStream_t cuda_stream);
-
-template void CalReverseV2<int64_t>(const int64_t* input, int64_t* output, const size_t* input_shape,
-                                    const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size,
-                                    cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh
index 07248a8a7bb..25688b4154a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh
@@ -17,7 +17,7 @@
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch,
                   unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream);
 void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh
deleted file mode 100644
index 40a63ce48de..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, T* mean_square,
-             T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream);
-
-template <typename T>
-void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon, T* variable,
-                   T* mean_gradients, T* mean_square, T* moment, T* gradients, const size_t size,
-                   cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh
deleted file mode 100644
index 5a71e88d273..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_
-template <typename T>
-void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out_data, const T spatial_scale,
-              const int sample_num, int roi_end_mode, const int channels, const int height, const int width,
-              const int pooled_height, const int pooled_width, cudaStream_t cuda_stream);
-
-template <typename T>
-void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows, int roi_cols, T *dx,
-                  const T spatial_scale, const int sample_num, int roi_end_mode, const int channels, const int height,
-                  const int width, const int pooled_height, const int pooled_width, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu
deleted file mode 100644
index efc0a21e836..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh"
-
-template <typename T, typename S>
-__global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *indices,
-                                    const T *updates, T *input) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
-    const size_t index = pos / inner_size;
-    const size_t offset = pos % inner_size;
-    const size_t current_pos = indices[index] * inner_size + offset;
-    input[current_pos] = updates[pos];
-  }
-}
-
-template <typename T, typename S>
-__global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
-                                 T *input) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
-    const size_t index = pos / inner_size;
-    const size_t offset = pos % inner_size;
-    const size_t current_pos = indices[index] * inner_size + offset;
-    MsAtomicAdd(&input[current_pos], updates[pos]);
-  }
-}
-
-template <typename T, typename S>
-__global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
-                                 T *input) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
-    const size_t index = pos / inner_size;
-    const size_t offset = pos % inner_size;
-    const size_t current_pos = indices[index] * inner_size + offset;
-    MsAtomicAdd(&input[current_pos], -updates[pos]);
-  }
-}
-
-template <typename T, typename S>
-void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size,
-                 const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) {
-  const size_t updates_size = inner_size * indices_size;
-  switch (func_type) {
-    case SCATTER_FUNC_UPDATE:
-      return ScatterUpdateKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
-                                                                                            indices, updates, input);
-    case SCATTER_FUNC_ADD:
-      return ScatterAddKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
-                                                                                         indices, updates, input);
-    case SCATTER_FUNC_SUB:
-      return ScatterSubKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
-                                                                                         indices, updates, input);
-    default:
-      break;
-  }
-}
-
-template void ScatterFunc<float, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                      const size_t &indices_size, const int *indices, const float *updates,
-                                      float *input, cudaStream_t cuda_stream);
-template void ScatterFunc<float, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                          const size_t &indices_size, const int64_t *indices, const float *updates,
-                                          float *input, cudaStream_t cuda_stream);
-template void ScatterFunc<half, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                     const size_t &indices_size, const int *indices, const half *updates, half *input,
-                                     cudaStream_t cuda_stream);
-template void ScatterFunc<half, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                         const size_t &indices_size, const int64_t *indices, const half *updates,
-                                         half *input, cudaStream_t cuda_stream);
-template void ScatterFunc<int, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                    const size_t &indices_size, const int *indices, const int *updates, int *input,
-                                    cudaStream_t cuda_stream);
-template void ScatterFunc<int, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                        const size_t &indices_size, const int64_t *indices, const int *updates,
-                                        int *input, cudaStream_t cuda_stream);
-template void ScatterFunc<unsigned char, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                              const size_t &indices_size, const int *indices,
-                                              const unsigned char *updates, unsigned char *input,
-                                              cudaStream_t cuda_stream);
-template void ScatterFunc<unsigned char, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                                  const size_t &indices_size, const int64_t *indices,
-                                                  const unsigned char *updates, unsigned char *input,
-                                                  cudaStream_t cuda_stream);
-template void ScatterFunc<int8_t, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                       const size_t &indices_size, const int *indices, const int8_t *updates,
-                                       int8_t *input, cudaStream_t cuda_stream);
-template void ScatterFunc<int8_t, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
-                                           const size_t &indices_size, const int64_t *indices, const int8_t *updates,
-                                           int8_t *input, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu
deleted file mode 100644
index 1780e97447c..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu
+++ /dev/null
@@ -1,110 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void ScatterNdKernel(S *indices, T *update, T *output, const size_t block_size, const size_t input_size,
-                                const size_t output_size, const size_t indices_dim_0, const size_t indices_dim_1,
-                                S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      MsAtomicAdd(&output[write_index], update[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
-               S *work_shape, cudaStream_t stream) {
-  ScatterNdKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(indices, update, output, block_size, input_size,
-                                                                       output_size, indices_dim_0, indices_dim_1,
-                                                                       indices_stride, work_shape);
-  return;
-}
-
-template void ScatterNd<double, int>(int *indices, double *update, double *output, const size_t &block_size,
-                                     const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                     const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                     cudaStream_t stream);
-template void ScatterNd<double, int64_t>(int64_t *indices, double *update, double *output, const size_t &block_size,
-                                         const size_t &input_size, const size_t &output_size,
-                                         const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                         int64_t *indices_stride, int64_t *work_shape, cudaStream_t stream);
-template void ScatterNd<float, int>(int *indices, float *update, float *output, const size_t &block_size,
-                                    const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                    const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                    cudaStream_t stream);
-template void ScatterNd<float, int64_t>(int64_t *indices, float *update, float *output, const size_t &block_size,
-                                        const size_t &input_size, const size_t &output_size,
-                                        const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                        int64_t *indices_stride, int64_t *work_shape, cudaStream_t stream);
-template void ScatterNd<half, int>(int *indices, half *update, half *output, const size_t &block_size,
-                                   const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                   const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                   cudaStream_t stream);
-template void ScatterNd<half, int64_t>(int64_t *indices, half *update, half *output, const size_t &block_size,
-                                       const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                       const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                       cudaStream_t stream);
-template void ScatterNd<int, int>(int *indices, int *update, int *output, const size_t &block_size,
-                                  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                  const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                  cudaStream_t stream);
-template void ScatterNd<int, int64_t>(int64_t *indices, int *update, int *output, const size_t &block_size,
-                                      const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                      const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                      cudaStream_t stream);
-// NOLINTNEXTLINE
-template void ScatterNd<short, int>(int *indices, short *update, short *output, const size_t &block_size,
-                                    const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                                    const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                    cudaStream_t stream);
-// NOLINTNEXTLINE
-template void ScatterNd<short, int64_t>(int64_t *indices, short *update, short *output, const size_t &block_size,
-                                        const size_t &input_size, const size_t &output_size,
-                                        const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                        int64_t *indices_stride, int64_t *work_shape, cudaStream_t stream);
-template void ScatterNd<unsigned char, int>(int *indices, unsigned char *update, unsigned char *output,
-                                            const size_t &block_size, const size_t &input_size,
-                                            const size_t &output_size, const size_t &indices_dim_0,
-                                            const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                            cudaStream_t stream);
-template void ScatterNd<unsigned char, int64_t>(int64_t *indices, unsigned char *update, unsigned char *output,
-                                                const size_t &block_size, const size_t &input_size,
-                                                const size_t &output_size, const size_t &indices_dim_0,
-                                                const size_t &indices_dim_1, int64_t *indices_stride,
-                                                int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh
deleted file mode 100644
index 8bf142abaf9..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_SCATTER_ND_GPU_CU_H
-#define MINDSPORE_SCATTER_ND_GPU_CU_H
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
-               S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_SCATTER_ND_GPU_CU_H
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu
deleted file mode 100644
index 8aca3518dd2..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu
+++ /dev/null
@@ -1,181 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh"
-
-template <typename T, typename S>
-__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size,
-                                const S *out_strides, const S *indices, const T *updates, T *input) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / unit_size;
-    j = read_index % unit_size;
-
-    for (size_t k = 0; k < index_depth; k++) {
-      S indices_i = indices[i * index_depth + k];
-      out_bound |= indices_i < 0;
-      write_index += indices_i * out_strides[k] * unit_size;
-    }
-
-    write_index += j;
-
-    if (!out_bound) {
-      input[write_index] = updates[read_index];
-    }
-  }
-}
-
-template <typename T, typename S>
-__global__ void ScatterNdAdd(const size_t unit_size, const size_t index_depth, const size_t updates_size,
-                             const S *out_strides, const S *indices, const T *updates, T *input) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / unit_size;
-    j = read_index % unit_size;
-
-    for (size_t k = 0; k < index_depth; k++) {
-      S indices_i = indices[i * index_depth + k];
-      out_bound |= indices_i < 0;
-      write_index += indices_i * out_strides[k] * unit_size;
-    }
-
-    write_index += j;
-
-    if (!out_bound) {
-      MsAtomicAdd(&input[write_index], updates[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-__global__ void ScatterNdSub(const size_t unit_size, const size_t index_depth, const size_t updates_size,
-                             const S *out_strides, const S *indices, const T *updates, T *input) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / unit_size;
-    j = read_index % unit_size;
-
-    for (size_t k = 0; k < index_depth; k++) {
-      S indices_i = indices[i * index_depth + k];
-      out_bound |= indices_i < 0;
-      write_index += indices_i * out_strides[k] * unit_size;
-    }
-
-    write_index += j;
-
-    if (!out_bound) {
-      MsAtomicAdd(&input[write_index], -updates[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units,
-                         const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input,
-                         cudaStream_t cuda_stream) {
-  const size_t updates_size = unit_size * num_units;
-  switch (func_type) {
-    case SCATTER_ND_FUNC_UPDATE:
-      return ScatterNdUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
-        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
-    case SCATTER_ND_FUNC_ADD:
-      return ScatterNdAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
-        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
-    case SCATTER_ND_FUNC_SUB:
-      return ScatterNdSub<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
-        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
-    default:
-      break;
-  }
-}
-
-template void CalScatterNdFunctor<double, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                   const size_t &num_units, const size_t &index_depth,
-                                                   const int64_t *out_strides, const int64_t *indices,
-                                                   const double *updates, double *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<double, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                   const size_t &num_units, const size_t &index_depth,
-                                                   const int32_t *out_strides, const int32_t *indices,
-                                                   const double *updates, double *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<float, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                  const size_t &num_units, const size_t &index_depth,
-                                                  const int64_t *out_strides, const int64_t *indices,
-                                                  const float *updates, float *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<float, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                  const size_t &num_units, const size_t &index_depth,
-                                                  const int32_t *out_strides, const int32_t *indices,
-                                                  const float *updates, float *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<half, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                 const size_t &num_units, const size_t &index_depth,
-                                                 const int64_t *out_strides, const int64_t *indices,
-                                                 const half *updates, half *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<half, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                 const size_t &num_units, const size_t &index_depth,
-                                                 const int32_t *out_strides, const int32_t *indices,
-                                                 const half *updates, half *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int32_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int64_t *out_strides, const int64_t *indices,
-                                                    const int32_t *updates, int32_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int32_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int32_t *out_strides, const int32_t *indices,
-                                                    const int32_t *updates, int32_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int16_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int64_t *out_strides, const int64_t *indices,
-                                                    const int16_t *updates, int16_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int16_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int32_t *out_strides, const int32_t *indices,
-                                                    const int16_t *updates, int16_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<uint8_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int64_t *out_strides, const int64_t *indices,
-                                                    const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<uint8_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                    const size_t &num_units, const size_t &index_depth,
-                                                    const int32_t *out_strides, const int32_t *indices,
-                                                    const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int8_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                   const size_t &num_units, const size_t &index_depth,
-                                                   const int64_t *out_strides, const int64_t *indices,
-                                                   const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<int8_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                   const size_t &num_units, const size_t &index_depth,
-                                                   const int32_t *out_strides, const int32_t *indices,
-                                                   const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<bool, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                 const size_t &num_units, const size_t &index_depth,
-                                                 const int64_t *out_strides, const int64_t *indices,
-                                                 const bool *updates, bool *input, cudaStream_t cuda_stream);
-template void CalScatterNdFunctor<bool, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
-                                                 const size_t &num_units, const size_t &index_depth,
-                                                 const int32_t *out_strides, const int32_t *indices,
-                                                 const bool *updates, bool *input, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu
deleted file mode 100644
index 4f11a9682ea..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <include/cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh"
-
-template <typename T>
-__global__ void Select(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    output[pos] = cond[pos] ? input_x[pos] : input_y[pos];
-  }
-  return;
-}
-
-template <typename T>
-void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output,
-               cudaStream_t cuda_stream) {
-  Select<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, cond, input_x, input_y, output);
-  return;
-}
-
-template void CalSelect<double>(const size_t size, const bool* cond, const double* input_X, const double* input_y,
-                                double* output, cudaStream_t cuda_stream);
-template void CalSelect<float>(const size_t size, const bool* cond, const float* input_X, const float* input_y,
-                               float* output, cudaStream_t cuda_stream);
-template void CalSelect<int>(const size_t size, const bool* cond, const int* input_X, const int* input_y, int* output,
-                             cudaStream_t cuda_stream);
-template void CalSelect<half>(const size_t size, const bool* cond, const half* input_X, const half* input_y,
-                              half* output, cudaStream_t cuda_stream);
-template void CalSelect<int64_t>(const size_t size, const bool* cond, const int64_t* input_X, const int64_t* input_y,
-                              int64_t* output, cudaStream_t cuda_stream);
-template void CalSelect<bool>(const size_t size, const bool *cond, const bool *input_X, const bool *input_y,
-                                 bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu
deleted file mode 100644
index f63cf62786d..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <algorithm>
-#include <numeric>
-#include <functional>
-#include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh"
-
-namespace {
-constexpr size_t kMaxDim = 8;
-}
-
-template <typename T, size_t N>
-class VectorWrapper {
- public:
-  explicit VectorWrapper(const std::vector<T> &v) { std::copy(v.begin(), v.end(), data); }
-  ~VectorWrapper() {}
-  __device__ T& operator[](size_t index) { return data[index]; }
-
- private:
-  T data[N];
-};
-
-template <typename T>
-__global__ void CopySlicesKernel(VectorWrapper<int64_t, kMaxDim> begins, VectorWrapper<int64_t, kMaxDim> stride,
-                                 VectorWrapper<size_t, kMaxDim> u, VectorWrapper<size_t, kMaxDim> u_offset,
-                                 VectorWrapper<size_t, kMaxDim> o_offset, const T *update_addr, T *output_addr) {
-  size_t update_num = u[0] * u_offset[0];
-
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < update_num; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (u_offset[0]) % u[0];
-    size_t j = pos / (u_offset[1]) % u[1];
-    size_t k = pos / (u_offset[2]) % u[2];
-    size_t l = pos / (u_offset[3]) % u[3];
-    size_t m = pos / (u_offset[4]) % u[4];
-    size_t n = pos / (u_offset[5]) % u[5];
-    size_t o = pos / (u[7]) % u[6];
-    size_t p = pos % u[7];
-
-    size_t output_idx = (i * stride[0] + begins[0]) * o_offset[0] + (j * stride[1] + begins[1]) * o_offset[1] +
-                        (k * stride[2] + begins[2]) * o_offset[2] + (l * stride[3] + begins[3]) * o_offset[3] +
-                        (m * stride[4] + begins[4]) * o_offset[4] + (n * stride[5] + begins[5]) * o_offset[5] +
-                        (o * stride[6] + begins[6]) * o_offset[6] + (p * stride[7] + begins[7]);
-    output_addr[output_idx] = update_addr[pos];
-  }
-}
-
-std::vector<size_t> CalculateOffset(const std::vector<size_t> &shape) {
-  std::vector<size_t> offset(kMaxDim);
-  offset[7] = 1;
-  offset[6] = offset[7] * shape[7];
-  offset[5] = offset[6] * shape[6];
-  offset[4] = offset[5] * shape[5];
-  offset[3] = offset[4] * shape[4];
-  offset[2] = offset[3] * shape[3];
-  offset[1] = offset[2] * shape[2];
-  offset[0] = offset[1] * shape[1];
-  return offset;
-}
-
-template <typename T>
-void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape, const T *update, T *output,
-                cudaStream_t cuda_stream) {
-  size_t size = std::accumulate(update_shape.begin(), update_shape.end(), 1, std::multiplies<size_t>());
-
-  VectorWrapper<size_t, kMaxDim> o_offset(CalculateOffset(output_shape));
-  VectorWrapper<size_t, kMaxDim> u_offset(CalculateOffset(update_shape));
-
-  VectorWrapper<int64_t, kMaxDim> begins(begin);
-  VectorWrapper<int64_t, kMaxDim> strides(stride);
-  VectorWrapper<size_t, kMaxDim> update_shapes(update_shape);
-
-  CopySlicesKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(begins, strides, update_shapes, u_offset,
-                                                                      o_offset, update, output);
-}
-
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const bool *update, bool *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const double *update, double *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const float *update, float *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const half *update, half *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const int64_t *update, int64_t *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape, const int *update,
-                         int *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const short *update, short *output, cudaStream_t cuda_stream);  // NOLINT
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const int8_t *update, int8_t *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const uint64_t *update, uint64_t *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const uint32_t *update, uint32_t *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const uint16_t *update, uint16_t *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const unsigned char *update, unsigned char *output, cudaStream_t cuda_stream);
-template void CopySlices(const std::vector<size_t> &update_shape, const std::vector<int64_t> &begin,
-                         const std::vector<int64_t> &stride, const std::vector<size_t> &output_shape,
-                         const char *update, char *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu
deleted file mode 100644
index f9b2e1dabb9..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu
+++ /dev/null
@@ -1,618 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
-
-template <typename T>
-__global__ void Slice1D(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1; pos += blockDim.x * gridDim.x) {
-    output[pos] = input[pos + s1];
-  }
-}
-
-template <typename T>
-__global__ void Slice2D(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                        const size_t d2, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / l2 % l1;
-    size_t j = pos % l2;
-
-    size_t offset = (i + s1) * d2 + (j + s2);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice3D(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                        const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3) % l1;
-    size_t j = pos / l3 % l2;
-    size_t k = pos % l3;
-
-    size_t offset = (i + s1) * (d2 * d3) + (j + s2) * d3 + (k + s3);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice4D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                        const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                        const size_t d3, const size_t d4, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3 * l4) % l1;
-    size_t j = pos / (l3 * l4) % l2;
-    size_t k = pos / l4 % l3;
-    size_t o = pos % l4;
-
-    size_t offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice5D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                        const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                        const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                        const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5;
-       pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3 * l4 * l5) % l1;
-    size_t j = pos / (l3 * l4 * l5) % l2;
-    size_t k = pos / (l4 * l5) % l3;
-    size_t o = pos / l5 % l4;
-    size_t q = pos % l5;
-
-    size_t offset =
-      (i + s1) * (d2 * d3 * d4 * d5) + (j + s2) * (d3 * d4 * d5) + (k + s3) * (d4 * d5) + (o + s4) * d5 + (q + s5);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice6D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                        const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                        const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                        const size_t d4, const size_t d5, const size_t d6, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6;
-       pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3 * l4 * l5 * l6) % l1;
-    size_t j = pos / (l3 * l4 * l5 * l6) % l2;
-    size_t k = pos / (l4 * l5 * l6) % l3;
-    size_t o = pos / (l5 * l6) % l4;
-    size_t q = pos / l6 % l5;
-    size_t r = pos % l6;
-
-    size_t offset =
-      (i + s1) * (d2 * d3 * d4 * d5 * d6) + (j + s2) * (d3 * d4 * d5 * d6) + (k + s3) * (d4 * d5 * d6) + (o + s4) *
-      (d5 * d6) + (q + s5) * d6 + (r + s6);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice7D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                        const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                        const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                        const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                        const size_t d7, const T *input, T *output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6 * l7;
-       pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3 * l4 * l5 * l6 * l7) % l1;
-    size_t j = pos / (l3 * l4 * l5 * l6 * l7) % l2;
-    size_t k = pos / (l4 * l5 * l6 * l7) % l3;
-    size_t o = pos / (l5 * l6 * l7) % l4;
-    size_t q = pos / (l6 * l7) % l5;
-    size_t r = pos / l7 % l6;
-    size_t s = pos % l7;
-
-    size_t offset =
-      (i + s1) * (d2 * d3 * d4 * d5 * d6 * d7) + (j + s2) * (d3 * d4 * d5 * d6 * d7) + (k + s3) * (d4 * d5 * d6 * d7)+
-      (o + s4) * (d5 * d6 * d7) + (q + s5) * (d6 * d7) + (r + s6) * d7 + (s + s7);
-    output[pos] = input[offset];
-  }
-}
-
-template <typename T>
-__global__ void Slice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
-                        const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                        const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                        const T *dy, T *dx) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (l2 * l3 * l4) % l1;
-    size_t j = pos / (l3 * l4) % l2;
-    size_t k = pos / l4 % l3;
-    size_t o = pos % l4;
-    size_t input_idx = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
-    dx[input_idx] = dy[pos];
-  }
-}
-
-template <typename T>
-__global__ void FillArray(T *addr, const size_t len, const float value) {
-  T value_ = static_cast<T>(value);
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < len; pos += blockDim.x * gridDim.x) {
-    addr[pos] = value_;
-  }
-  return;
-}
-template <typename T>
-void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream) {
-  FillArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(addr, input_size, value);
-  return;
-}
-template <typename T>
-void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t stream) {
-  Slice1D<<<GET_BLOCKS(l1), GET_THREADS, 0, stream>>>(s1, l1, d1, input, output);
-}
-template <typename T>
-void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2,
-                   const T *input, T *output, cudaStream_t stream) {
-  Slice2D<<<GET_BLOCKS(l1 * l2), GET_THREADS, 0, stream>>>(s1, s2, l1, l2, d1, d2, input, output);
-}
-template <typename T>
-void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3,
-                   const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream) {
-  Slice3D<<<GET_BLOCKS(l1 * l2 * l3), GET_THREADS, 0, stream>>>(s1, s2, s3, l1, l2, l3, d1, d2, d3, input, output);
-}
-template <typename T>
-void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2,
-                   const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                   const T *input, T *output, cudaStream_t stream) {
-  Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
-                                                                     input, output);
-}
-template <typename T>
-void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1,
-                   const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2,
-                   const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream) {
-  Slice5D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, l1, l2, l3, l4, l5, d1,
-                                                                          d2, d3, d4, d5, input, output);
-}
-template <typename T>
-void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
-                   const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t l6,
-                   const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                   const T *input, T *output, cudaStream_t stream) {
-  Slice6D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, l1, l2, l3, l4,
-                                                                               l5, l6, d1, d2, d3, d4, d5, d6, input,
-                                                                               output);
-}
-template <typename T>
-void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
-                   const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                   const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                   const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream) {
-  Slice7D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6 * l7), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, s7, l1, l2,
-                                                                                    l3, l4, l5, l6, l7, d1, d2, d3, d4,
-                                                                                    d5, d6, d7, input, output);
-}
-template <typename T>
-void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                   const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                   const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream) {
-  Slice4DGrad<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
-                                                                     dy, dx);
-}
-
-template <typename T>
-__global__ void StridedSliceKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3, const size_t b4,
-                                   const size_t b5, const size_t b6, const size_t s0, const size_t s1, const size_t s2,
-                                   const size_t s3, const size_t s4, const size_t s5, const size_t s6, const size_t i0,
-                                   const size_t i1, const size_t i2, const size_t i3, const size_t i4, const size_t i5,
-                                   const size_t i6, const size_t o0, const size_t o1, const size_t o2, const size_t o3,
-                                   const size_t o4, const size_t o5, const size_t o6, const T *input_addr,
-                                   T *output_addr) {
-  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
-    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
-    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
-    size_t l = pos / (o4 * o5 * o6) % o3;
-    size_t m = pos / (o5 * o6) % o4;
-    size_t n = pos / (o6) % o5;
-    size_t o = pos % o6;
-
-    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
-                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
-                       (n * s5 + b5) * i6 + (o * s6 + b6);
-    output_addr[pos] = input_addr[input_idx];
-  }
-}
-
-template <typename T>
-void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                  const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape, const T *input,
-                  T *output, cudaStream_t cuda_stream) {
-  size_t size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3] * output_shape[4] *
-                output_shape[5] * output_shape[6];
-  StridedSliceKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
-    strides[3], strides[4], strides[5], strides[6], input_shape[0], input_shape[1], input_shape[2], input_shape[3],
-    input_shape[4], input_shape[5], input_shape[6], output_shape[0], output_shape[1], output_shape[2], output_shape[3],
-    output_shape[4], output_shape[5], output_shape[6], input, output);
-}
-
-template <typename T>
-__global__ void StridedSliceGradKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3,
-                                       const size_t b4, const size_t b5, const size_t b6, const size_t s0,
-                                       const size_t s1, const size_t s2, const size_t s3, const size_t s4,
-                                       const size_t s5, const size_t s6, const size_t i0, const size_t i1,
-                                       const size_t i2, const size_t i3, const size_t i4, const size_t i5,
-                                       const size_t i6, const size_t o0, const size_t o1, const size_t o2,
-                                       const size_t o3, const size_t o4, const size_t o5, const size_t o6, const T *dy,
-                                       T *dx) {
-  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
-    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
-    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
-    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
-    size_t l = pos / (o4 * o5 * o6) % o3;
-    size_t m = pos / (o5 * o6) % o4;
-    size_t n = pos / (o6) % o5;
-    size_t o = pos % o6;
-
-    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
-                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
-                       (n * s5 + b5) * i6 + (o * s6 + b6);
-    dx[input_idx] = dy[pos];
-  }
-  return;
-}
-
-template <typename T>
-void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                      const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const T *dy, T *dx,
-                      cudaStream_t cuda_stream) {
-  size_t size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3] * dy_shape[4] * dy_shape[5] * dy_shape[6];
-  StridedSliceGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
-    strides[3], strides[4], strides[5], strides[6], dx_shape[0], dx_shape[1], dx_shape[2], dx_shape[3], dx_shape[4],
-    dx_shape[5], dx_shape[6], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy_shape[4], dy_shape[5], dy_shape[6],
-    dy, dx);
-}
-
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const double *input, double *output,
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const float *input, float *output,
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const half *input, half *output,
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int *input, int *output,
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const short *input, short *output,  // NOLINT
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const unsigned char *input,
-                            unsigned char *output, cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int64_t *input, int64_t *output,
-                            cudaStream_t stream);
-template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const bool *input, bool *output,
-                            cudaStream_t stream);
-
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const double *input, double *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const float *input, float *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const half *input, half *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const int *input, int *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const short *input, short *output, cudaStream_t stream);  // NOLINT
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const unsigned char *input, unsigned char *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const int64_t *input, int64_t *output, cudaStream_t stream);
-template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1,
-                            const size_t d2, const bool *input, bool *output, cudaStream_t stream);
-
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const double *input,
-                            double *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const float *input,
-                            float *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const half *input,
-                            half *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const int *input,
-                            int *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const short *input,  // NOLINT
-                            short *output, cudaStream_t stream);  // NOLINT
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3,
-                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const int64_t *input,
-                            int64_t *output, cudaStream_t stream);
-template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2,
-                            const size_t l3, const size_t d1, const size_t d2, const size_t d3, const bool *input,
-                            bool *output, cudaStream_t stream);
-
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const double *input, double *output, cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const float *input, float *output, cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const half *input, half *output, cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const int *input, int *output, cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const short *input, short *output,  // NOLINT
-                            cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const unsigned char *input, unsigned char *output,
-                            cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const int64_t *input, int64_t *output,
-                            cudaStream_t stream);
-template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                            const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                            const size_t d3, const size_t d4, const bool *input, bool *output, cudaStream_t stream);
-
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const double *input, double *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const float *input, float *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const half *input, half *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const int64_t *input, int64_t *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const int *input, int *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const short *input, short *output, cudaStream_t stream);  // NOLINT
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                            const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
-                            const bool *input, bool *output, cudaStream_t stream);
-
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const double *input, double *output,
-                            cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const float *input, float *output,
-                            cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const half *input, half *output,
-                            cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const int64_t *input, int64_t *output,
-                            cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const int *input, int *output,
-                            cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const short *input, short *output,  // NOLINT
-                            cudaStream_t stream);  // NOLINT
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const unsigned char *input,
-                            unsigned char *output, cudaStream_t stream);
-template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                            const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3,
-                            const size_t d4, const size_t d5, const size_t d6, const bool *input, bool *output,
-                            cudaStream_t stream);
-
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const double *input, double *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const float *input, float *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const half *input, half *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const int64_t *input, int64_t *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const int *input, int *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const short *input, short *output, cudaStream_t stream);  // NOLINT
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const unsigned char *input, unsigned char *output, cudaStream_t stream);
-template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
-                            const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
-                            const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
-                            const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                            const size_t d7, const bool *input, bool *output, cudaStream_t stream);
-
-template void CalSlice4DGrad<double>(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
-                                     const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                                     const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                                     const double *dy, double *dx, cudaStream_t stream);
-template void CalSlice4DGrad<float>(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                                    const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                                    const size_t d3, const size_t d4, const float *dy, float *dx, cudaStream_t stream);
-template void CalSlice4DGrad<half>(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                                   const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                                   const size_t d3, const size_t d4, const half *dy, half *dx, cudaStream_t stream);
-template void CalSlice4DGrad<int>(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                                  const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                                  const size_t d3, const size_t d4, const int *dy, int *dx, cudaStream_t stream);
-template void CalSlice4DGrad<short>(const size_t s1, const size_t s2, const size_t s3, const size_t s4,  // NOLINT
-                                    const size_t l1,
-                                    const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                                    const size_t d3, const size_t d4, const short *dy, short *dx,  // NOLINT
-                                    cudaStream_t stream);
-template void CalSlice4DGrad<unsigned char>(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
-                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                                            const unsigned char *dy, unsigned char *dx, cudaStream_t stream);
-template void CalSlice4DGrad<int64_t>(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
-                                      const size_t l1, const size_t l2, const size_t l3, const size_t l4,
-                                      const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                                      const int64_t *dy, int64_t *dx, cudaStream_t stream);
-template void CalSlice4DGrad<bool>(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                                   const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                                   const size_t d3, const size_t d4, const bool *dy, bool *dx, cudaStream_t stream);
-
-template void FillDeviceArray<bool>(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream);
-template void FillDeviceArray<int64_t>(const size_t input_size, int64_t *addr, const float value,
-                                       cudaStream_t cuda_stream);
-template void FillDeviceArray<int>(const size_t input_size, int *addr, const float value, cudaStream_t cuda_stream);
-template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value,  // NOLINT
-                                     cudaStream_t cuda_stream);
-template void FillDeviceArray<int8_t>(const size_t input_size, int8_t *addr, const float value,
-                                      cudaStream_t cuda_stream);
-template void FillDeviceArray<uint64_t>(const size_t input_size, uint64_t *addr, const float value,
-                                        cudaStream_t cuda_stream);
-template void FillDeviceArray<uint32_t>(const size_t input_size, uint32_t *addr, const float value,
-                                        cudaStream_t cuda_stream);
-template void FillDeviceArray<uint16_t>(const size_t input_size, uint16_t *addr, const float value,
-                                        cudaStream_t cuda_stream);
-template void FillDeviceArray<unsigned char>(const size_t input_size, unsigned char *addr, const float value,
-                                             cudaStream_t cuda_stream);
-template void FillDeviceArray<half>(const size_t input_size, half *addr, const float value, cudaStream_t cuda_stream);
-template void FillDeviceArray<float>(const size_t input_size, float *addr, const float value, cudaStream_t cuda_stream);
-template void FillDeviceArray<double>(const size_t input_size, double *addr, const float value,
-                                      cudaStream_t cuda_stream);
-
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const bool *input, bool *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const double *input, double *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const float *input, float *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const half *input, half *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const int64_t *input, int64_t *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const int *input, int *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const short *input, short *output, cudaStream_t cuda_stream);  // NOLINT
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const int8_t *input, int8_t *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const uint64_t *input, uint64_t *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const uint32_t *input, uint32_t *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const uint16_t *input, uint16_t *output, cudaStream_t cuda_stream);
-template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                           const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape,
-                           const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream);
-
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const bool *dy,
-                               bool *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const double *dy, double *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const float *dy, float *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const half *dy,
-                               half *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const int64_t *dy, int64_t *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const int *dy,
-                               int *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const short *dy,                       // NOLINT
-                               short *dx, cudaStream_t cuda_stream);  // NOLINT
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const int8_t *dy, int8_t *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const uint64_t *dy, uint64_t *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const uint32_t *dy, uint32_t *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const uint16_t *dy, uint16_t *dx, cudaStream_t cuda_stream);
-template void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                               const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape,
-                               const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh
deleted file mode 100644
index d1780d54f7f..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_
-
-#include <cuda_runtime.h>
-#include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename...S>
-void SliceKernel(const T *input, T *output, const size_t output_size, cudaStream_t cuda_stream, S...pack);
-
-template <typename T>
-void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
-                    const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
-                    const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream);
-
-template <typename T>
-void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2,
-                   const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3,
-                   const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2,
-                   const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                   const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1,
-                   const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2,
-                   const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
-                   const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t l6,
-                   const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
-                   const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
-                   const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
-                   const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
-                   const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream);
-
-template <typename T>
-void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin,
-                  const std::vector<int64_t> &strides, const std::vector<size_t> &output_shape, const T *input,
-                  T *output, cudaStream_t cuda_stream);
-
-template <typename T>
-void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<int64_t> &begin,
-                      const std::vector<int64_t> &strides, const std::vector<size_t> &dx_shape, const T *dy, T *dx,
-                      cudaStream_t cuda_stream);
-
-template <typename T>
-void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh
deleted file mode 100644
index ef6409763a0..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_
-template <typename T>
-void SmoothL1Loss(const int &input_size, const float &beta, const T *prediction, const T *target, T *loss,
-                  cudaStream_t stream);
-template <typename T>
-void SmoothL1LossGrad(const int &input_size, const float &beta, const T *prediction, const T *target, const T *dloss,
-                      T *dx, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu
deleted file mode 100644
index 0c1c4b67b2e..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cuda_runtime.h>
-#include "spacetobatch_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void SpaceToBatch(const size_t size, const T *input, const size_t in,
-                             const size_t ih, const size_t iw, const size_t ic,
-                             const size_t on, const size_t oh, const size_t ow,
-                             const size_t oc, const size_t pad_up, const size_t pad_dn,
-                             const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-                             T *output) {
-  size_t temp_stride = 0;
-  size_t temp_pos = 0;
-  size_t idx_in = 0;
-  size_t idx_ic = 0;
-  size_t idx_ih = 0;
-  size_t idx_iw = 0;
-  size_t idx_on = 0;
-  size_t output_pos = 0;
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
-    pos += blockDim.x * gridDim.x) {
-    temp_stride = ic * ih * iw;
-    idx_in = pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ic;
-    idx_ic = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ih;
-    idx_ih = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= iw;
-    idx_iw = temp_pos / temp_stride;
-
-    idx_on = (((idx_ih + pad_up) % block_num) * block_num + ((idx_iw + pad_lft) % block_num)) * in + idx_in;
-    output_pos = idx_on * oc;
-    output_pos = (output_pos + idx_ic) * oh;
-    output_pos = (output_pos + ((idx_ih + pad_up) - (idx_on / (in * block_num))) / block_num) * ow;
-    output_pos = (output_pos + ((idx_iw + pad_lft) - ((idx_on / in) % block_num)) / block_num);
-    output[output_pos] = input[pos];
-  }
-  return;
-}
-
-template <typename T>
-void CalSpaceToBatch(const size_t size, const T *input, const size_t in,
-                     const size_t ih, const size_t iw, const size_t ic,
-                     const size_t on, const size_t oh, const size_t ow,
-                     const size_t oc, const size_t pad_up, const size_t pad_dn,
-                     const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-                     T *output, cudaStream_t cuda_stream) {
-  cudaMemset(output, 0, on * oc * oh * ow * sizeof(T));
-  SpaceToBatch<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    size, input, in, ih, iw, ic, on, oh, ow, oc, pad_up, pad_dn, pad_lft, pad_rht, block_num, output);
-  return;
-}
-
-template void CalSpaceToBatch<float>(const size_t size, const float *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  float *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<half>(const size_t size, const half *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  half *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<int>(const size_t size, const int *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  int *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<int64_t>(const size_t size, const int64_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  int64_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<int16_t>(const size_t size, const int16_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  int16_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<int8_t>(const size_t size, const int8_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  int8_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  uint8_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  uint16_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  uint32_t *output, cudaStream_t cuda_stream);
-template void CalSpaceToBatch<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
-  const size_t ih, const size_t iw, const size_t ic,
-  const size_t on, const size_t oh, const size_t ow,
-  const size_t oc, const size_t pad_up, const size_t pad_dn,
-  const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-  uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh
deleted file mode 100644
index 93209f3235c..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_
-template <typename T>
-void CalSpaceToBatch(const size_t size, const T *input, const size_t in,
-                     const size_t ih, const size_t iw, const size_t ic,
-                     const size_t on, const size_t oh, const size_t ow,
-                     const size_t oc, const size_t pad_up, const size_t pad_dn,
-                     const size_t pad_lft, const size_t pad_rht, const size_t block_num,
-                     T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu
deleted file mode 100644
index 16905e9bbf1..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_runtime.h>
-#include "spacetodepth_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-__global__ void SpaceToDepth(const size_t size, const T *input, const size_t in,
-                             const size_t ic, const size_t ih, const size_t iw,
-                             const size_t on, const size_t oc, const size_t oh,
-                             const size_t ow, const size_t r, T *output) {
-  size_t temp_stride = 0;
-  size_t temp_pos = 0;
-  size_t output_pos = 0;
-  size_t input_pos_array[SPACETODEPTH_BUFFER_DIMENSION];
-
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
-       pos += blockDim.x * gridDim.x) {
-    temp_stride = ic * ih * iw;
-    input_pos_array[0] = pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ic;
-    input_pos_array[1] = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= ih;
-    input_pos_array[2] = temp_pos / temp_stride;
-    temp_pos = pos % temp_stride;
-
-    temp_stride /= iw;
-    input_pos_array[3] = temp_pos / temp_stride;
-
-    output_pos += input_pos_array[0];
-    output_pos = (output_pos * oc) +
-                 (input_pos_array[1] +
-                  (r * (input_pos_array[2] % r) + input_pos_array[3] % r) * ic);
-    output_pos = (output_pos * oh) + (input_pos_array[2] / r);
-    output_pos = (output_pos * ow) + (input_pos_array[3] / r);
-
-    output[output_pos] = input[pos];
-    output_pos = 0;
-  }
-  return;
-}
-
-template <typename T>
-void CalSpaceToDepth(const size_t size, const T *input, const size_t in,
-                     const size_t ic, const size_t ih, const size_t iw,
-                     const size_t on, const size_t oc, const size_t oh,
-                     const size_t ow, const size_t r, T *output,
-                     cudaStream_t cuda_stream) {
-  SpaceToDepth<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-      size, input, in, ic, ih, iw, on, oc, oh, ow, r, output);
-  return;
-}
-
-template void CalSpaceToDepth<float>(const size_t size, const float *input,
-                                     const size_t in, const size_t ic,
-                                     const size_t ih, const size_t iw,
-                                     const size_t on, const size_t oc,
-                                     const size_t oh, const size_t ow,
-                                     const size_t r, float *output,
-                                     cudaStream_t cuda_stream);
-template void CalSpaceToDepth<half>(const size_t size, const half *input,
-                                    const size_t in, const size_t ic,
-                                    const size_t ih, const size_t iw,
-                                    const size_t on, const size_t oc,
-                                    const size_t oh, const size_t ow,
-                                    const size_t r, half *output,
-                                    cudaStream_t cuda_stream);
-template void CalSpaceToDepth<int>(const size_t size, const int *input,
-                                   const size_t in, const size_t ic,
-                                   const size_t ih, const size_t iw,
-                                   const size_t on, const size_t oc,
-                                   const size_t oh, const size_t ow,
-                                   const size_t r, int *output,
-                                   cudaStream_t cuda_stream);
-template void CalSpaceToDepth<int64_t>(const size_t size, const int64_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, int64_t *output,
-                                       cudaStream_t cuda_stream);
-template void CalSpaceToDepth<int16_t>(const size_t size, const int16_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, int16_t *output,
-                                       cudaStream_t cuda_stream);
-template void CalSpaceToDepth<int8_t>(const size_t size, const int8_t *input,
-                                      const size_t in, const size_t ic,
-                                      const size_t ih, const size_t iw,
-                                      const size_t on, const size_t oc,
-                                      const size_t oh, const size_t ow,
-                                      const size_t r, int8_t *output,
-                                      cudaStream_t cuda_stream);
-template void CalSpaceToDepth<uint8_t>(const size_t size, const uint8_t *input,
-                                       const size_t in, const size_t ic,
-                                       const size_t ih, const size_t iw,
-                                       const size_t on, const size_t oc,
-                                       const size_t oh, const size_t ow,
-                                       const size_t r, uint8_t *output,
-                                       cudaStream_t cuda_stream);
-template void
-CalSpaceToDepth<uint16_t>(const size_t size, const uint16_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint16_t *output, cudaStream_t cuda_stream);
-template void
-CalSpaceToDepth<uint32_t>(const size_t size, const uint32_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint32_t *output, cudaStream_t cuda_stream);
-template void
-CalSpaceToDepth<uint64_t>(const size_t size, const uint64_t *input,
-                          const size_t in, const size_t ic, const size_t ih,
-                          const size_t iw, const size_t on, const size_t oc,
-                          const size_t oh, const size_t ow, const size_t r,
-                          uint64_t *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh
deleted file mode 100644
index 85ef76460c9..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_
-
-#define SPACETODEPTH_BUFFER_DIMENSION 4
-template <typename T>
-void CalSpaceToDepth(const size_t size, const T *input, const size_t in,
-                     const size_t ic, const size_t ih, const size_t iw,
-                     const size_t on, const size_t oc, const size_t oh,
-                     const size_t ow, const size_t r, T *output,
-                     cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh
deleted file mode 100644
index fc133b00b73..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, const T *learning_rate,
-                                   const T *l1_regularization, const T *l2_regularization, const T *gradient,
-                                   const int *indices, T *variable, T *accumulation, T *variable_out,
-                                   T *accumulation_out, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh
deleted file mode 100755
index 3a32c6e36a2..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalCrossEntropy(const float *logits, T *labels, const int batch_size, const int class_num, float *loss,
-                     cudaStream_t cuda_stream);
-
-template <typename T>
-void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, const int class_num, float *grad,
-                         cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh
deleted file mode 100644
index c5cebe6d0fd..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_
-template <typename T, typename S>
-void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index, const size_t n_stride,
-                        const float learning_rate, const float l1_regularization, const float l2_regularization,
-                        const float learning_rate_power, const bool use_locking, T *variable, T *accumulation,
-                        T *linear, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu
deleted file mode 100755
index 359a64b4cd5..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh"
-template <typename T>
-__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis,
-                      const int all_size_axis, const T* input, T** outputs) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    int num = pos % all_size_before_axis / all_size_axis;
-    int block = num / axis_step;
-    int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
-                    num % axis_step * all_size_axis + pos % all_size_axis;
-    outputs[block][block_pos] = input[pos];
-  }
-  return;
-}
-
-template <typename T>
-void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                 const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
-  Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
-                                                           all_size_axis, input, outputs);
-  return;
-}
-
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const half* input, half** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const float* input, float** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const double* input, double** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const int* input, int** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const uint32_t* input, uint32_t** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const int64_t* input, int64_t** outputs,
-                          cudaStream_t cuda_stream);
-template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
-                          const int all_size_axis, const bool* input, bool** outputs,
-                          cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu
index 3400326d011..4b5084008ba 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void AngleAtomEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh
index e21047e456d..f09f72de66a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                      const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu
index 717125297de..c57befff8eb 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void AngleEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh
index 0a824658052..be75db9a29c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b,
                  const int *atom_c, const float *angle_k, const float *angle_theta0, float *ene, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu
index f441c9a54d4..6457de26a8f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void AngleForceKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh
index b150c2e8d1e..24276a23860 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
                 const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, float *frc_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu
index d9d2f2da601..b186683585a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void AngleForceWithAtomEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh
index d4f11819e48..c8f647fc1e8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_FORCE_WITH_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                               const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu
index 96f71952d9a..bab1f905c68 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondAtomEnergyCudaKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                          const VECTOR *scaler, const int *atom_a, const int *atom_b,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh
index edeb4e881c8..832eea77397 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_ATOM_ENERGY_GPU_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
                     const int *atom_b, const float *bond_k, const float *bond_r0, float *atom_ene, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu
index 34143947e23..bb1007036c6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondEnergyCudaKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
                                      const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh
index 5cd6514b4e0..fbba36e5d38 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_ENERGY_CUDA_GPU_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f,
                 const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu
index c2a074f0b75..d3ae9f8f47b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondForceCudaKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
                                     const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh
index b72a408fda6..e401f9deba4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_CUDA_GPU_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f,
                const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu
index 2161f2b422d..fab62a62d90 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondForceWithAtomEnergyAndVirialKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                                        const VECTOR *scaler, const int *atom_a, const int *atom_b,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh
index 1e1438c8434..98319adde34 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f,
                                       const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu
index 353fa4da59a..11064927158 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondForceWithAtomEnergyKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                               const VECTOR *scaler, const int *atom_a, const int *atom_b,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh
index 06fe3247697..4b11524afc9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                              const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu
index 83f9a0f3c18..78dd7f56754 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void BondForceWithAtomVirialKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                               const VECTOR *scaler, const int *atom_a, const int *atom_b,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh
index 33b498dc4d6..9757c081ea1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                              const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu
index bf887ada532..8c9abb85541 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __device__ __host__ float fc(float Rij) {
   const float PI = 3.141592654;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh
index e3ee3e875d0..0c235d4fd9d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh
@@ -17,7 +17,7 @@
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_
 
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f,
                  const float *old_crd, float *nowarp_crd, int *box_map_times, float *box, float *g_radial,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh
index 0b3197810e2..5ab563f8e1c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
                   unsigned int *uint_crd_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh
index f26cb13e5a8..18b531fc58d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_QUARTER_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
                          unsigned int *uint_crd_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh
index becb653f48a..e3fcfb3bcce 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTEROFMASS_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass,
                      float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh
index 4b168216ddb..6355344e09f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms,
                          const float *crd_f, float *center_of_geometry_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh
index fad2d93c90d..c60c9a4df4c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f,
                      float *box_length_f, float *no_wrap_crd_f, float *crd_f, float* scaler, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh
index 4fd44c8567c..1a41f87f92e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
                    const float *atom_mass, float *ek, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh
index ed256448166..1f2bdb4251e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh
@@ -17,7 +17,7 @@
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_
 
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor, cudaStream_t stream);
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh
index 270427d31ce..349de212bec 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh
@@ -27,7 +27,7 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <cufft.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 #define TWO_DIVIDED_BY_SQRT_PI 1.1283791670218446
 #define CONSTANT_kB 0.00198716
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu
index ba87498ab3f..eef09d37a45 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu
@@ -17,7 +17,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void Calculate_No_Wrap_Crd(int atom_numbers, INT_VECTOR *box_map_times, VECTOR *box, VECTOR *crd,
                                       VECTOR *nowrap_crd) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh
index fce32ed57ed..27116d8aa26 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh
@@ -19,7 +19,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_CAL_NO_WRAP_CRD_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f, float *nowrap_crd_f,
                         cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh
index 953b0e20131..3218bd5fba5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh
@@ -19,7 +19,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_REFRESH_BOXMAPTIMES_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f,
                          int *box_map_times_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu
index 76d92f18bdc..4803c11f25f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void DihedralAtomEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh
index e421de8f79d..56d1baba7b9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                         const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu
index c08b888f07f..ea58fa92c31 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void DihedralEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh
index fb485172d0e..eb8dc46e75c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
                     const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, const float *pk,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu
index 089679b0752..e90780cf1ec 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void DihedralForceKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh
index 00e08b58446..5804e3b825b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void DihedralForce(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                    const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu
index dc6936a475a..46874a0c78a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void DihedralForceWithAtomEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh
index 6841567f4c6..dc9ca6ea818 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_FORCE_WITH_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void DihedralForceWithAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
                                  const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh
index 7e85481ec20..b51a52ef1a4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh
@@ -22,7 +22,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(
   const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh
index 19290c90ef2..25045a5ebed 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
               const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh
index 904a4227ee0..c37fdcdea97 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
              const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh
index 54711ae355a..2bd59ee88e0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f,
                                const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh
index bdda30249c0..0e22abe4352 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_PME_DIRECT_FORCE_WITH_ATOM_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta,
                                    const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh
index 7ccc500226f..616ce19b6d8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                             const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh
index 52a588b53cf..dfc31a357f0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                         const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh
index 7b443c789ae..286bc9d5ac0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                             const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh
index 74153b6739c..914d6e0a010 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers,
                                                 const int *uint_crd_f, const int *LJtype, const float *charge,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh
index 27208029f31..ad2957b79c0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                        const int *LJtype, const float *charge, float *uint_crd_with_LJ_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh
index 04cc1a2849b..cd9c125c899 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                         const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh
index e107310d5b1..ab67d0c076c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh
index c25dd8d06fb..0b97b56b685 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                    const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh
index 8f4029f036d..a91b582c5d3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh
@@ -21,7 +21,7 @@
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 struct VECTOR {
   float x;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu
index b129d9dd952..fc2102e52c0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void MD_Iteration_Gradient_Descent(const int atom_numbers, VECTOR *crd, VECTOR *frc,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh
index f1f829014f4..0455f49219a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_GRADIENT_DESCENT_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void MDIterationGradientDescent(const int atom_numbers, float *crd, float *frc, const float learning_rate,
                                 cudaStream_t stream);
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
index e87a9ed85bb..af5756a7fa6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
@@ -22,7 +22,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc,
                          const float *inverse_mass, const float dt, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu
index 7c8237112b9..a2a636726b8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void MD_Iteration_Leap_Frog_With_LiuJian_kernel(const int atom_numbers, const float half_dt, const float dt,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh
index 1595d4f845a..58bd87184c4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
                                          const float exp_gamma, int float4_numbers, float *inverse_mass,
                                          float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu
index 3e029614c41..b8c53e91acf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity(
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh
index 6e554f6ae8a..cf832952d7b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_WITH_MAX_VEL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Vel(const int atom_numbers, const float half_dt, const float dt,
                                                       const float exp_gamma, int float4_numbers, float *inverse_mass,
                                                       float *sqrt_mass_inverse, float *vel, float *crd, float *frc,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu
index 3eaad366d8b..60d96b1c008 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 __global__ void MD_Iteration_Leap_Frog_With_Max_Velocity(const int atom_numbers, VECTOR *vel, VECTOR *crd, VECTOR *frc,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh
index 047610dcaea..4c6fa476483 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_WITH_MAX_VEL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void MDIterationLeapFrogWithMaxVelocity(const int atom_numbers, float *vel, float *crd, float *frc, float *acc,
                                         const float *inverse_mass, const float dt, const float max_velocity,
                                         cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu
index d1254924886..54138754bd4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
 
 void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh
index 3f8185ca556..b3d268c3013 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
                                      cudaStream_t stream);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh
index 77a27a1b2b3..bb2f6c87bab 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh
@@ -17,8 +17,8 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_FFT_3D_IMPL_H_
 
 #include <cufft.h>
-#include "utils/complex.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
 void FFT3D(int Nfft, T *input_tensor, Complex<T> *output_tensor, const cufftHandle &FFT_plan_r2c, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh
index b184bc1c564..68d12d08cae 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh
@@ -17,8 +17,8 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_IFFT_3D_IMPL_H_
 
 #include <cufft.h>
-#include "utils/complex.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 template <typename T>
 void IFFT3D(int Nfft, Complex<T> *input_tensor, T *output_tensor, const cufftHandle &FFT_plan_c2r, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh
index 8ee149cd9db..f5697b779ee 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
 
 #include <cufft.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh
index d565ae57a66..49de269ca66 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_UPDATE_IMPL_H_
 
 #include <cufft.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                      float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh
index 12cec6454f3..8cbbf282054 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_EXCLUDED_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f,
                       const float *charge, const int *excluded_list_start, const int *excluded_list,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
index 5393743e619..e21cd655f6c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
@@ -17,7 +17,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
 
 #include <cufft.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 struct _VECTOR {
   float x;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu
index b3c65e9bf97..546f1a96f72 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void restrain_energy_kernel(const int restrain_numbers, const int *restrain_list, const VECTOR *crd,
                                        const VECTOR *crd_ref, const float weight, const VECTOR boxlength, float *ene) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
index f7dbfeabb1a..a58fca6cbfe 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_ENERGY_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void restrainenergy(int restrain_numbers, int atom_numbers, float weight, const int *restrain_list, const float *crd_f,
                     const float *crd_ref, const float *boxlength_f, float *ene, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
index 3e175457b5a..94f3398e8d8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_FORCE_ATOM_ENERGY_VIRIAL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void restrainforcewithatomenergyandvirial(int restrain_numbers, int atom_numbers, const int *restrain_list,
                                           const float *crd_f, const float *crd_ref_f, const float weight,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu
index 779484179d4..cff6a302b80 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu
@@ -16,7 +16,7 @@
 
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 __global__ void restrainforcekernel(int restrain_numbers, const int *restrain_list, const UNSIGNED_INT_VECTOR *uint_crd,
                                     const UNSIGNED_INT_VECTOR *uint_crd_ref, const float factor, const VECTOR *scaler,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
index bfc67e8da69..1788b9c55c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTAIN_FORCE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list, const int *uint_crd_f,
                    const int *uint_crd_ref, const float factor, const float *scaler_f, float *frc_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh
index 41ffceaff3a..6af297ba7a0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh
@@ -22,7 +22,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f,
                            const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh
index fc96cde5cde..3c77e8d0355 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh
@@ -22,7 +22,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f,
                                        const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh
index e365d0ce765..05b351c053b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh
@@ -22,7 +22,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f,
                                   const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh
index 6152cab2026..2d71397680a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_LAST_CRD_TO_DR_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void lastcrdtodr(int constrain_pair_numbers, const float *atom_crd_f, const float *quarter_crd_to_uint_crd_cof_f,
                  const float *uint_dr_to_dr_f, float *constrain_pair_f, const int *atom_i_serials,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh
index f5de7eb46b7..a3c3d40f452 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_CRD_VEL_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void refreshcrdvel(int atom_numbers, float dt_inverse, float dt, float exp_gamma, float half_exp_gamma_plus_half,
                    float *test_frc_f, float *mass_inverse, float *crd_f, float *vel_f, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh
index b74aa1edcea..ecdb5c7ec72 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_UINT_CRD_IMPL_H_
 
 #include <curand_kernel.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 void refreshuintcrd(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f,
                     const float *quarter_crd_to_uint_crd_cof_f, const float *test_frc_f, const float *mass_inverse,
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu
deleted file mode 100644
index e9ae27d86d5..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu
+++ /dev/null
@@ -1,199 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh"
-
-const int kWarpSize = 32;
-const int kNumWarps = 32;
-
-__inline__ __device__ float HalfFloatInputConvert(const half val) { return __half2float(val); }
-__inline__ __device__ float HalfFloatInputConvert(const float val) { return val; }
-__inline__ __device__ void HalfFloatOutputAssign(const float val, float *arr, int idx) { arr[idx] = val; }
-__inline__ __device__ void HalfFloatOutputAssign(const float val, half *arr, int idx) { arr[idx] = __float2half(val); }
-
-template <typename T, typename G>
-__global__ void SyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy,
-                                     G *saved_mean, G *saved_invstd, float *dy_sum_local, float *dot_p_local) {
-  // block level memory
-  __shared__ float shared_dy[kNumWarps];
-  __shared__ float shared_dot_p[kNumWarps];
-  int warpId = threadIdx.x / kWarpSize;  // threads are arranged in warps of 32 executed together
-  int laneId = threadIdx.x % kWarpSize;
-
-  int plane = blockIdx.x;  // this thread will only function on a single plane
-  int plane_size = N * H * W;
-  float mean = static_cast<float>(saved_mean[plane]);
-
-  if (threadIdx.x < kNumWarps) {
-    shared_dy[threadIdx.x] = static_cast<float>(0);
-    shared_dot_p[threadIdx.x] = static_cast<float>(0);
-  }
-
-  __syncthreads();  // ensure all 0 init complete across all values
-
-  float dy_sum = static_cast<float>(0);
-  float dot_p = static_cast<float>(0);
-
-  // individual thread level reduction
-  for (int x = threadIdx.x; x < plane_size; x += blockDim.x) {
-    int index = (x / (H * W) * C * H * W) + (plane * H * W) + (x % (H * W));
-    float input_value = HalfFloatInputConvert(x_input[index]);
-    float dy_value = HalfFloatInputConvert(dy[index]);
-    dy_sum += dy_value;
-    dot_p += (input_value - mean) * dy_value;
-  }
-  __syncthreads();
-  // warp reduce all values in every value to a single value
-  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
-    float other_dy_sum = __shfl_down_sync(0xffffffff, dy_sum, offset);
-    float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset);
-    dy_sum += other_dy_sum;
-    dot_p += other_dot_p;
-  }
-  __syncwarp();
-  if (laneId == 0) {
-    shared_dy[warpId] = dy_sum;
-    shared_dot_p[warpId] = dot_p;
-    // one value per warp now
-  }
-  __syncthreads();
-  if (warpId == 0) {
-    dy_sum = shared_dy[laneId];
-    dot_p = shared_dot_p[laneId];
-    __syncwarp();
-    for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
-      float other_dy = __shfl_down_sync(0xffffffff, dy_sum, offset);
-      float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset);
-      dy_sum += other_dy;
-      dot_p += other_dot_p;
-    }
-    __syncwarp();
-  }
-  if (threadIdx.x == 0) {
-    dy_sum_local[plane] = dy_sum;
-    dot_p_local[plane] = dot_p;
-  }
-  return;
-}
-
-template <typename T, typename S, typename G>
-__global__ void SyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx,
-                                      G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale,
-                                      S *dscale, S *dbias, float epsilon) {
-  int size = N * C * H * W;
-  int plane_size = N * H * W;
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    int block_num = (pos / W) / H;  // which of N * C blocks
-    int plane = block_num % C;
-    float mean = HalfFloatInputConvert(saved_mean[plane]);
-    float invstd = HalfFloatInputConvert(saved_invstd[plane]);
-    float scale_value = HalfFloatInputConvert(scale[plane]);
-    float div_factor = HalfFloatInputConvert(1) / plane_size;
-    float dy_sum_plane = dy_sum_red[plane];
-    float dot_p_plane = dot_p_red[plane];
-    float grad_mean = dy_sum_plane * div_factor;
-    float proj_scale = dot_p_plane * div_factor * invstd * invstd;
-    float grad_scale = invstd * scale_value;
-    float inp = HalfFloatInputConvert(x_input[pos]);
-    float proj = (inp - mean) * proj_scale;
-    HalfFloatOutputAssign((HalfFloatInputConvert(dy[pos]) - proj - grad_mean) * grad_scale, dx, pos);
-  }
-}
-
-template <typename S, typename G>
-__global__ void SyncBatchNormGradPostScaleBias(size_t C, G *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                               S *dscale, S *dbias) {
-  for (size_t plane = blockIdx.x * blockDim.x + threadIdx.x; plane < C; plane += blockDim.x * gridDim.x) {
-    float invstd = HalfFloatInputConvert(saved_invstd[plane]);
-    float dy_sum_plane = dy_sum_red[plane];
-    float dot_p_plane = dot_p_red[plane];
-    dscale[plane] = static_cast<S>(dot_p_plane * invstd);
-    dbias[plane] = static_cast<S>(dy_sum_plane);
-  }
-}
-
-template <typename T, typename G>
-void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean,
-                             G *saved_invstd, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream) {
-  SyncBatchNormGradPre<<<C, GET_THREADS, 0, cuda_stream>>>(N, C, H, W, x_input, dy, saved_mean, saved_invstd,
-                                                          dy_sum_local, dot_p_local);
-  return;
-}
-template <typename T, typename S, typename G>
-void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx,
-                              G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale,
-                              S *dbias, float epsilon, cudaStream_t cuda_stream) {
-  SyncBatchNormGradPost<<<C, GET_THREADS, 0, cuda_stream>>>(N, C, H, W, x_input, dy, dx, saved_mean, saved_invstd,
-                                                            dy_sum_red, dot_p_red, scale, dscale, dbias, epsilon);
-  SyncBatchNormGradPostScaleBias<<<GET_BLOCKS(C), std::min(C, static_cast<size_t>(GET_THREADS)), 0, cuda_stream>>>(
-    C, saved_invstd, dy_sum_red, dot_p_red, dscale, dbias);
-}
-// PRE FUNCTION
-template void CalSyncBatchNormGradPre<float, float>(size_t N, size_t C, size_t H, size_t W, const float *x_input,
-                                                    const float *dy, float *saved_mean, float *saved_invstd,
-                                                    float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPre<float, half>(size_t N, size_t C, size_t H, size_t W, const float *x_input,
-                                                   const float *dy, half *saved_mean, half *saved_invstd,
-                                                   float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPre<half, float>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                   const half *dy, float *saved_mean, float *saved_invstd,
-                                                   float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPre<half, half>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                  const half *dy, half *saved_mean, half *saved_invstd,
-                                                  float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream);
-// POST FUNCTION
-template void CalSyncBatchNormGradPost<float, float, float>(size_t N, size_t C, size_t H, size_t W,
-                                                            const float *x_input, const float *dy, float *dx,
-                                                            float *saved_mean, float *saved_invstd, float *dy_sum_red,
-                                                            float *dot_p_red, float *scale, float *dscale, float *dbias,
-                                                            float epsilon, cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<half, float, float>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                           const half *dy, half *dx, float *saved_mean,
-                                                           float *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                           float *scale, float *dscale, float *dbias, float epsilon,
-                                                           cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<float, half, float>(size_t N, size_t C, size_t H, size_t W, const float *x_input,
-                                                           const float *dy, float *dx, float *saved_mean,
-                                                           float *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                           half *scale, half *dscale, half *dbias, float epsilon,
-                                                           cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<half, half, float>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                          const half *dy, half *dx, float *saved_mean,
-                                                          float *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                          half *scale, half *dscale, half *dbias, float epsilon,
-                                                          cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<float, float, half>(size_t N, size_t C, size_t H, size_t W, const float *x_input,
-                                                           const float *dy, float *dx, half *saved_mean,
-                                                           half *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                           float *scale, float *dscale, float *dbias, float epsilon,
-                                                           cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<half, float, half>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                          const half *dy, half *dx, half *saved_mean,
-                                                          half *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                          float *scale, float *dscale, float *dbias, float epsilon,
-                                                          cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<float, half, half>(size_t N, size_t C, size_t H, size_t W, const float *x_input,
-                                                          const float *dy, float *dx, half *saved_mean,
-                                                          half *saved_invstd, float *dy_sum_red, float *dot_p_red,
-                                                          half *scale, half *dscale, half *dbias, float epsilon,
-                                                          cudaStream_t cuda_stream);
-template void CalSyncBatchNormGradPost<half, half, half>(size_t N, size_t C, size_t H, size_t W, const half *x_input,
-                                                         const half *dy, half *dx, half *saved_mean, half *saved_invstd,
-                                                         float *dy_sum_red, float *dot_p_red, half *scale, half *dscale,
-                                                         half *dbias, float epsilon, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh
deleted file mode 100644
index 9378cde8580..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// /**
-//  * Copyright 2021 Huawei Technologies Co., Ltd
-//  *
-//  * Licensed under the Apache License, Version 2.0 (the "License");
-//  * you may not use this file except in compliance with the License.
-//  * You may obtain a copy of the License at
-//  *
-//  * http://www.apache.org/licenses/LICENSE-2.0
-//  *
-//  * Unless required by applicable law or agreed to in writing, software
-//  * distributed under the License is distributed on an "AS IS" BASIS,
-//  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  * See the License for the specific language governing permissions and
-//  * limitations under the License.
-//  */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T, typename G>
-void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean,
-                             G *invstd_saved, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream);
-template <typename T, typename S, typename G>
-void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx,
-                              G *saved_mean, G *invstd_saved, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale,
-                              S *dbias, float epsilon, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh
deleted file mode 100644
index 4a12e2b3b92..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-// /**
-//  * Copyright 2021 Huawei Technologies Co., Ltd
-//  *
-//  * Licensed under the Apache License, Version 2.0 (the "License");
-//  * you may not use this file except in compliance with the License.
-//  * You may obtain a copy of the License at
-//  *
-//  * http://www.apache.org/licenses/LICENSE-2.0
-//  *
-//  * Unless required by applicable law or agreed to in writing, software
-//  * distributed under the License is distributed on an "AS IS" BASIS,
-//  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  * See the License for the specific language governing permissions and
-//  * limitations under the License.
-//  */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const T *input, int *output_n, float *means_local,
-                         float *invstds_local, float epsilon, cudaStream_t cuda_stream);
-template <typename T, typename G>
-void CalSyncBatchNormGather(size_t N, size_t C, size_t H, size_t W, int *counts_global, float *means_global,
-                            float *invstds_global, int *counts_local, float *means_local, float *invstds_local,
-                            T *running_mean_output, T *running_var_output, G *running_mean_input, G *running_var_input,
-                            float epsilon, float momentum, size_t group_rank, size_t group_size,
-                            cudaStream_t cuda_stream);
-template <typename T, typename S>
-void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input, T *output, float *means_local,
-                          float *invstds_local, S *scale, S *bias, S *output_scale, S *output_bias, float epsilon,
-                          cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu
deleted file mode 100644
index 7e8154a0510..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void TensorScatterAddKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
-                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
-                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      MsAtomicAdd(&output[write_index], update[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream) {
-  TensorScatterAddKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
-  return;
-}
-
-template void TensorScatterAdd<half, int>(half *input, int *indices, half *update, half *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-template void TensorScatterAdd<float, int>(float *input, int *indices, float *update, float *output,
-                                           const size_t &block_size, const size_t &input_size,
-                                           const size_t &output_size, const size_t &indices_dim_0,
-                                           const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                           cudaStream_t stream);
-template void TensorScatterAdd<double, int>(double *input, int *indices, double *update, double *output,
-                                            const size_t &block_size, const size_t &input_size,
-                                            const size_t &output_size, const size_t &indices_dim_0,
-                                            const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                            cudaStream_t stream);
-template void TensorScatterAdd<char, int>(char *input, int *indices, char *update, char *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-template void TensorScatterAdd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *update,
-                                                   unsigned char *output, const size_t &block_size,
-                                                   const size_t &input_size, const size_t &output_size,
-                                                   const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                   int *indices_stride, int *work_shape, cudaStream_t stream);
-template void TensorScatterAdd<int, int>(int *input, int *indices, int *update, int *output, const size_t &block_size,
-                                         const size_t &input_size, const size_t &output_size,
-                                         const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                         int *work_shape, cudaStream_t stream);
-template void TensorScatterAdd<double, int64_t>(double *input, int64_t *indices, double *update, double *output,
-                                                const size_t &block_size, const size_t &input_size,
-                                                const size_t &output_size, const size_t &indices_dim_0,
-                                                const size_t &indices_dim_1, int64_t *indices_stride,
-                                                int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh
deleted file mode 100644
index f92f4e2ad99..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu
deleted file mode 100644
index 0f47a19dd48..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void TensorScatterMaxKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
-                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
-                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      MsAtomicMax(&output[write_index], update[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream) {
-  TensorScatterMaxKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
-  return;
-}
-
-// for int32 index
-template void TensorScatterMax<half, int>(half *input, int *indices, half *update, half *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-
-template void TensorScatterMax<float, int>(float *input, int *indices, float *update, float *output,
-                                           const size_t &block_size, const size_t &input_size,
-                                           const size_t &output_size, const size_t &indices_dim_0,
-                                           const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                           cudaStream_t stream);
-
-template void TensorScatterMax<char, int>(char *input, int *indices, char *update, char *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-
-template void TensorScatterMax<unsigned char, int>(unsigned char *input, int *indices, unsigned char *update,
-                                                   unsigned char *output, const size_t &block_size,
-                                                   const size_t &input_size, const size_t &output_size,
-                                                   const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                   int *indices_stride, int *work_shape, cudaStream_t stream);
-
-template void TensorScatterMax<int, int>(int *input, int *indices, int *update, int *output, const size_t &block_size,
-                                         const size_t &input_size, const size_t &output_size,
-                                         const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                         int *work_shape, cudaStream_t stream);
-
-// for int64 index
-template void TensorScatterMax<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterMax<float, int64_t>(float *input, int64_t *indices, float *update, float *output,
-                                               const size_t &block_size, const size_t &input_size,
-                                               const size_t &output_size, const size_t &indices_dim_0,
-                                               const size_t &indices_dim_1, int64_t *indices_stride,
-                                               int64_t *work_shape, cudaStream_t stream);
-
-template void TensorScatterMax<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterMax<unsigned char, int64_t>(unsigned char *input, int64_t *indices, unsigned char *update,
-                                                       unsigned char *output, const size_t &block_size,
-                                                       const size_t &input_size, const size_t &output_size,
-                                                       const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                       int64_t *indices_stride, int64_t *work_shape,
-                                                       cudaStream_t stream);
-
-template void TensorScatterMax<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                             cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh
deleted file mode 100644
index b09b4165a1c..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu
deleted file mode 100644
index f11791c9706..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void TensorScatterMinKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
-                                          const size_t input_size, const size_t output_size, const size_t indices_dim_0,
-                                          const size_t indices_dim_1, S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      MsAtomicMin(&output[write_index], update[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                         const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                         S *indices_stride, S *work_shape, cudaStream_t stream) {
-  TensorScatterMinKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
-  return;
-}
-
-// for int32 index
-template void TensorScatterMin<half, int>(half *input, int *indices, half *update, half *output,
-                                          const size_t &block_size, const size_t &input_size,
-                                          const size_t &output_size, const size_t &indices_dim_0,
-                                          const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                          cudaStream_t stream);
-
-template void TensorScatterMin<float, int>(float *input, int *indices, float *update, float *output,
-                                           const size_t &block_size, const size_t &input_size,
-                                           const size_t &output_size, const size_t &indices_dim_0,
-                                           const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                           cudaStream_t stream);
-
-template void TensorScatterMin<char, int>(char *input, int *indices, char *update, char *output,
-                                          const size_t &block_size, const size_t &input_size,
-                                          const size_t &output_size, const size_t &indices_dim_0,
-                                          const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                          cudaStream_t stream);
-
-template void TensorScatterMin<unsigned char, int>(unsigned char *input, int *indices, unsigned char *update,
-                                                   unsigned char *output, const size_t &block_size,
-                                                   const size_t &input_size, const size_t &output_size,
-                                                   const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                   int *indices_stride, int *work_shape, cudaStream_t stream);
-
-template void TensorScatterMin<int, int>(int *input, int *indices, int *update, int *output,
-                                         const size_t &block_size, const size_t &input_size,
-                                         const size_t &output_size, const size_t &indices_dim_0,
-                                         const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                         cudaStream_t stream);
-
-// for int64 index
-template void TensorScatterMin<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterMin<float, int64_t>(float *input, int64_t *indices, float *update, float *output,
-                                               const size_t &block_size, const size_t &input_size,
-                                               const size_t &output_size, const size_t &indices_dim_0,
-                                               const size_t &indices_dim_1, int64_t *indices_stride,
-                                               int64_t *work_shape, cudaStream_t stream);
-
-template void TensorScatterMin<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterMin<unsigned char, int64_t>(unsigned char *input, int64_t *indices, unsigned char *update,
-                                                       unsigned char *output, const size_t &block_size,
-                                                       const size_t &input_size, const size_t &output_size,
-                                                       const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                       int64_t *indices_stride, int64_t *work_shape,
-                                                       cudaStream_t stream);
-
-template void TensorScatterMin<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                             cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh
deleted file mode 100644
index c1453c9a8ba..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                         const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                         S *indices_stride, S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu
deleted file mode 100644
index 5230b844b5f..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void TensorScatterSubKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
-                                       const size_t input_size, const size_t output_size, const size_t indices_dim_0,
-                                       const size_t indices_dim_1, S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      MsAtomicSub(&output[write_index], update[read_index]);
-    }
-  }
-}
-
-template <typename T, typename S>
-void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream) {
-  TensorScatterSubKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
-  return;
-}
-
-// for int32 index
-template void TensorScatterSub<half, int>(half *input, int *indices, half *update, half *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-
-template void TensorScatterSub<float, int>(float *input, int *indices, float *update, float *output,
-                                           const size_t &block_size, const size_t &input_size,
-                                           const size_t &output_size, const size_t &indices_dim_0,
-                                           const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                           cudaStream_t stream);
-
-template void TensorScatterSub<char, int>(char *input, int *indices, char *update, char *output,
-                                          const size_t &block_size, const size_t &input_size, const size_t &output_size,
-                                          const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                          int *work_shape, cudaStream_t stream);
-
-template void TensorScatterSub<unsigned char, int>(unsigned char *input, int *indices, unsigned char *update,
-                                                   unsigned char *output, const size_t &block_size,
-                                                   const size_t &input_size, const size_t &output_size,
-                                                   const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                   int *indices_stride, int *work_shape, cudaStream_t stream);
-
-template void TensorScatterSub<int, int>(int *input, int *indices, int *update, int *output, const size_t &block_size,
-                                         const size_t &input_size, const size_t &output_size,
-                                         const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-                                         int *work_shape, cudaStream_t stream);
-
-// for int64 index
-template void TensorScatterSub<half, int64_t>(half *input, int64_t *indices, half *update, half *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterSub<float, int64_t>(float *input, int64_t *indices, float *update, float *output,
-                                               const size_t &block_size, const size_t &input_size,
-                                               const size_t &output_size, const size_t &indices_dim_0,
-                                               const size_t &indices_dim_1, int64_t *indices_stride,
-                                               int64_t *work_shape, cudaStream_t stream);
-
-template void TensorScatterSub<char, int64_t>(char *input, int64_t *indices, char *update, char *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                              cudaStream_t stream);
-
-template void TensorScatterSub<unsigned char, int64_t>(unsigned char *input, int64_t *indices, unsigned char *update,
-                                                       unsigned char *output, const size_t &block_size,
-                                                       const size_t &input_size, const size_t &output_size,
-                                                       const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                       int64_t *indices_stride, int64_t *work_shape,
-                                                       cudaStream_t stream);
-
-template void TensorScatterSub<int, int64_t>(int *input, int64_t *indices, int *update, int *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-                                             cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh
deleted file mode 100644
index 6b691c4b195..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                      const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                      S *indices_stride, S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu
deleted file mode 100644
index db9ba141211..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-__global__ void TensorScatterUpdateKernel(T *input, S *indices, T *update, T *output, const size_t block_size,
-                                          const size_t input_size, const size_t output_size, const size_t indices_dim_0,
-                                          const size_t indices_dim_1, S *indices_stride, S *work_shape) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / block_size;
-    j = read_index % block_size;
-
-    for (size_t k = 0; k < indices_dim_1; k++) {
-      S indices_i = indices[i * indices_dim_1 + k];
-      out_bound |= indices_i >= work_shape[k];
-      write_index += indices_i * indices_stride[k];
-    }
-
-    write_index += j;
-    out_bound |= write_index >= output_size;
-
-    if (!out_bound) {
-      output[write_index] = update[read_index];
-    }
-  }
-}
-
-template <typename T, typename S>
-void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                         const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                         S *indices_stride, S *work_shape, cudaStream_t stream) {
-  TensorScatterUpdateKernel<<<GET_BLOCKS(output_size), GET_THREADS, 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
-  return;
-}
-
-template void TensorScatterUpdate<half, int>(half *input, int *indices, half *update, half *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                             cudaStream_t stream);
-template void TensorScatterUpdate<float, int>(float *input, int *indices, float *update, float *output,
-                                              const size_t &block_size, const size_t &input_size,
-                                              const size_t &output_size, const size_t &indices_dim_0,
-                                              const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                              cudaStream_t stream);
-template void TensorScatterUpdate<double, int>(double *input, int *indices, double *update, double *output,
-                                               const size_t &block_size, const size_t &input_size,
-                                               const size_t &output_size, const size_t &indices_dim_0,
-                                               const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                               cudaStream_t stream);
-template void TensorScatterUpdate<char, int>(char *input, int *indices, char *update, char *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                             cudaStream_t stream);
-template void TensorScatterUpdate<unsigned char, int>(unsigned char *input, int *indices, unsigned char *update,
-                                                      unsigned char *output, const size_t &block_size,
-                                                      const size_t &input_size, const size_t &output_size,
-                                                      const size_t &indices_dim_0, const size_t &indices_dim_1,
-                                                      int *indices_stride, int *work_shape, cudaStream_t stream);
-template void TensorScatterUpdate<int, int>(int *input, int *indices, int *update, int *output,
-                                            const size_t &block_size, const size_t &input_size,
-                                            const size_t &output_size, const size_t &indices_dim_0,
-                                            const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                            cudaStream_t stream);
-template void TensorScatterUpdate<bool, int>(bool *input, int *indices, bool *update, bool *output,
-                                             const size_t &block_size, const size_t &input_size,
-                                             const size_t &output_size, const size_t &indices_dim_0,
-                                             const size_t &indices_dim_1, int *indices_stride, int *work_shape,
-                                             cudaStream_t stream);
-template void TensorScatterUpdate<bool, int64_t>(bool *input, int64_t *indices, bool *update, bool *output,
-                                                 const size_t &block_size, const size_t &input_size,
-                                                 const size_t &output_size, const size_t &indices_dim_0,
-                                                 const size_t &indices_dim_1, int64_t *indices_stride,
-                                                 int64_t *work_shape, cudaStream_t stream);
-template void TensorScatterUpdate<float, int64_t>(float *input, int64_t *indices, float *update, float *output,
-                                                   const size_t &block_size, const size_t &input_size,
-                                                   const size_t &output_size, const size_t &indices_dim_0,
-                                                   const size_t &indices_dim_1, int64_t *indices_stride,
-                                                   int64_t *work_shape, cudaStream_t stream);
-template void TensorScatterUpdate<double, int64_t>(double *input, int64_t *indices, double *update, double *output,
-                                                   const size_t &block_size, const size_t &input_size,
-                                                   const size_t &output_size, const size_t &indices_dim_0,
-                                                   const size_t &indices_dim_1, int64_t *indices_stride,
-                                                   int64_t *work_shape, cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh
deleted file mode 100644
index c5e59b9fee9..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T, typename S>
-void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
-                         const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
-                         S *indices_stride, S *work_shape, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu
deleted file mode 100644
index 233259155c9..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh"
-
-template <typename T>
-__global__ void Tile(const size_t output_size, const size_t input_size, const size_t shape_size,
-                     const size_t *input_shape, const size_t *output_shape, const T *input, T *output) {
-  // for example 4-D: pos = pos_array[0] * output_shape[1] * output_shape[2] * output_shape[3] +
-  //                        pos_array[1] * output_shape[2] * output_shape[3] +
-  //                        pos_array[2] * output_shape[3] +
-  //                        pos_array[3]
-  size_t pos_array[TILE_MAX_DIMENSION];
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) {
-    size_t tmp_pos = pos;
-    size_t pos_size = output_size / output_shape[0];
-    pos_array[0] = tmp_pos / pos_size;
-    for (size_t i = 1; i < shape_size; i++) {
-      tmp_pos -= pos_array[i - 1] * pos_size;
-      pos_size = pos_size / output_shape[i];
-      pos_array[i] = tmp_pos / pos_size;
-    }
-    for (size_t i = 0; i < shape_size; i++) {
-      pos_array[i] = pos_array[i] % input_shape[i];
-    }
-    pos_size = input_size;
-    size_t input_pos = 0;
-    for (size_t i = 0; i < shape_size; i++) {
-      pos_size /= input_shape[i];
-      input_pos += (pos_array[i] * pos_size);
-    }
-    output[pos] = input[input_pos];
-  }
-}
-
-template <typename T>
-void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape,
-             const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream) {
-  Tile<<<GET_BLOCKS(output_size), GET_THREADS, 0, cuda_stream>>>(output_size, input_size, shape_size, input_shape,
-                                                                 output_shape, input, output);
-  return;
-}
-
-template void CalTile<double>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                              const size_t *input_shape, const size_t *output_shape, const double *input,
-                              double *output, cudaStream_t cuda_stream);
-template void CalTile<float>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                             const size_t *input_shape, const size_t *output_shape, const float *input, float *output,
-                             cudaStream_t cuda_stream);
-template void CalTile<half>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                            const size_t *input_shape, const size_t *output_shape, const half *input, half *output,
-                            cudaStream_t cuda_stream);
-template void CalTile<int16_t>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                               const size_t *input_shape, const size_t *output_shape, const int16_t *input,
-                               int16_t *output, cudaStream_t cuda_stream);
-template void CalTile<int>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                           const size_t *input_shape, const size_t *output_shape, const int *input, int *output,
-                           cudaStream_t cuda_stream);
-template void CalTile<int64_t>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                               const size_t *input_shape, const size_t *output_shape, const int64_t *input,
-                               int64_t *output, cudaStream_t cuda_stream);
-template void CalTile<bool>(const size_t output_size, const size_t input_size, const size_t shape_size,
-                               const size_t *input_shape, const size_t *output_shape, const bool *input,
-                               bool *output, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh
deleted file mode 100644
index 97e43f5dcd5..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_
-
-#define TILE_MAX_DIMENSION 100
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename T>
-void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape,
-             const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu
deleted file mode 100755
index 9dbe1656720..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_runtime.h>
-
-#include "transpose_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "utils/complex.h"
-
-template <typename T>
-using Complex = mindspore::utils::Complex<T>;
-
-template <typename T>
-__global__ void Transpose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,
-                          const size_t shape_size, T *output) {
-  size_t pos_size;
-  size_t temp_pos;
-  size_t newpos;
-  size_t newpos_size;
-  size_t pos_array[TRANSPOSE_MAX_DIMENSION];
-
-  // for example 4-D: pos = posArray[0] * input_shape[1] * input_shape[2] * input_shape[3] +
-  //                        posArray[1] * input_shape[2] * input_shape[3] +
-  //                        posArray[2] * input_shape[3] +
-  //                        posArray[3]
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
-    temp_pos = pos;
-    pos_size = size / input_shape[0];
-    pos_array[0] = temp_pos / pos_size;
-    for (size_t i = 1; i < shape_size; i++) {
-      temp_pos -= pos_array[i - 1] * pos_size;
-      pos_size = pos_size / input_shape[i];
-      pos_array[i] = temp_pos / pos_size;
-    }
-
-    newpos = pos_array[input_axis[shape_size - 1]];
-    newpos_size = 1;
-    for (int64_t j = shape_size - 2; j >= 0; j--) {
-      newpos_size *= input_shape[input_axis[j + 1]];
-      newpos += pos_array[input_axis[j]] * newpos_size;
-    }
-
-    output[newpos] = input[pos];
-  }
-}
-template <typename T>
-void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,
-                  const size_t shape_size, T *output, cudaStream_t cuda_stream) {
-  Transpose<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, input_shape, input_axis, shape_size,
-                                                               output);
-}
-
-template void CalTranspose<double>(const size_t size, const double *input, const size_t *input_shape,
-                                   const size_t *input_axis, const size_t shape_size, double *output,
-                                   cudaStream_t cuda_stream);
-template void CalTranspose<float>(const size_t size, const float *input, const size_t *input_shape,
-                                  const size_t *input_axis, const size_t shape_size, float *output,
-                                  cudaStream_t cuda_stream);
-template void CalTranspose<half>(const size_t size, const half *input, const size_t *input_shape,
-                                 const size_t *input_axis, const size_t shape_size, half *output,
-                                 cudaStream_t cuda_stream);
-template void CalTranspose<int>(const size_t size, const int *input, const size_t *input_shape,
-                                const size_t *input_axis, const size_t shape_size, int *output,
-                                cudaStream_t cuda_stream);
-template void CalTranspose<int64_t>(const size_t size, const int64_t *input, const size_t *input_shape,
-                                    const size_t *input_axis, const size_t shape_size, int64_t *output,
-                                    cudaStream_t cuda_stream);
-template void CalTranspose<Complex<float>>(const size_t size, const Complex<float> *input, const size_t *input_shape,
-                                const size_t *input_axis, const size_t shape_size, Complex<float> *output,
-                                cudaStream_t cuda_stream);
-template void CalTranspose<Complex<double>>(const size_t size, const Complex<double> *input, const size_t *input_shape,
-                                    const size_t *input_axis, const size_t shape_size, Complex<double> *output,
-                                    cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh
deleted file mode 100644
index e7c6306d299..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_
-
-#include <cuda_runtime.h>
-
-#define TRANSPOSE_MAX_DIMENSION 100
-template <typename T>
-void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const T *d_input, const size_t *input_shape,
-                           const size_t *input_axis, const size_t *d_input_shape, const size_t *d_input_axis, T *output,
-                           cudaStream_t cuda_stream);
-
-template <typename T>
-void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T *d_input, const size_t *input_shape,
-                           const size_t *input_axis, const size_t *d_input_shape, const size_t *d_input_axis, T *output,
-                           cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh
deleted file mode 100644
index bdf14788930..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t uplo, const size_t count,
-                        const size_t ldb, const size_t m, cudaStream_t cuda_stream);
-
-template <typename T>
-void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh
deleted file mode 100755
index 36c5c7fbe43..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-template <typename T>
-void SqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void AsinGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void ACosGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void AtanGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void AsinhGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void AcoshGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu
deleted file mode 100755
index 63a942c6123..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu
+++ /dev/null
@@ -1,820 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "unary_op_impl.cuh"
-template <typename T>
-__global__ void ExponentialKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = expf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void ExponentialKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = exp(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void ExponentialKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hexp(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void Expm1Kernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = expm1f(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void Expm1Kernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = expm1(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void LogarithmKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = logf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void LogarithmKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = log(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void LogarithmKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hlog(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void Log1pKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = log1pf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void Log1pKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = log1p(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void ErfKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = erff(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void ErfKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = erf(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void ErfcKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = erfcf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void ErfcKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = erfc(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void NegativeKernel(const T *input, T *output, const size_t count) {
-  T neg_one = -1;
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = neg_one * input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void ReciprocalKernel(const T *input, T *output, const size_t count) {
-  T one = 1.0;
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = one / input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void SquareKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i] * input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void SqrtKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = sqrtf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void SqrtKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = sqrt(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void SqrtKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hsqrt(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void RsqrtKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = rsqrtf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void RsqrtKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = rsqrt(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void RsqrtKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hrsqrt(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void SinKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = sinf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void SinKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = sin(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void SinKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hsin(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void AsinKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = asinf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void AsinKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = asin(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void AsinhKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = asinhf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void AsinhKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = asinh(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void CosKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = cosf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void CosKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = cos(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void CosKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hcos(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void ACosKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = acosf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void ACosKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = acos(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void AcoshKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = acoshf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void AcoshKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = acosh(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void AtanKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = atanf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void AtanKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = atan(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void AbsKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = abs(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void AbsKernel(const half *input, half *output, const size_t count) {
-  half zero = 0.0;
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i] < zero ? -input[i] : input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void AbsKernel(const Complex<T> *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = abs(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void RealKernel(const Complex<T> *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i].real();
-  }
-  return;
-}
-template <typename T>
-__global__ void RealKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void ImagKernel(const Complex<T> *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i].imag();
-  }
-  return;
-}
-template <typename T>
-__global__ void ImagKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    T zero = 0;
-    output[i] = zero;
-  }
-  return;
-}
-template <typename T>
-__global__ void ConjKernel(const Complex<T> *input, Complex<T> *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = Complex<T>(input[i].real(), -input[i].imag());
-  }
-  return;
-}
-template <typename T>
-__global__ void ConjKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = input[i];
-  }
-  return;
-}
-template <typename T>
-__global__ void FloorKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = floorf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void FloorKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = floor(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void FloorKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hfloor(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void RintKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = rintf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void RintKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = rint(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void RintKernel(const half *input, half *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = hrint(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void RoundKernel(const T *input, T *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = nearbyintf(input[i]);
-  }
-  return;
-}
-template <>
-__global__ void RoundKernel(const double *input, double *output, const size_t count) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    output[i] = nearbyint(input[i]);
-  }
-  return;
-}
-template <typename T>
-__global__ void SignKernel(const T *input, T *output, const size_t count) {
-  T zero = 0.0;
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    T res;
-    if (input[i] < zero) {
-      res = -1;
-    } else if (input[i] > zero) {
-      res = 1;
-    } else {
-      res = 0;
-    }
-    output[i] = static_cast<T>(res);
-  }
-  return;
-}
-template <typename T>
-void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  Expm1Kernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  LogarithmKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  Log1pKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ErfKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ErfcKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  NegativeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ReciprocalKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  SquareKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Pow(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  PowKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  SqrtKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  SinKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  CosKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AsinKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ACosKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AtanKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AsinhKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AcoshKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  RsqrtKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Abs(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Real(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  RealKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  RealKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Imag(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ImagKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ImagKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Conj(const Complex<T> *input, Complex<T> *output, const size_t count, cudaStream_t cuda_stream) {
-  ConjKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  ConjKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  RintKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  RoundKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-template <typename T>
-void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) {
-  SignKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
-  return;
-}
-
-// double
-template void Exponential<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Expm1<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Logarithm<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Log1p<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Erf<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Erfc<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Negative<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Reciprocal<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Square<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Sqrt<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Sin<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Cos<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Asin<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void ACos<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Atan<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Asinh<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Acosh<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Rsqrt<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Abs<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Floor<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Rint<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Round<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Sign<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Real<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<double>(const double *input, double *output, const size_t count, cudaStream_t cuda_stream);
-
-
-// float
-template void Exponential<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Expm1<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Logarithm<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Log1p<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Erf<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Erfc<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Negative<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Reciprocal<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Square<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Sqrt<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Sin<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Cos<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Asin<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void ACos<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Atan<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Asinh<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Acosh<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Rsqrt<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Abs<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Floor<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Rint<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Round<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Sign<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Real<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<float>(const float *input, float *output, const size_t count, cudaStream_t cuda_stream);
-
-// half
-template void Exponential<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Expm1<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Logarithm<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Log1p<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Erf<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Erfc<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Negative<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Reciprocal<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Square<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Sqrt<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Sin<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Cos<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Asin<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void ACos<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Atan<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Asinh<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Acosh<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Rsqrt<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Abs<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Floor<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Rint<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Round<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Sign<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Real<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<half>(const half *input, half *output, const size_t count, cudaStream_t cuda_stream);
-
-// int8
-template void Exponential<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Expm1<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Logarithm<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Log1p<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Erf<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Erfc<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Negative<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Reciprocal<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Square<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Sqrt<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Sin<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Cos<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Asin<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void ACos<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Atan<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Asinh<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Acosh<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Rsqrt<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Abs<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Floor<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Rint<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Round<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Sign<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Real<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<char>(const char *input, char *output, const size_t count, cudaStream_t cuda_stream);
-
-// uint8
-template void Exponential<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                         cudaStream_t cuda_stream);
-template void Expm1<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Logarithm<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                       cudaStream_t cuda_stream);
-template void Log1p<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Erf<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                 cudaStream_t cuda_stream);
-template void Erfc<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Negative<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                      cudaStream_t cuda_stream);
-template void Reciprocal<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                        cudaStream_t cuda_stream);
-template void Square<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                    cudaStream_t cuda_stream);
-template void Sqrt<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Sin<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                 cudaStream_t cuda_stream);
-template void Cos<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                 cudaStream_t cuda_stream);
-template void Asin<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void ACos<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Atan<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Asinh<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Acosh<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Rsqrt<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Abs<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                 cudaStream_t cuda_stream);
-template void Floor<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Rint<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Round<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                   cudaStream_t cuda_stream);
-template void Sign<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Real<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Imag<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-template void Conj<unsigned char>(const unsigned char *input, unsigned char *output, const size_t count,
-                                  cudaStream_t cuda_stream);
-
-// int32
-template void Exponential<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Expm1<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Logarithm<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Log1p<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Erf<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Erfc<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Negative<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Reciprocal<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Square<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Sqrt<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Sin<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Cos<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Asin<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void ACos<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Atan<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Asinh<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Acosh<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Rsqrt<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Abs<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Floor<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Rint<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Round<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Sign<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Real<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<int>(const int *input, int *output, const size_t count, cudaStream_t cuda_stream);
-
-// complex64
-template void Real<float>(const Complex<float> *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<float>(const Complex<float> *input, float *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<float>(const Complex<float> *input, Complex<float> *output, const size_t count,
-                          cudaStream_t cuda_stream);
-
-// complex128
-template void Real<double>(const Complex<double> *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<double>(const Complex<double> *input, double *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<double>(const Complex<double> *input, Complex<double> *output, const size_t count,
-                           cudaStream_t cuda_stream);
-
-// bool
-template void Real<bool>(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<bool>(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream);
-
-// int16
-template void Real<int16_t>(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<int16_t>(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<int16_t>(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream);
-
-// uint16
-template void Real<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<uint16_t>(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream);
-
-// uint32
-template void Real<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<uint32_t>(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream);
-
-// int64
-template void Real<int64_t>(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<int64_t>(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<int64_t>(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream);
-
-// uint64
-template void Real<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Imag<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream);
-template void Conj<uint64_t>(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh
deleted file mode 100755
index 346f7bcdf50..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_
-
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-#include "utils/complex.h"
-template <typename T>
-void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Real(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Imag(const Complex<T> *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Conj(const Complex<T> *input, Complex<T> *output, const size_t count, cudaStream_t cuda_stream);
-template <typename T>
-void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh
deleted file mode 100644
index 0432e0ac384..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
-
-template <typename S>
-void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, const S prob_val,
-                                S *true_expected_count, S *sampled_expected_count, cudaStream_t cuda_stream);
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh
deleted file mode 100644
index 92dc4740df7..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_
-template <typename T, typename S>
-int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index, T *output, S *index,
-               cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu
deleted file mode 100755
index 8af59861828..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda_runtime.h>
-#include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh"
-template <typename T>
-__global__ void Unpack(const size_t size, const size_t output_num,
-                       const size_t dims_after_axis, T** outputs, const T* input) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-      size_t cur_input_index = pos / dims_after_axis % output_num;
-      size_t cycle_len = output_num * dims_after_axis;
-      size_t local_index = pos / cycle_len * dims_after_axis + pos % cycle_len % dims_after_axis;
-      outputs[cur_input_index][local_index] = input[pos];
-  }
-  return;
-}
-
-template <typename T>
-void UnpackKernel(const size_t size, const size_t output_num,
-                  const size_t dims_after_axis, T** outputs, const T* input,
-                  cudaStream_t cuda_stream) {
-  Unpack<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, output_num,
-                                                            dims_after_axis, outputs, input);
-  return;
-}
-
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, int8_t** outputs, const int8_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, int16_t** outputs, const int16_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, int** outputs, const int* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, int64_t** outputs, const int64_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, uint8_t** outputs, const uint8_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, uint16_t** outputs, const uint16_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, uint32_t** outputs, const uint32_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, uint64_t** outputs, const uint64_t* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, half** outputs, const half* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, float** outputs, const float* input,
-                           cudaStream_t cuda_stream);
-template void UnpackKernel(const size_t size, const size_t output_num,
-                           const size_t dims_after_axis, bool** outputs, const bool* input,
-                           cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu
deleted file mode 100644
index 0f37245f60d..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
-
-template<typename T, typename S>
-__global__ void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                       T* input_addr, S* ids_addr, T* output_addr) {
-  for (int input_index = blockIdx.x * blockDim.x + threadIdx.x; input_index < input_dim0 * input_dim1;
-      input_index += blockDim.x * gridDim.x) {
-    size_t j = input_index / input_dim1;
-    size_t k = input_index % input_dim1;
-
-    S i = ids_addr[j];
-    if (i < 0 || i >= output_dim0) {
-      continue;
-    }
-    size_t output_index = i * output_dim1 + k;
-    MsAtomicAdd(output_addr + output_index, input_addr[input_index]);
-  }
-}
-
-template<typename T, typename S>
-void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                        T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) {
-  int size = input_dim0 * input_dim1;
-  UnsortedSegmentSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input_dim0, input_dim1,
-                                  output_dim0, output_dim1, input_addr, ids_addr, output_addr);
-  return;
-}
-
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream);
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream);
-
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream);
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream);
-
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 half* input_addr, int* ids_addr, half* output_addr, cudaStream_t stream);
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream);
-
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 int* input_addr, int* ids_addr, int* output_addr, cudaStream_t stream);
-template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1,
-                                 int* input_addr, int64_t* ids_addr, int* output_addr, cudaStream_t stream);
-
-
-
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc
index d4c22f699e6..22b67974a1a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc
@@ -18,7 +18,7 @@
 
 #include "utils/ms_utils.h"
 #include "runtime/device/kernel_info.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h
index 97b13b8b379..631ad498e49 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h
index 3fa7f97df38..52388592dea 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h
index a74873cd924..febbe873e47 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include <complex>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h
index 4ec4d963aa8..5b965853545 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h
index 0db01af46bb..cdb0885abf1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h
index 76ae9eec4e5..2c335613362 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h
index 09ce07ecc10..7da6c9b7241 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h
@@ -21,9 +21,9 @@
 #include <vector>
 #include <string>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h
index da5be233b06..d6af6258a42 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include <string>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h
index 00a2975f149..603be4333ea 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h
@@ -21,8 +21,8 @@
 #include <vector>
 #include <string>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h
index 5eb46717038..78dae0b7918 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h
index 8e0bcb52731..cb2887af803 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h
index 2d94bc31184..824482d83e6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h
index 40039a605c8..9afa14b8df1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h
@@ -25,15 +25,15 @@
 #include <complex>
 #include <algorithm>
 #include <type_traits>
-#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/convert_utils.h"
-#include "utils/complex.h"
-#include "plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h
index 98dc8cf3a3a..cbb627cf771 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h
@@ -24,13 +24,13 @@
 #include <complex>
 #include <algorithm>
 #include <type_traits>
-#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/convert_utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h
index 1c9aa4ab6be..d62c649bc84 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h
@@ -28,9 +28,9 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h
index 5029e91941c..8acd40ebcbd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h
index 758b3148827..a789bfc9ccf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h
@@ -23,8 +23,8 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h
index 22ce1d636f8..706d17c2fea 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h
index de37b2896c9..738d602dedd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h
@@ -22,7 +22,7 @@
 #include <iostream>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/linspace.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h
index 915f3e178b8..82709487c0b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include <algorithm>
 #include <functional>
-#include "plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "utils/convert_utils.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h
index 75139d839bd..08bd7ec2fe1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h
@@ -26,7 +26,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/convert_utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h
index e8a5b0e3388..4f11a03c568 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h
@@ -26,7 +26,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/convert_utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h
index 5b78d5a0c03..c6114592578 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh"
 #include "utils/convert_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h
index b3eaadd590c..297a7e20b1d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h
@@ -26,8 +26,8 @@
 #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h
index d5e2fcd7c70..521317e8963 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <iostream>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h
index 6b85adafe9b..810c1a28d86 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh"
 #include "include/curand.h"
 #include "utils/ms_context.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h
index 66ce1e64541..6093bd8c777 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h
index 90736d2be71..54e9c61330b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h
index 8c5e5c778dd..170a342e0ce 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc
index 7a7b8653c80..b5d19e3974c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "utils/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 #include "plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h
index 04440f79f4b..d27110cf9b3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h
index 01f390e530d..a12768c26ed 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h
index 4e64d99acd3..3e17aeeb6c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h
@@ -23,8 +23,8 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "utils/convert_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h
index 43d9b03c421..a06f9cdd9fd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h
@@ -25,7 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h
index 2f3ac96e6a4..d4d34fc69e8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h
@@ -25,7 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h
index 17cb6c616e5..0267002ca20 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h
index 9cf9ee60322..34b2471bbf2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h
index 1859fc83329..e3c9a9cfdf3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 10;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h
index 59cadec12c8..c2260757903 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 9;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h
index 758d2042719..cab901423cb 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h
index 0ae0cc8fe6f..adc7b8f67cc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h
index 7be111f46f2..a61fec56d51 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h
index 9c7d6dbe5fb..a9d4dc3ff82 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h
index f699b98091c..4c210ba30b2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h
index f745ca0ffa6..ab56bc9c3d5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h
index 1453a1b3b9d..8b12f8e0cc4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h
index f8679afdade..d28c1b9370d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h
index e151dc6d618..f73a5337ede 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh"
 
 namespace mindspore::kernel {
 constexpr size_t kArgMaxDim = 7;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h
index 4e162839f86..7652713b5f6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h
index ea1ef4b8e01..b107df67ece 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h
index e2f6aafd961..ab5ad4245dd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h
index 7047b900dc5..3b5660483b8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <vector>
 #include <map>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h
index 82083b5cd66..9c38cf96184 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h
index d7127c2d4ba..a47db13b235 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h
@@ -21,11 +21,11 @@
 #include <string>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h
index cb325b0d951..28e887bdf71 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h
index 795ea225198..b47f90f2e8e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <utility>
 #include <vector>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h
index 3ff6d4c7fc6..55f60491ec8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
-#include "plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t kPrevOutput0th = 0;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h
index dc517b7df10..f922b08c458 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh"
 #include "include/curand.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h
index ec803510ed9..717678c10cb 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh"
 #include "include/curand.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h
index 21993303113..f29a282d694 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h
index b6e00a6fe66..381a2f6fe1e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 8;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h
index 8feb0255f65..8e95bcf250f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h
index 12c4d3ff358..32bcc041b97 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h
index 015395292e0..c258eca61c2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h
index b5d06a0f7bf..9579ad694f0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 6;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h
index f67bba6eecb..5ce11019c63 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 6;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h
index 06cc7d9bee4..e86efa29e10 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 7;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h
index 7a1edeeee0d..1080e6f9423 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h
index 62554dc8f53..b37c8abc739 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h
index f5072507216..39991192200 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h
index 1bae26179d9..e5e672d8141 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h
index 4123ff2a091..01b248e015a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h
index d772c1cda4d..2c6e4535a2d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h
index 177f811f3ee..1730617aac6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h
index d3039b119b5..759b319c067 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "utils/utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h
index 812edbdf594..c9e9a9775ff 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h
index a2c1aa81efc..75f4fb23d3b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h
index c5125546430..03ce54b2374 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h
index 121685e9f11..fab679c2ea4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h
index d10b352a5da..9fe274988c0 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h
index 59f6bd61b70..9bc08a94ace 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h
index b0f9d5bfb69..5a398972e02 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h
index 134c5699f35..7d1790adccc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h
index 4ddd774e096..e64fd9b3058 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h
index b9bc6a4987b..eb9918ffaf2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
 #include "utils/utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h
index 7dde87fc38b..c0e0bd481b8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
 #include "utils/utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h
index 659aa168081..458b75890ce 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h
index d263c02f91b..065312c5d93 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h
index 79c07c84d66..efb3a8f08b2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h
index 06d8d715684..c7d837fa2b4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h
index 03e667c944d..3673fb93302 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 5;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h
index fe5c27f1959..e1641a8f3d6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h
index c5533e4fb24..45193109ad8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh"
 #include "kernel/common_utils.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h
index 972902526be..f1aa84afe74 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <string>
 #include <algorithm>
 
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h
index cc5f2e4a8f7..ff1286d9161 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h
index 0efd058fc8c..960db387035 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h
index 6c16d19b6b9..aa1d0743992 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h
@@ -23,7 +23,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h
index 373bc58aa8f..cc3fbe80f5a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h
index 34f0db1d592..4998d402856 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h
index 8a0d4a86eba..9a481ae8515 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h
index 6573863641d..51e700b3032 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h
index ce5852877f0..4022810d7dd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h
index c3b9708d26a..d9e0395bcab 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h
index 72be64cf00f..5eae299261b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <functional>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h
index 5059dbd3422..eb3855ff0e5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h
index 28f39509f1b..dd83598bce8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h
index 9ff5d483084..28d8b6ca0ae 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h
index 53f631a1da1..52b08df28d3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h
index 7578704d7bb..f655555a152 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h
index 82c9f815f31..e78a7c82b20 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_NN_SGD_KERNEL_H_
 
 #include <vector>
-#include "plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h
index f2491bfae49..70f6873cc58 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h
index e92086d25f3..31f51d8a64c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h
index 4d227013cd0..abf91d8fa05 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h
index 57b81854cf6..a119076e0ff 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h
index f712642eb47..682036ae87c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h
index 3ab78abb323..08d49d889ff 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h
index dad15533522..188059f7ab2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h
index 4fab63e732f..a33f0f23326 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h
index 06abc62edd5..887ba91629e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h
index b0c8c69ef0e..2916745ee4b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h
@@ -25,7 +25,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h
index ded51c00bb3..990127a0605 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t INPUT_NUM = 5;
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h
index d5fc085fc9b..1f467e88714 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h
index 2b01569cf25..390de32af9b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <string>
 #include <algorithm>
-#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h
index 28559c72e67..d89f80be69f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <string>
-#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h
index 77957902c5e..8dfa8e82cd9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <string>
-#include "plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h
index b19bf3b8b8f..7a9a7ac6e69 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h
@@ -24,7 +24,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc
index 06c0b8fb136..b846f51d774 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc
@@ -16,7 +16,7 @@
 #include "plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.h"
 #include <functional>
 #include "kernel/common_utils.h"
-#include "plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h
index 9aa4339681e..3c09ddee697 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include <string>
-#include "plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h
index e1a0c4ea144..ffef444f504 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h
index 0c0c0b93df8..92c8ef0f649 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h
index 840c1d408df..f24ff530b6d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h
index 3c10b80cb17..8ba620b24a7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h
index 002ec255724..b17888a5d32 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc
index 2d18708d4ff..5befe79234a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h
index 366738ae507..996e26891c6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
index 3bf75434ba9..e6e9fef93a2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc
index cc62653da44..704ee886b16 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc
index bbab3696446..74cca389fc4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc
index 89bf52e91c6..30432bab050 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc
index 5556795993f..3d907e915aa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc
index d16860c541c..f2055b4e923 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc
index 5345d19a92a..514da145d36 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc
index b7d27020ad5..6259e78ad0a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc
index 5bc204ebbdb..0e387af2b73 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc
index 1adc65610de..08b6d8d6c66 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc
@@ -15,7 +15,7 @@
  */
 
 #include "plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.h"
-#include "plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/device_vector.h>
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h
index baace853243..22b7789319a 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <random>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h
index f46d1eba0b3..7f290fbab07 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <random>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h
index 638783918c3..0b18458243f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h
@@ -25,7 +25,7 @@
 #include <limits>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc
index f94e2c8478e..d3ee193db2d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc
@@ -21,7 +21,7 @@
 
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh"
-#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc
index 31049380c13..71b02676f8c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc
@@ -22,7 +22,7 @@
 #include <map>
 #include <utility>
 #include "plugin/device/gpu/hal/device/cuda_driver.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h
index b6dcaba508a..a22dd0e6a9d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h
index ac5155d7c24..21daaea65db 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h
index 5afb3dc5066..b638c05b2f7 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h
index 62ec9169957..54e44c84521 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h
index 0dd63d6a938..0aa39729ae5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h
index 96c2bbf7f14..5be844e02c9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h
index 871d8ac9d48..fcb51d8c4d2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h
index 873c243450a..ae7e0d0572e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h
index c4f9f82f183..4c735ca7512 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h
index 102fd693d4a..55c1977aeca 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h
index 7f577d7fa14..02b7550fca4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h
index f13ac4d4dab..9d0243d0977 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h
index 2d67f62a78b..935965cc1c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h
index 5ad1021e99a..6c390793a89 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h
index 38662c9dc8c..f89ca37698f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h
index cc7019080a1..97dc17d5e58 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h
index addec135971..f3f8339da2f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h
index 3ce82aa1807..a1bcd4a1dde 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h
index b5d370e3570..8a5cbda121e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h
index 88eb632e8f8..ba052a8f385 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h
index 9dc3528d87d..94b6bd21e18 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h
index fd7c3e95ac7..a8a51293e04 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h
index 926a8a3b314..d07471ee7d5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h
index b4b6aa337b6..4d4086517dc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h
index 2af64f733f9..4526cf2e1f6 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h
@@ -25,7 +25,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h
index bcedc485c97..34fc479bb57 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h
@@ -21,7 +21,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h
index e2c7e9ae6a2..ffd02e02fa8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h
@@ -21,7 +21,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h
index da2227ee73f..f5eba16255f 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h
@@ -21,7 +21,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h
index a007d330eaa..388f5014082 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h
@@ -25,7 +25,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h
index 0359ff31383..ac10ff932aa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h
@@ -25,7 +25,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h
index 92beeba773a..6c17cee4cdc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h
index adbe74c15bc..ef3bc821769 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h
index 26b766a650f..51fd3e02153 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h
index 00427d8ab4f..17865345593 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h
index 3b7932b4d92..6e33ebf2561 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h
index b44d8d66572..edd9c83d3d3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h
index 49b3cb13234..f51ebc7cc54 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h
index ad8af605b8a..baa6a71b36d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h
@@ -23,7 +23,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h
index 64e2d544a39..416017d726b 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h
@@ -27,7 +27,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h
index d0f2348a4d9..3ed8bf99cdd 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h
@@ -26,7 +26,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h
index 4fc6661f49b..3f0bca623a2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h
@@ -25,7 +25,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h
index c55aac62a07..720eb6c908c 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h
@@ -28,7 +28,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h
index f1cde2bfa0d..dc35a5f8b90 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h
@@ -25,7 +25,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h
index e8ddd4ba75f..ecac60493cc 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h
@@ -25,7 +25,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h
index 15805f0a3de..cb03973e778 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h
@@ -25,7 +25,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h
index 6326d3eeca6..64277b9c221 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h
@@ -25,7 +25,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h
index fd6c04d3353..6805d39935e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h
index c2c0e06144a..1072ad21413 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h
index ad2e7ba0ae0..ef8193648c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h
index 1c43ceeaf95..e611b79cfb1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h
@@ -24,7 +24,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h
index d055a95aefb..2104c1c9b98 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h
@@ -24,7 +24,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h
index ee448d3d623..63f60e392d5 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h
@@ -25,7 +25,7 @@
 #include <map>
 #include <string>
 #include <vector>
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h
index 0b4f9c235ab..353c3d9b7ca 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h
@@ -26,7 +26,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h
index 4f29259d8eb..d71cdadf6e1 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h
@@ -24,7 +24,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h
index 87367c18e1f..441087ae770 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h
index a854b1a1309..e2a6da86d99 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h
index ef162944f99..8e180f58254 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h
index 1758efd614a..078ae2c1a7d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h
index d29518fa01b..9b3402134f4 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h
index 236f5659c4d..672ba22f6c2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h
@@ -30,7 +30,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h
index 9c04703f69e..983a233316d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h
@@ -29,7 +29,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h
index 4b1ff8a1a78..bdc2cf0f5bf 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h
@@ -29,7 +29,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h
index 589578a676e..990a6b7e99e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h
index 069b482d7e5..3d12554afa9 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h
index 37f45d544bf..f1d7f0ec3c3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h
@@ -26,7 +26,7 @@
 
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/hal/device/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 
 namespace mindspore {
 namespace kernel {
diff --git a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
index eb9687af39f..cd4632e1c9c 100644
--- a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
+++ b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
@@ -16,7 +16,7 @@
 
 #include "ps/ps_cache/gpu/gpu_ps_cache.h"
 #include "ps/ps_cache/ps_cache_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
 #include "plugin/device/gpu/hal/device/cuda_driver.h"
 #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
diff --git a/mindspore/python/mindspore/run_check/_check_version.py b/mindspore/python/mindspore/run_check/_check_version.py
index 78ee448fed4..673f2b8c100 100644
--- a/mindspore/python/mindspore/run_check/_check_version.py
+++ b/mindspore/python/mindspore/run_check/_check_version.py
@@ -143,12 +143,12 @@ class GPUEnvChecker(EnvChecker):
             logger.warning(f"MindSpore version {__version__} and cudDNN version {cudnn_version} "
                            "does not match, please refer to the installation guide for version matching "
                            "information: https://www.mindspore.cn/install. The recommended version is "
-                           "CUDA10.1 with cuDNN7.6.x and CUAD11.1 with cuDNN8.0.x")
+                           "CUDA10.1 with cuDNN7.6.x and CUDA11.1 with cuDNN8.0.x")
         if cudnn_version and int(cudnn_version) < 800 and int(str(self.v).split('.')[0]) > 10:
             logger.warning(f"CUDA version {self.v} and cuDNN version {cudnn_version} "
                            "does not match, please refer to the installation guide for version matching "
                            "information: https://www.mindspore.cn/install. The recommended version is "
-                           "CUAD11.1 with cuDNN8.0.x")
+                           "CUDA11.1 with cuDNN8.0.x")
 
     def _check_version(self):
         """Check cuda version"""
diff --git a/tests/ut/cpp/base/complex_test.cc b/tests/ut/cpp/base/complex_test.cc
index 634555eed33..14a61c71548 100644
--- a/tests/ut/cpp/base/complex_test.cc
+++ b/tests/ut/cpp/base/complex_test.cc
@@ -16,7 +16,7 @@
 #include <memory>
 
 #include "common/common_test.h"
-#include "utils/complex.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 
 namespace mindspore {