Revert "Pull Request !30120 : Package the cuda operators as a dynamic link library"
This commit is contained in:
parent 2135ba4400
commit 739c4b236a
@@ -212,11 +212,6 @@ if(ENABLE_GPU)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
install(
TARGETS cuda_ops
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif()

if(ENABLE_D)

@@ -114,8 +114,6 @@ if(ENABLE_GPU)
"plugin/device/gpu/kernel/*.cu"
)

list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/kernel/cuda_impl/cuda_ops/*.cu")

list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/blocking_queue.cc"
"plugin/device/gpu/hal/device/gpu_buffer_mgr.cc")

@@ -147,8 +145,6 @@ if(ENABLE_GPU)
cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
add_compile_definitions(ENABLE_GPU)

add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops)
endif()

@@ -434,7 +430,7 @@ endif()

if(ENABLE_GPU)
message("add gpu lib to c_expression")
target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops
target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas
${CUDA_PATH}/lib64/libcurand.so
${CUDNN_LIBRARY_PATH}
${CUDA_PATH}/lib64/libcudart.so

@@ -140,7 +140,7 @@ if(ENABLE_D)
endif()

if(ENABLE_GPU)
target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops
target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib gpu_queue cublas
${CUDA_PATH}/lib64/libcurand.so
${CUDNN_LIBRARY_PATH}
${CUDA_PATH}/lib64/libcudart.so
@@ -14,15 +14,12 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_

#include <cudnn.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cusolverDn.h>
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"

#define CUDA_LIB_EXPORT __attribute__((visibility("default")))
#define CUDA_KERNEL_ASSERT(cond) \
if (!(cond)) { \
__assert_fail(#cond, __FILE__, static_cast<unsigned int>(__LINE__), __FUNCTION__); \
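Note: the CUDA_LIB_EXPORT macro removed above is a plain GCC/Clang visibility attribute; it is what allowed the cuda_ops shared library to expose selected kernel launchers while its remaining symbols stayed hidden. A minimal sketch of that pattern under assumed names (visibility_sketch.cc and the Add* functions are hypothetical, not MindSpore code):

// visibility_sketch.cc - sketch of the export-macro pattern behind CUDA_LIB_EXPORT.
// Build as a shared object with symbols hidden by default:
//   g++ -fvisibility=hidden -fPIC -shared visibility_sketch.cc -o libsketch.so
#define CUDA_LIB_EXPORT __attribute__((visibility("default")))

// Exported: binaries linking against libsketch.so can resolve this symbol.
CUDA_LIB_EXPORT int AddVisible(int a, int b) { return a + b; }

// Hidden: compiled into the library but absent from its public symbol table.
int AddHidden(int a, int b) { return a + b; }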
@@ -43,10 +40,22 @@ class CudaCommon {
void set_check_sm(const bool &flag) { check_sm_ = flag; }
bool check_sm() const { return check_sm_; }

static CudaCommon &GetInstance();
static CudaCommon &GetInstance() {
static CudaCommon instance;
return instance;
}

private:
CudaCommon();
CudaCommon() {
uint32_t device_id = GPUDeviceManager::GetInstance().cur_device_id();
cudaDeviceProp prop;
(void)cudaGetDeviceProperties(&prop, device_id);
threads_per_block_ = prop.maxThreadsPerBlock;
max_blocks_ = prop.multiProcessorCount;
major_sm_ = prop.major;
minor_sm_ = prop.minor;
max_share_memory_ = prop.sharedMemPerBlock;
}
~CudaCommon() = default;
CudaCommon(const CudaCommon &) = delete;
CudaCommon &operator=(const CudaCommon &) = delete;
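Note: the revert turns GetInstance() and the constructor back into inline definitions, i.e. a Meyers singleton that queries the device on first use, so the header no longer depends on an implementation compiled into a separate cuda_ops library. A minimal, generic sketch of that pattern (DeviceInfo and its member are hypothetical stand-ins for the real CudaCommon fields):

#include <cstdio>

// Sketch of a Meyers singleton: the function-local static is created once,
// thread-safely (C++11), on the first call - no out-of-line definition needed.
class DeviceInfo {
 public:
  static DeviceInfo &GetInstance() {
    static DeviceInfo instance;
    return instance;
  }
  int threads_per_block() const { return threads_per_block_; }

 private:
  DeviceInfo() : threads_per_block_(1024) {}  // a real class would query the device here
  ~DeviceInfo() = default;
  DeviceInfo(const DeviceInfo &) = delete;
  DeviceInfo &operator=(const DeviceInfo &) = delete;
  int threads_per_block_;
};

int main() {
  std::printf("threads per block: %d\n", DeviceInfo::GetInstance().threads_per_block());
  return 0;
}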
@@ -71,4 +80,4 @@ class CudaCommon {
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_
@@ -26,7 +26,7 @@
#include "kernel/oplib/oplib.h"
#include "backend/common/session/anf_runtime_algorithm.h"
#include "plugin/device/gpu/kernel/custom/custom_aot_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "utils/ms_context.h"
#include "utils/ms_utils.h"
#include "utils/utils.h"

@@ -28,7 +28,7 @@
#include "plugin/device/gpu/hal/device/gpu_buffer_mgr.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "plugin/device/gpu/hal/hardware/optimizer.h"
#include "utils/ms_device_shape_transfer.h"
#include "utils/context/graph_kernel_flags.h"
@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>

@@ -22,7 +22,7 @@
#include <map>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -22,7 +22,7 @@
#include <memory>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@

#include <vector>

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -21,7 +21,7 @@
#include <algorithm>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh"

namespace mindspore {
namespace kernel {
@@ -23,8 +23,8 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/gather.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh"
#include "backend/common/session/anf_runtime_algorithm.h"

namespace mindspore {

@@ -21,7 +21,7 @@
#include <algorithm>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh"
#include "backend/common/session/anf_runtime_algorithm.h"

namespace mindspore {

@@ -22,9 +22,9 @@

#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -23,9 +23,9 @@
#include <cuda_runtime.h>
#include <vector>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "utils/complex.h"
#include "plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"

@@ -25,9 +25,9 @@
#include <string>
#include <utility>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "utils/complex.h"
#include "plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -26,7 +26,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>

@@ -22,8 +22,8 @@
#include <utility>
#include <vector>

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h"
@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -19,7 +19,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops//oneslike_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>

@@ -22,7 +22,7 @@
#include <memory>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/pack.cuh"

namespace mindspore {
namespace kernel {

@@ -19,7 +19,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/range_impl.cuh"
namespace mindspore {
namespace kernel {
constexpr float kStartDefault = 0.;

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -22,8 +22,8 @@
#include <iostream>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh"
#include "plugin/device/gpu/kernel/kernel_constants.h"

namespace mindspore {

@@ -22,7 +22,7 @@

#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -22,7 +22,7 @@
#include <map>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -22,7 +22,7 @@
#include <map>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -19,7 +19,7 @@

#include <vector>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh"

namespace mindspore {
namespace kernel {
@@ -23,7 +23,7 @@
#include <algorithm>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -23,7 +23,7 @@
#include <utility>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -24,9 +24,9 @@

#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <string>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -22,7 +22,7 @@
#include <memory>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -23,7 +23,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -23,7 +23,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -25,7 +25,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include <string>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -20,7 +20,7 @@
#include <vector>
#include <algorithm>

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -19,7 +19,7 @@

#include <vector>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
@@ -20,7 +20,7 @@
#include <vector>
#include <algorithm>

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -19,7 +19,7 @@

#include <vector>
#include <algorithm>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,8 +21,8 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"

namespace mindspore {
namespace kernel {

@@ -21,8 +21,8 @@
#include <algorithm>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
namespace mindspore {
namespace kernel {
constexpr size_t kDimSize4 = 4;

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>

@@ -22,7 +22,7 @@
#include <memory>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <limits>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh"

namespace mindspore {
namespace kernel {

@@ -21,7 +21,7 @@
#include <limits>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh"

namespace mindspore {
namespace kernel {

@@ -20,7 +20,7 @@
#include <vector>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh"

namespace mindspore {
namespace kernel {
@@ -14,8 +14,7 @@
* limitations under the License.
*/

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh"
#include "include/cuda_fp16.h"
#include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh"

template <typename T>
__device__ __forceinline__ T SqrtFunc(T input) {

@@ -114,50 +113,50 @@ void ApplyAdagrad(const size_t size,
size, update_slots, learning_rate, gradient, variable, accumulation);
}

template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, float>(const size_t size,
const bool update_slots,
const float *learning_rate,
const float *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<float, float, float>(const size_t size,
const bool update_slots,
const float *learning_rate,
const float *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void ApplyAdagrad<half, half, half>(const size_t size,
const bool update_slots,
const half *learning_rate,
const half *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<half, half, half>(const size_t size,
const bool update_slots,
const half *learning_rate,
const half *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, half>(const size_t size,
const bool update_slots,
const float *learning_rate,
const half *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<half, float, half>(const size_t size,
const bool update_slots,
const float *learning_rate,
const half *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, half>(const size_t size,
const bool update_slots,
const float *learning_rate,
const half *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<float, float, half>(const size_t size,
const bool update_slots,
const float *learning_rate,
const half *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void ApplyAdagrad<float, half, float>(const size_t size,
const bool update_slots,
const half *learning_rate,
const float *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<float, half, float>(const size_t size,
const bool update_slots,
const half *learning_rate,
const float *gradient,
float *variable,
float *accumulation,
cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, float>(const size_t size,
const bool update_slots,
const float *learning_rate,
const float *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);
template void ApplyAdagrad<half, float, float>(const size_t size,
const bool update_slots,
const float *learning_rate,
const float *gradient,
half *variable,
half *accumulation,
cudaStream_t cuda_stream);
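Note: the instantiations above lose their CUDA_LIB_EXPORT qualifier because, once the .cu files are compiled directly into the main targets again, the explicit template instantiations no longer have to be exported from a separate shared object. A small sketch of the explicit-instantiation pattern itself, using a hypothetical Scale launcher rather than the real ApplyAdagrad signature:

// scale_impl.cu - sketch: templated kernel defined in a .cu file, with explicit
// instantiations (exported when the file is built into a shared library).
#define CUDA_LIB_EXPORT __attribute__((visibility("default")))

template <typename T>
__global__ void ScaleKernel(const size_t size, const T alpha, T *data) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
    data[i] *= alpha;
  }
}

template <typename T>
void Scale(const size_t size, const T alpha, T *data, cudaStream_t stream) {
  ScaleKernel<<<(size + 255) / 256, 256, 0, stream>>>(size, alpha, data);
}

// One exported instantiation per element type that callers are allowed to use.
template CUDA_LIB_EXPORT void Scale<float>(const size_t size, const float alpha, float *data, cudaStream_t stream);
template CUDA_LIB_EXPORT void Scale<double>(const size_t size, const double alpha, double *data, cudaStream_t stream);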
@@ -0,0 +1,30 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T, typename S, typename G>
void ApplyAdagrad(const size_t size,
const bool update_slots,
const S *learning_rate,
const G *gradient,
T *variable,
T *accumulation,
cudaStream_t stream);

#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
@ -14,8 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh"
|
||||
#include "include/cuda_fp16.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh"
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T SqrtFunc(T input) {
|
||||
|
@ -83,19 +82,16 @@ void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learni
|
|||
epsilon, decay, variable, m, v);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
|
||||
const float *beta2_power, const float *learning_rate, const float *beta1,
|
||||
const float *beta2, const float *epsilon, float *variable, float *m,
|
||||
float *v, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power,
|
||||
const half *beta2_power, const half *learning_rate, const half *beta1,
|
||||
const half *beta2, const half *epsilon, half *variable, half *m, half *v,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AdamWeightDecayOp<float>(const size_t size, const float *gradient,
|
||||
const float *learning_rate, const float *beta1,
|
||||
const float *beta2, const float *epsilon, const float *decay,
|
||||
float *variable, float *m, float *v, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AdamWeightDecayOp<half>(const size_t size, const half *gradient,
|
||||
const float *learning_rate, const float *beta1,
|
||||
const float *beta2, const float *epsilon, const float *decay,
|
||||
half *variable, half *m, half *v, cudaStream_t cuda_stream);
|
||||
template void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
|
||||
const float *beta2_power, const float *learning_rate, const float *beta1,
|
||||
const float *beta2, const float *epsilon, float *variable, float *m, float *v,
|
||||
cudaStream_t cuda_stream);
|
||||
template void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power, const half *beta2_power,
|
||||
const half *learning_rate, const half *beta1, const half *beta2, const half *epsilon,
|
||||
half *variable, half *m, half *v, cudaStream_t cuda_stream);
|
||||
template void AdamWeightDecayOp<float>(const size_t size, const float *gradient, const float *learning_rate,
|
||||
const float *beta1, const float *beta2, const float *epsilon, const float *decay,
|
||||
float *variable, float *m, float *v, cudaStream_t cuda_stream);
|
||||
template void AdamWeightDecayOp<half>(const size_t size, const half *gradient, const float *learning_rate,
|
||||
const float *beta1, const float *beta2, const float *epsilon, const float *decay,
|
||||
half *variable, half *m, half *v, cudaStream_t cuda_stream);
|
|
@ -0,0 +1,29 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
|
||||
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
template <typename T>
|
||||
void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate,
|
||||
const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate, const float *beta1,
|
||||
const float *beta2, const float *epsilon, const float *decay, T *variable, T *m, T *v,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "adam_weight_decay_impl.cuh"
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
|
||||
template <typename T>
|
||||
__global__ void AdamWeightDecayKernel(const int element_num_, const bool need_decay, const float *beta1,
|
||||
|
@ -43,8 +44,7 @@ void AdamWeightDecay(const int &element_num_, const bool &need_decay, const floa
|
|||
gradient);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
|
||||
const float *one_sub_beta1, const float *beta2,
|
||||
const float *one_sub_beta2, const float *epsilon, const float *lr,
|
||||
const float *weight_decay, float *m, float *v, float *param,
|
||||
float *gradient, cudaStream_t stream);
|
||||
template void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
|
||||
const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2,
|
||||
const float *epsilon, const float *lr, const float *weight_decay, float *m, float *v,
|
||||
float *param, float *gradient, cudaStream_t stream);
|
|
@ -0,0 +1,24 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
|
||||
template <typename T>
|
||||
void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, const float *one_sub_beta1,
|
||||
const float *beta2, const float *one_sub_beta2, const float *epsilon, const float *lr,
|
||||
const float *weight_decay, T *m, T *v, T *param, T *gradient, cudaStream_t stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_
|
|
@ -14,8 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh"
|
||||
#include "include/cuda_fp16.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh"
|
||||
|
||||
__device__ inline uint start_index(uint a, uint b, uint c) {
|
||||
return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
|
||||
|
@ -169,17 +168,14 @@ void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const
|
|||
size, input_height, input_width, output_height, output_width, input_data, output_data);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, float *input_data,
|
||||
float *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, float *input_data,
|
||||
float *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, half *input_data,
|
||||
half *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, half *input_data,
|
||||
half *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, double *input_data,
|
||||
double *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, double *input_data,
|
||||
double *output_data, cudaStream_t cuda_stream);
|
|
@ -14,11 +14,13 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
|
||||
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size, const T *dy,
|
||||
const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream);
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_
|
||||
void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, T *input_data,
|
||||
T *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
|
|
@ -14,8 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh"
|
||||
#include "include/cuda_fp16.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh"
|
||||
|
||||
__device__ inline uint start_index(uint a, uint b, uint c) {
|
||||
return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
|
||||
|
@ -156,17 +155,14 @@ void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint
|
|||
size, input_height, input_width, output_height, output_width, input_data, output_data);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, float *input_data,
|
||||
float *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, float *input_data,
|
||||
float *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, half *input_data,
|
||||
half *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, half *input_data,
|
||||
half *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height,
|
||||
const uint input_width, const uint output_height,
|
||||
const uint output_width, double *input_data,
|
||||
double *output_data, cudaStream_t cuda_stream);
|
||||
template void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height, const uint input_width,
|
||||
const uint output_height, const uint output_width, double *input_data,
|
||||
double *output_data, cudaStream_t cuda_stream);
|
|
@ -14,11 +14,12 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
|
||||
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size,
|
||||
const T *input, const T *weight, T *output, cudaStream_t cuda_stream);
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_
|
||||
void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, const uint output_height,
|
||||
const uint output_width, T *input_data, T *output_data, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
|
||||
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
template <typename T>
|
||||
void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
|
||||
|
||||
template <typename T>
|
||||
__global__ void AddReluV2Kernel(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask) {
|
||||
|
@ -49,20 +49,20 @@ void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *m
|
|||
AddReluGradV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, x1, x2, mask, dx);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y,
|
||||
uint32_t *mask, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y,
|
||||
uint32_t *mask, cudaStream_t cuda_stream);
|
||||
template void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y, uint32_t *mask,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask,
|
||||
float *dx, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask,
|
||||
half *dx, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2,
|
||||
const uint32_t *mask, int32_t *dx, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2,
|
||||
const uint32_t *mask, int64_t *dx, cudaStream_t cuda_stream);
|
||||
template void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask, float *dx,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask, half *dx,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2, const uint32_t *mask, int32_t *dx,
|
||||
cudaStream_t cuda_stream);
|
||||
template void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2, const uint32_t *mask, int64_t *dx,
|
||||
cudaStream_t cuda_stream);
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
|
||||
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
template <typename T>
|
||||
void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
|
|
@ -14,8 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh"
|
||||
#include "include/cuda_fp16.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh"
|
||||
|
||||
template <typename T>
|
||||
__global__ void ApplyGradientDescent(const size_t size, T *var, const T *alpha, const T *delta, T *output) {
|
||||
|
@ -32,8 +31,7 @@ void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T
|
|||
ApplyGradientDescent<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, var, alpha, delta, output);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha,
|
||||
const float *delta, float *output,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha,
|
||||
const half *delta, half *output, cudaStream_t cuda_stream);
|
||||
template void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha, const float *delta,
|
||||
float *output, cudaStream_t cuda_stream);
|
||||
template void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha, const half *delta,
|
||||
half *output, cudaStream_t cuda_stream);
|
|
@ -14,14 +14,14 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
|
||||
#define TILE_MAX_DIMENSION 100
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size,
|
||||
const size_t *input_shape, const size_t *output_shape, const T *input, T *output,
|
||||
void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta, T *output,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_
|
|
@ -15,6 +15,8 @@
|
|||
*/
|
||||
|
||||
#include "argmax_impl.cuh"
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
#include "include/cuda_fp16.h"
|
||||
template <typename T, typename S>
|
||||
__global__ void Argmax(const T *input, const S bound, const size_t outer_size,
|
||||
const size_t inner_size, S *output) {
|
||||
|
@ -44,9 +46,7 @@ void CalArgmax(const T *input, const S bound, const size_t outer_size, const siz
|
|||
return;
|
||||
}
|
||||
|
||||
template
|
||||
CUDA_LIB_EXPORT void CalArgmax<float, int>(const float *input, const int bound, const size_t outer_size,
|
||||
template void CalArgmax<float, int>(const float *input, const int bound, const size_t outer_size,
|
||||
const size_t inner_size, int *output, cudaStream_t cuda_stream);
|
||||
template
|
||||
CUDA_LIB_EXPORT void CalArgmax<half, int>(const half *input, const int bound, const size_t outer_size,
|
||||
template void CalArgmax<half, int>(const half *input, const int bound, const size_t outer_size,
|
||||
const size_t inner_size, int *output, cudaStream_t cuda_stream);
|
|
@@ -0,0 +1,23 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
template <typename T, typename S>
void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output,
               cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_

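For context, CalArgmax as declared above is a host-side launcher; a hypothetical call (buffer names, shapes, and the stream are illustrative only, not taken from this diff) could look like:

// Hypothetical usage of CalArgmax<float, int>; d_input/d_output are device buffers.
const size_t outer_size = 32, inner_size = 1;  // assumed sizes outside the reduced axis
const int bound = 10;                          // assumed length of the axis being reduced
CalArgmax<float, int>(d_input, bound, outer_size, inner_size, d_output, stream);
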
@@ -15,6 +15,8 @@
 */

#include "assign_add_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "include/cuda_fp16.h"
template <typename T>
__global__ void AssignAdd(const size_t size, T* ref, const T* value, T* output) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {

@@ -31,11 +33,10 @@ void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStre
  return;
}

template CUDA_LIB_EXPORT void CalAssignAdd<float>(const size_t size, float* ref, const float* value, float* output,
                                                  cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalAssignAdd<half>(const size_t size, half* ref, const half* value, half* output,
                                                 cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalAssignAdd<int>(const size_t size, int* ref, const int* value, int* output,
                                                cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalAssignAdd<int64_t>(const size_t size, int64_t* ref, const int64_t* value,
                                                    int64_t* output, cudaStream_t cuda_stream);
template void CalAssignAdd<float>(const size_t size, float* ref, const float* value, float* output,
                                  cudaStream_t cuda_stream);
template void CalAssignAdd<half>(const size_t size, half* ref, const half* value, half* output,
                                 cudaStream_t cuda_stream);
template void CalAssignAdd<int>(const size_t size, int* ref, const int* value, int* output, cudaStream_t cuda_stream);
template void CalAssignAdd<int64_t>(const size_t size, int64_t* ref, const int64_t* value, int64_t* output,
                                    cudaStream_t cuda_stream);

@@ -14,11 +14,9 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#include "include/cuda_fp16.h"
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_
template <typename T>
CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream);
void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_

@@ -109,11 +109,10 @@ void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *b
    x, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, y, freeze_bn, N, C, H, W);
}

template CUDA_LIB_EXPORT void BatchNormFold2Forward<float>(const float *x, const float *beta, const float *gamma,
                                                           const float *batch_std, const float *batch_mean,
                                                           const float *running_std, const float *running_mean,
                                                           const int *global_step, float *y, int freeze_bn, size_t N,
                                                           size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
template void BatchNormFold2Forward<float>(const float *x, const float *beta, const float *gamma,
                                           const float *batch_std, const float *batch_mean, const float *running_std,
                                           const float *running_mean, const int *global_step, float *y, int freeze_bn,
                                           size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream);

template <typename T>
void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N,

@@ -125,10 +124,9 @@ void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *r
  BatchNormFold2GradReduce2<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(tmp, d_beta, tmp2, reduce_x, N, C);
}

template CUDA_LIB_EXPORT void BatchNormFold2GradReduce<float>(const float *dout, const float *x, float *d_beta,
                                                              float *tmp, float *reduce_x, float *tmp2, float *tmp_x,
                                                              size_t N, size_t C, size_t H, size_t W,
                                                              cudaStream_t cuda_stream);
template void BatchNormFold2GradReduce<float>(const float *dout, const float *x, float *d_beta, float *tmp,
                                              float *reduce_x, float *tmp2, float *tmp_x, size_t N, size_t C, size_t H,
                                              size_t W, cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,

@@ -138,12 +136,11 @@ void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T
    d_beta, reduce_x, batch_mean, batch_std, running_mean, running_std, gamma, d_gamma, d_batch_mean, d_batch_std, C);
}

template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreeze<float>(const float *d_beta, const float *reduce_x,
                                                                    const float *batch_mean, const float *batch_std,
                                                                    const float *running_mean, const float *running_std,
                                                                    const float *gamma, float *d_gamma,
                                                                    float *d_batch_mean, float *d_batch_std, size_t C,
                                                                    cudaStream_t cuda_stream);
template void CalBatchNormFold2GradNotFreeze<float>(const float *d_beta, const float *reduce_x, const float *batch_mean,
                                                    const float *batch_std, const float *running_mean,
                                                    const float *running_std, const float *gamma, float *d_gamma,
                                                    float *d_batch_mean, float *d_batch_std, size_t C,
                                                    cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,

@@ -155,12 +152,11 @@ void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *ba
  ThrustFillWith(d_batch_std, C, (T)0.f, cuda_stream);
}

template CUDA_LIB_EXPORT void CalBatchNormFold2GradFreeze<float>(const float *d_beta, const float *reduce_x,
                                                                 const float *batch_mean, const float *batch_std,
                                                                 const float *running_mean, const float *running_std,
                                                                 const float *gamma, float *d_gamma,
                                                                 float *d_batch_mean, float *d_batch_std, size_t C,
                                                                 cudaStream_t cuda_stream);
template void CalBatchNormFold2GradFreeze<float>(const float *d_beta, const float *reduce_x, const float *batch_mean,
                                                 const float *batch_std, const float *running_mean,
                                                 const float *running_std, const float *gamma, float *d_gamma,
                                                 float *d_batch_mean, float *d_batch_std, size_t C,
                                                 cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H,

@@ -168,7 +164,6 @@ void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_st
  DxMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N, C, H * W, batch_std, running_std, d_x);
}

template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreezeDxMul<float>(const float *batch_std,
                                                                         const float *running_std, float *d_x,
                                                                         size_t N, size_t C, size_t H, size_t W,
                                                                         cudaStream_t cuda_stream);
template void CalBatchNormFold2GradNotFreezeDxMul<float>(const float *batch_std, const float *running_std, float *d_x,
                                                         size_t N, size_t C, size_t H, size_t W,
                                                         cudaStream_t cuda_stream);

@@ -0,0 +1,40 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T>
void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std, const T *batch_mean,
                           const T *running_std, const T *running_mean, const int *global_step, T *y, int freeze_bn,
                           size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
template <typename T>
void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
                                    const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
                                    T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
template <typename T>
void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
                                 const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
                                 T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
template <typename T>
void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N,
                              size_t C, size_t H, size_t W, cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H,
                                         size_t W, cudaStream_t cuda_stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_

@@ -18,6 +18,7 @@
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>
#include "batchnorm_fold_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"

template <typename T>
__global__ void UpdateRunningStd(int channel_size, const double epsilon, T* running_std) {

@@ -54,8 +55,8 @@ void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaS
  return;
}

template CUDA_LIB_EXPORT void CalUpdateRunningStd<float>(int channel_size, double epsilon, float* running_std,
                                                         cudaStream_t cuda_stream);
template void CalUpdateRunningStd<float>(int channel_size, double epsilon, float* running_std,
                                         cudaStream_t cuda_stream);

template <typename T>
void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream) {

@@ -63,7 +64,7 @@ void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream)
  return;
}

template CUDA_LIB_EXPORT void CalUpdateBatchStd<float>(int channel_size, float* batch_std, cudaStream_t cuda_stream);
template void CalUpdateBatchStd<float>(int channel_size, float* batch_std, cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,

@@ -73,10 +74,9 @@ void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T*
    d_batch_mean, d_batch_std, x, batch_mean, batch_std, batch_size, channel_size, height, width, dx);
}

template CUDA_LIB_EXPORT void CalBatchNormFoldGrad<float>(const float* d_batch_mean, const float* d_batch_std,
                                                          const float* x, const float* batch_mean,
                                                          const float* batch_std, int batch_size, int channel_size,
                                                          int height, int width, float* dx, cudaStream_t cuda_stream);
template void CalBatchNormFoldGrad<float>(const float* d_batch_mean, const float* d_batch_std, const float* x,
                                          const float* batch_mean, const float* batch_std, int batch_size,
                                          int channel_size, int height, int width, float* dx, cudaStream_t cuda_stream);

template <typename T>
void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) {

@@ -84,5 +84,5 @@ void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) {
  thrust::fill(thrust::cuda::par.on(cuda_stream), dev_ptr, dev_ptr + size, tofill);
}

template CUDA_LIB_EXPORT void ThrustFillWith<float>(float* array, int size, float tofill, cudaStream_t cuda_stream);
template void ThrustFillWith<float>(float* array, int size, float tofill, cudaStream_t cuda_stream);

@@ -0,0 +1,32 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_

template <typename T>
void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream);

template <typename T>
void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream);

template <typename T>
void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,
                          const T* batch_std, int batch_size, int channel_size, int height, int width, T* dx,
                          cudaStream_t cuda_stream);
template <typename T>
void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BATCHNORM_FOLD_H_

@@ -21,7 +21,6 @@
#include <thrust/system/cuda/execution_policy.h>
#include "batchnorm_grad_impl.cuh"
#include "include/cuda_runtime.h"
#include "include/cuda_fp16.h"

const int kWarpSize = 32;
const int kBlockSize = 1024;

@@ -112,12 +111,10 @@ void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_v
    epsilon, N, C, H, W);
}

template CUDA_LIB_EXPORT void CalBatchNormGrad<float>(float *x, float *dy, float *scale, float *save_mean,
                                                      float *save_variance, float *dx, float *bn_scale, float *bn_bias,
                                                      double epsilon, int N, int C, int H, int W,
                                                      cudaStream_t cuda_stream);
template void CalBatchNormGrad<float>(float *x, float *dy, float *scale, float *save_mean, float *save_variance,
                                      float *dx, float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H,
                                      int W, cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void CalBatchNormGrad<half>(half *x, half *dy, float *scale, float *save_mean,
                                                     float *save_variance, half *dx, float *bn_scale, float *bn_bias,
                                                     double epsilon, int N, int C, int H, int W,
                                                     cudaStream_t cuda_stream);
template void CalBatchNormGrad<half>(half *x, half *dy, float *scale, float *save_mean, float *save_variance, half *dx,
                                     float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, int W,
                                     cudaStream_t cuda_stream);

@@ -0,0 +1,24 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T>
void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_variance, T *dx, float *bn_scale,
                      float *bn_bias, double epsilon, int N, int C, int H, int W, cudaStream_t cuda_stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_

@@ -0,0 +1,133 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <cuda_runtime.h>
#include "batchtospace_impl.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"

template <typename T>
__global__ void BatchToSpace(const size_t size, const T *input, const size_t in,
                             const size_t ih, const size_t iw, const size_t ic,
                             const size_t on, const size_t oh, const size_t ow,
                             const size_t oc, const size_t crop_up, const size_t crop_dn,
                             const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                             T *output) {
  size_t temp_stride = 0;
  size_t temp_pos = 0;
  size_t idx_on = 0;
  size_t idx_oc = 0;
  size_t idx_oh = 0;
  size_t idx_ow = 0;
  size_t idx_in = 0;
  size_t input_pos = 0;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
       pos += blockDim.x * gridDim.x) {
    temp_stride = oc * oh * ow;
    idx_on = pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= oc;
    idx_oc = temp_pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= oh;
    idx_oh = temp_pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= ow;
    idx_ow = temp_pos / temp_stride;

    idx_in = (((idx_oh + crop_up) % block_num) * block_num + ((idx_ow + crop_lft) % block_num)) * on + idx_on;
    input_pos = idx_in * ic;
    input_pos = (input_pos + idx_oc) * ih;
    input_pos = (input_pos + ((idx_oh + crop_up) - (idx_in / (on * block_num))) / block_num) * iw;
    input_pos = (input_pos + ((idx_ow + crop_lft) - ((idx_in / on) % block_num)) / block_num);
    output[pos] = input[input_pos];
  }
  return;
}

template <typename T>
void CalBatchToSpace(const size_t size, const T *input, const size_t in,
                     const size_t ih, const size_t iw, const size_t ic,
                     const size_t on, const size_t oh, const size_t ow,
                     const size_t oc, const size_t crop_up, const size_t crop_dn,
                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                     T *output, cudaStream_t cuda_stream) {
  BatchToSpace<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    size, input, in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output);
  return;
}

template void CalBatchToSpace<float>(const size_t size, const float *input, const size_t in,
                                     const size_t ih, const size_t iw, const size_t ic,
                                     const size_t on, const size_t oh, const size_t ow,
                                     const size_t oc, const size_t crop_up, const size_t crop_dn,
                                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                     float *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<half>(const size_t size, const half *input, const size_t in,
                                    const size_t ih, const size_t iw, const size_t ic,
                                    const size_t on, const size_t oh, const size_t ow,
                                    const size_t oc, const size_t crop_up, const size_t crop_dn,
                                    const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                    half *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<int>(const size_t size, const int *input, const size_t in,
                                   const size_t ih, const size_t iw, const size_t ic,
                                   const size_t on, const size_t oh, const size_t ow,
                                   const size_t oc, const size_t crop_up, const size_t crop_dn,
                                   const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                   int *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<int64_t>(const size_t size, const int64_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       int64_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<int16_t>(const size_t size, const int16_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       int16_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<int8_t>(const size_t size, const int8_t *input, const size_t in,
                                      const size_t ih, const size_t iw, const size_t ic,
                                      const size_t on, const size_t oh, const size_t ow,
                                      const size_t oc, const size_t crop_up, const size_t crop_dn,
                                      const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                      int8_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       uint8_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint16_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint32_t *output, cudaStream_t cuda_stream);
template void CalBatchToSpace<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint64_t *output, cudaStream_t cuda_stream);

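A hypothetical host-side call of CalBatchToSpace, for a 2x2 block and zero crops, following the usual BatchToSpace shape relation on = in / (block * block), oh = ih * block, ow = iw * block (all names and values below are illustrative, not taken from this diff):

// Illustrative invocation; d_in and d_out are assumed NCHW device buffers.
const size_t block = 2, in_n = 4, c = 3, ih = 8, iw = 8;
const size_t on = in_n / (block * block), oh = ih * block, ow = iw * block;
const size_t out_size = on * c * oh * ow;  // one thread per output element
CalBatchToSpace<float>(out_size, d_in, in_n, ih, iw, c, on, oh, ow, c,
                       /*crop_up=*/0, /*crop_dn=*/0, /*crop_lft=*/0, /*crop_rht=*/0,
                       block, d_out, stream);
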
@@ -0,0 +1,27 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
template <typename T>
void CalBatchToSpace(const size_t size, const T *input, const size_t in,
                     const size_t ih, const size_t iw, const size_t ic,
                     const size_t on, const size_t oh, const size_t ow,
                     const size_t oc, const size_t crop_up, const size_t crop_dn,
                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                     T *output, cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_

@@ -14,8 +14,7 @@
 * limitations under the License.
 */

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh"
#include "include/cuda_fp16.h"
#include "plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh"

__device__ __forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; }


@@ -115,18 +114,15 @@ void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *ta
  return;
}

template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss<half>(const size_t input_size, const half *predict,
                                                         const half *target, const size_t *input_shape,
                                                         const size_t shape_size, const half *weight,
                                                         const size_t *weight_shape, const bool weight_need_broadcast,
                                                         const half *pos_weight, const size_t *pos_weight_shape,
                                                         const bool pos_weight_need_broadcast, half *shape_broadcasted,
                                                         half *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss<float>(const size_t input_size, const float *predict,
                                                          const float *target, const size_t *input_shape,
                                                          const size_t shape_size, const float *weight,
                                                          const size_t *weight_shape, const bool weight_need_broadcast,
                                                          const float *pos_weight, const size_t *pos_weight_shape,
                                                          const bool pos_weight_need_broadcast,
                                                          float *shape_broadcasted, float *output,
                                                          cudaStream_t cuda_stream);
template void CalBCEWithLogitsLoss<half>(const size_t input_size, const half *predict, const half *target,
                                         const size_t *input_shape, const size_t shape_size, const half *weight,
                                         const size_t *weight_shape, const bool weight_need_broadcast,
                                         const half *pos_weight, const size_t *pos_weight_shape,
                                         const bool pos_weight_need_broadcast, half *shape_broadcasted, half *output,
                                         cudaStream_t cuda_stream);
template void CalBCEWithLogitsLoss<float>(const size_t input_size, const float *predict, const float *target,
                                          const size_t *input_shape, const size_t shape_size, const float *weight,
                                          const size_t *weight_shape, const bool weight_need_broadcast,
                                          const float *pos_weight, const size_t *pos_weight_shape,
                                          const bool pos_weight_need_broadcast, float *shape_broadcasted, float *output,
                                          cudaStream_t cuda_stream);

@@ -0,0 +1,30 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_

#define MAX_LOGITS_DIMENSION 8
#include "plugin/device/gpu/hal/device/cuda_common.h"

template <typename T>
void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target, const size_t *input_shape,
                          const size_t shape_size, const T *weight, const size_t *weight_shape,
                          const bool weight_need_broadcast, const T *pos_weight, const size_t *pos_weight_shape,
                          const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output,
                          cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_

@@ -17,8 +17,9 @@
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh"

const int kWarpSize = 32;
// tuning param, for those nhw >= kLargeSize, launch more blocks to solve

@@ -164,13 +165,11 @@ void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
  return;
}

template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size,
                                                 const int height, const int width,
                                                 const float* dy, float* db, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size,
                                                 const int height, const int width,
                                                 const half* dy, half* db, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
                                                 const float* dy, float* db, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half* dy,
                                                 half* db, cudaStream_t cuda_stream);
template void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
                                 const float* dy, float* db, cudaStream_t cuda_stream);
template void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
                                 const half* dy, half* db, cudaStream_t cuda_stream);
template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
                                 const float* dy, float* db, cudaStream_t cuda_stream);
template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half* dy,
                                 half* db, cudaStream_t cuda_stream);

@@ -14,12 +14,14 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T>
CUDA_LIB_EXPORT void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count,
                                      const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_
void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
                        const T* dy, T* db, cudaStream_t cuda_stream);
template <typename T>
void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
                        const T* dy, T* db, cudaStream_t cuda_stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_

@@ -14,7 +14,7 @@
 * limitations under the License.
 */

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh"

template <typename T>
__global__ void BoundingBoxDecodeKernel(const size_t size, const T *rois, const T *deltas, T *bboxes, const float m1,

@@ -74,11 +74,8 @@ void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bbo
    ratio_clip);
}

template CUDA_LIB_EXPORT void BoundingBoxDecode<float>(const size_t size, const float *rois, const float *deltas,
                                                       float *bboxes,
                                                       const float &m1, const float &m2,
                                                       const float &m3, const float &m4,
                                                       const float &s1, const float &s2,
                                                       const float &s3, const float &s4,
                                                       const int &max_height, const int &max_width,
                                                       const float &ratio_clip, cudaStream_t cuda_stream);
template void BoundingBoxDecode<float>(const size_t size, const float *rois, const float *deltas, float *bboxes,
                                       const float &m1, const float &m2, const float &m3, const float &m4,
                                       const float &s1, const float &s2, const float &s3, const float &s4,
                                       const int &max_height, const int &max_width, const float &ratio_clip,
                                       cudaStream_t cuda_stream);

@@ -0,0 +1,27 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T>
void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2,
                       const float &m3, const float &m4, const float &s1, const float &s2, const float &s3,
                       const float &s4, const int &max_height, const int &max_width, const float &ratio_clip,
                       cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_

@@ -14,7 +14,7 @@
 * limitations under the License.
 */

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh"

template <typename T>
__global__ void BoundingBoxEncodeKernel(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas,

@@ -56,10 +56,7 @@ void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtr
    m1, m2, m3, m4, s1, s2, s3, s4);
}

template CUDA_LIB_EXPORT void BoundingBoxEncode<float>(const size_t size, const float *anchor_box,
                                                       const float *groundtruth_box, float *deltas,
                                                       const float &m1, const float &m2,
                                                       const float &m3, const float &m4,
                                                       const float &s1, const float &s2,
                                                       const float &s3, const float &s4,
                                                       cudaStream_t cuda_stream);
template void BoundingBoxEncode<float>(const size_t size, const float *anchor_box, const float *groundtruth_box,
                                       float *deltas, const float &m1, const float &m2, const float &m3,
                                       const float &m4, const float &s1, const float &s2, const float &s3,
                                       const float &s4, cudaStream_t cuda_stream);

@@ -0,0 +1,26 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"
template <typename T>
void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1,
                       const float &m2, const float &m3, const float &m4, const float &s1, const float &s2,
                       const float &s3, const float &s4, cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_

@@ -14,8 +14,9 @@
 * limitations under the License.
 */

#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
#include "plugin/device/gpu/hal/device/cuda_common.h"

template <typename T>
struct MinimumGradFunc {

@@ -112,48 +113,37 @@ void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
  NoBroadcastGradKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, grad_x1, grad_x2, op, x1, x2, dy, dx1, dx2);
}

template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
                                              enum BroadcastGradOpType op, const double *x1, const double *x2,
                                              const double *dy, double *dx1, double *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
                                              enum BroadcastGradOpType op, const float *x1, const float *x2,
                                              const float *dy, float *dx1, float *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
                                              enum BroadcastGradOpType op, const int *x1, const int *x2,
                                              const int *dy, int *dx1, int *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
                                              enum BroadcastGradOpType op, const half *x1, const half *x2,
                                              const half *dy, half *dx1, half *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2,
                                              enum BroadcastGradOpType op, const int64_t *x1, const int64_t *x2,
                                              const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
                                            const int &r0, const int &r1, const int &r2, const int &r3,
                                            const int &d0, const int &d1, const int &d2, const int &d3,
                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                                            const double *x1, const double *x2, const double *dy,
                                            double *dx1, double *dx2, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
                                            const int &r0, const int &r1, const int &r2, const int &r3,
                                            const int &d0, const int &d1, const int &d2, const int &d3,
                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                                            const float *x1, const float *x2, const float *dy, float *dx1, float *dx2,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
                                            const int &r0, const int &r1, const int &r2, const int &r3,
                                            const int &d0, const int &d1, const int &d2, const int &d3,
                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                                            const int *x1, const int *x2, const int *dy, int *dx1, int *dx2,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
                                            const int &r0, const int &r1, const int &r2, const int &r3,
                                            const int &d0, const int &d1, const int &d2, const int &d3,
                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                                            const half *x1, const half *x2, const half *dy, half *dx1, half *dx2,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3,
                                            const int &r0, const int &r1, const int &r2, const int &r3,
                                            const int &d0, const int &d1, const int &d2, const int &d3,
                                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                                            const int64_t *x1, const int64_t *x2, const int64_t *dy,
                                            int64_t *dx1, int64_t *dx2, cudaStream_t stream);
template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                              const double *x1, const double *x2, const double *dy, double *dx1, double *dx2,
                              cudaStream_t stream);
template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                              const float *x1, const float *x2, const float *dy, float *dx1, float *dx2,
                              cudaStream_t stream);
template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                              const int *x1, const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream);
template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                              const half *x1, const half *x2, const half *dy, half *dx1, half *dx2,
                              cudaStream_t stream);
template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                              const int64_t *x1, const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2,
                              cudaStream_t stream);
template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const double *x1,
                            const double *x2, const double *dy, double *dx1, double *dx2, cudaStream_t stream);
template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const float *x1,
                            const float *x2, const float *dy, float *dx1, float *dx2, cudaStream_t stream);
template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int *x1,
                            const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream);
template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const half *x1,
                            const half *x2, const half *dy, half *dx1, half *dx2, cudaStream_t stream);
template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                            const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int64_t *x1,
                            const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream);

@@ -0,0 +1,38 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_

#include "plugin/device/gpu/hal/device/cuda_common.h"

enum BroadcastGradOpType {
  BROADCAST_GRAD_TYPE_MAXIMUM = 0,
  BROADCAST_GRAD_TYPE_MINIMUM = 1,
  BROADCAST_GRAD_TYPE_INVALID = 0xffffffff,
};

template <typename T>
void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                   const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                   const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const T *x1, const T *x2,
                   const T *dy, T *dx1, T *dx2, cudaStream_t stream);

template <typename T>
void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                     const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_

@ -16,8 +16,9 @@
|
|||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
|
||||
#include "include/cuda_fp16.h"
|
||||
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
|
||||
#include "plugin/device/gpu/hal/device/cuda_common.h"
|
||||
|
||||
// Basic function
|
||||
template <typename T>
|
||||
|
@ -565,30 +566,30 @@ void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *
|
|||
}
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const double *x0, const double *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const float *x0, const float *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const half *x0, const half *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const int *x0, const int *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const int8_t *x0, const int8_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op,
|
||||
const bool *x0, const bool *x1, bool *y, cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y,
|
||||
cudaStream_t stream);
|
||||
// Element-wise ArithMetic
|
||||
template <typename T, typename Func>
|
||||
__global__ void ElewiseArithKernel(const int nums, const T *x0, const T *x1, T *y) {
|
||||
|
@ -702,46 +703,46 @@ void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0,
|
|||
return ElewiseArithComplexKernel(nums, op, x0, x1, y, stream);
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
|
||||
const double *x0, const double *x1, double *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
|
||||
const float *x0, const float *x1, float *y, cudaStream_t stream);
|
||||
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const half *x0, const half *x1, half *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const int *x0, const int *x1, int *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op,
const bool *x0, const bool *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
const float *x1, Complex<float> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0,
const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
const double *x1, Complex<double> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0,
const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0,
const float *x1, Complex<float> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0,
const double *x1, Complex<double> *y, cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, double *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, int *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, int8_t *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, uint8_t *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, int64_t *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, int16_t *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1,
uint16_t *y, cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1,
uint32_t *y, cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1,
uint64_t *y, cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y,
cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0,
const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<float> *x0, const float *x1,
Complex<float> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const Complex<float> *x1,
Complex<float> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0,
const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex<double> *x0, const double *x1,
Complex<double> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const Complex<double> *x1,
Complex<double> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1,
Complex<float> *y, cudaStream_t stream);
template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1,
Complex<double> *y, cudaStream_t stream);

// Broadcast comparison
__device__ __forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; }
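Editorial note, not part of the revert: the Index() helper above is the heart of the broadcast kernels in this file. It collapses any size-1 dimension to offset 0, so a smaller input can be read at every output coordinate. A minimal host-side sketch with made-up shapes (the function name and shapes below are illustrative only):

// Illustrative only: the same index-collapsing rule as the Index() helper above,
// applied on the host to map output coordinates of a broadcast result back to the
// flat offset inside a smaller input tensor.
#include <cstddef>
#include <cstdio>

// A size-1 dimension is broadcast, so every output coordinate maps to offset 0 in it.
static size_t Index(size_t index, size_t dim) { return dim == 1 ? 0 : index; }

int main() {
  // Hypothetical shapes: input x has shape [1, 3], output y has shape [2, 3].
  const size_t x_dims[2] = {1, 3};
  const size_t y_dims[2] = {2, 3};
  for (size_t i = 0; i < y_dims[0]; ++i) {
    for (size_t j = 0; j < y_dims[1]; ++j) {
      size_t offset = Index(i, x_dims[0]) * x_dims[1] + Index(j, x_dims[1]);
      std::printf("y[%zu][%zu] reads x offset %zu\n", i, j, offset);
    }
  }
  return 0;
}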
@@ -835,42 +836,42 @@ void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t>
}
}

template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
const double *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
const float *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0,
const half *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0,
const int *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
const int8_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0,
const bool *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
const double *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0, const float *x1,
bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
const int8_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint8_t *x0,
const uint8_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int64_t *x0,
const int64_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int16_t *x0,
const int16_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint16_t *x0,
const uint16_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint32_t *x0,
const uint32_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint64_t *x0,
const uint64_t *x1, bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1,
bool *y, cudaStream_t stream);
// Broadcast Arithmetic
template <typename T, typename Func>
__global__ void BroadcastArithKernel(const size_t l0, const size_t l1, const size_t l2, const size_t l3,
@@ -1096,82 +1097,69 @@ void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector
}
}

template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const double *x0, const double *x1, double *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const float *x0, const float *x1, float *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0,
const half *x1, half *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0,
const int *x1, int *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0,
const bool *x1, bool *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<float> *x0, const Complex<float> *x1,
Complex<float> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<float> *x0, const float *x1, Complex<float> *y,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const float *x0, const Complex<float> *x1, Complex<float> *y,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<double> *x0, const Complex<double> *x1,
Complex<double> *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<double> *x0, const double *x1, Complex<double> *y,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const double *x0, const Complex<double> *x1, Complex<double> *y,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const double *x0, const double *x1, Complex<double> *y,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector<size_t> &x0_dims,
const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const float *x0, const float *x1, Complex<float> *y,
cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
const double *x1, double *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
const float *x1, float *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
half *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
int *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int8_t *x0,
const int8_t *x1, int8_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint8_t *x0,
const uint8_t *x1, uint8_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int64_t *x0,
const int64_t *x1, int64_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const int16_t *x0,
const int16_t *x1, int16_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint16_t *x0,
const uint16_t *x1, uint16_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint32_t *x0,
const uint32_t *x1, uint32_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const uint64_t *x0,
const uint64_t *x1, uint64_t *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1,
bool *y, cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<float> *x0, const Complex<float> *x1, Complex<float> *y,
cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<float> *x0, const float *x1, Complex<float> *y, cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
const Complex<float> *x1, Complex<float> *y, cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<double> *x0, const Complex<double> *x1, Complex<double> *y,
cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op,
const Complex<double> *x0, const double *x1, Complex<double> *y,
cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
const Complex<double> *x1, Complex<double> *y, cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const double *x0,
const double *x1, Complex<double> *y, cudaStream_t stream);
template void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const float *x0,
const float *x1, Complex<float> *y, cudaStream_t stream);

// BroadcastTo
template <typename T>
@@ -1198,24 +1186,24 @@ void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const siz
output_addr);
}

template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const double *input_addr, double *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const float *input_addr, float *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const half *input_addr, half *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const bool *input_addr, bool *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const double *input_addr,
double *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const float *input_addr,
float *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const half *input_addr,
half *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const int16_t *input_addr,
int16_t *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const int32_t *input_addr,
int32_t *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const int64_t *input_addr,
int64_t *output_addr, cudaStream_t stream);
template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const bool *input_addr,
bool *output_addr, cudaStream_t stream);
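Editorial note, not part of the revert: BroadcastTo takes four input extents, four output extents, and the two device buffers, expanding any size-1 input dimension to the matching output extent. A hedged host-side caller sketch; the function name ExampleBroadcastTo, the shapes, and the forward declaration are illustrative assumptions, and error handling is omitted:

// Hypothetical caller: expand a float tensor of shape [1, 1, 2, 3] to [2, 2, 2, 3]
// using the BroadcastTo instantiation above. Assumes this file is linked against
// broadcast_impl.cu and that the caller already owns a CUDA stream.
#include <cuda_runtime.h>
#include <vector>

template <typename T>
void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
                 const size_t &o1, const size_t &o2, const size_t &o3, const T *input_addr, T *output_addr,
                 cudaStream_t stream);  // declared in the broadcast header shown later in this diff

void ExampleBroadcastTo(cudaStream_t stream) {
  const size_t in_elems = 1 * 1 * 2 * 3;
  const size_t out_elems = 2 * 2 * 2 * 3;
  std::vector<float> host_in(in_elems, 1.0f);
  float *d_in = nullptr;
  float *d_out = nullptr;
  cudaMalloc(&d_in, in_elems * sizeof(float));
  cudaMalloc(&d_out, out_elems * sizeof(float));
  cudaMemcpyAsync(d_in, host_in.data(), in_elems * sizeof(float), cudaMemcpyHostToDevice, stream);
  // Input extents [1, 1, 2, 3] are expanded to output extents [2, 2, 2, 3].
  BroadcastTo<float>(1, 1, 2, 3, 2, 2, 2, 3, d_in, d_out, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_in);
  cudaFree(d_out);
}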
@@ -0,0 +1,89 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_

#include <vector>
#include "plugin/device/gpu/hal/device/cuda_common.h"
#include "utils/complex.h"

const float kFloatEplison = 1e-37;

enum BroadcastOpType {
  BROADCAST_TYPE_GREATER = 0,
  BROADCAST_TYPE_LESS = 1,
  BROADCAST_TYPE_MAXIMUM = 2,
  BROADCAST_TYPE_MINIMUM = 3,
  BROADCAST_TYPE_POWER = 4,
  BROADCAST_TYPE_REALDIV = 5,
  BROADCAST_TYPE_MUL = 6,
  BROADCAST_TYPE_SUB = 7,
  BROADCAST_TYPE_ADD = 8,
  BROADCAST_TYPE_FLOORDIV = 9,
  BROADCAST_TYPE_ABSGRAD = 10,
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_DIVNONAN = 12,
  BROADCAST_TYPE_EQUAL = 13,
  BROADCAST_TYPE_SQUARED_DIFFERENCE = 14,
  BROADCAST_TYPE_MOD = 15,
  BROADCAST_TYPE_FLOORMOD = 16,
  BROADCAST_TYPE_ATAN2 = 17,
  BROADCAST_TYPE_GREATER_EQUAL = 18,
  BROADCAST_TYPE_LESS_EQUAL = 19,
  BROADCAST_TYPE_NOT_EQUAL = 20,
  BROADCAST_TYPE_LOGICAL_AND = 21,
  BROADCAST_TYPE_LOGICAL_OR = 22,
  BROADCAST_TYPE_TRUNCATEDIV = 23,
  BROADCAST_TYPE_TRUNCATEMOD = 24,
  BROADCAST_TYPE_COMPLEX = 25,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};

template <typename T>
void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

template <typename T>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);

template <typename T1, typename T2, typename T3>
void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
Complex<T3> *y, cudaStream_t stream);

template <typename T>
void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, bool *y,
cudaStream_t stream);

template <typename T>
void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, T *y,
cudaStream_t stream);

template <typename T1, typename T2, typename T3>
void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
Complex<T3> *y, cudaStream_t stream);
template <typename T>
void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1,
Complex<T> *y, cudaStream_t stream);

template <typename T>
void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
const size_t &o1, const size_t &o2, const size_t &o3, const T *input_addr, T *output_addr,
cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
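Editorial note, not part of the revert: the header above is the host-facing API for the broadcast kernels, and each entry point is dispatched by the BroadcastOpType value. A hedged sketch of an element-wise Add through ElewiseArith; the include path and the function name LaunchElewiseAdd are assumptions, since the reverted header's exact location is not shown in this view, and d_x0/d_x1/d_y are equal-length device buffers prepared by the caller:

// Hypothetical usage sketch of the ElewiseArith entry point declared above.
#include <cuda_runtime.h>
#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"  // assumed path of the header above

void LaunchElewiseAdd(const float *d_x0, const float *d_x1, float *d_y, int nums, cudaStream_t stream) {
  // Equal-shape inputs take the element-wise path; mismatched shapes would instead go
  // through BroadcastArith with explicit x0/x1/y dimension vectors.
  ElewiseArith(nums, BROADCAST_TYPE_ADD, d_x0, d_x1, d_y, stream);
}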
@@ -17,8 +17,7 @@
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh"
#include "include/cuda_fp16.h"
#include "plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh"

template <typename T, typename S>
__global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) {

@@ -35,7 +34,7 @@ void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, c
CastAll<<<GET_BLOCKS(max), GET_THREADS, 0, stream>>>(inputs, output, num, size);
return;
}
template CUDA_LIB_EXPORT void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
template CUDA_LIB_EXPORT void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
template void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
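Editorial note, not part of the revert: CastAllKernel casts several tensors in one launch; it expects device-side arrays of input and output pointers plus a device array of per-tensor element counts, while "max" (the largest count) only sizes the grid via GET_BLOCKS. A hedged host-side sketch; ExampleCastAll and its parameters are illustrative assumptions, and error handling is omitted:

// Hypothetical caller: cast two half tensors to float in one CastAllKernel launch.
#include <algorithm>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <vector>

template <typename T, typename S>
void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size,
                   cudaStream_t stream);  // instantiated above for half -> float and float -> half

void ExampleCastAll(half *d_in0, half *d_in1, float *d_out0, float *d_out1, size_t n0, size_t n1,
                    cudaStream_t stream) {
  const size_t num = 2;
  std::vector<half *> h_inputs = {d_in0, d_in1};
  std::vector<float *> h_outputs = {d_out0, d_out1};
  std::vector<size_t> h_sizes = {n0, n1};

  // Marshal the pointer tables and element counts to the device, as the kernel reads them there.
  half **d_inputs = nullptr;
  float **d_outputs = nullptr;
  size_t *d_sizes = nullptr;
  cudaMalloc(&d_inputs, num * sizeof(half *));
  cudaMalloc(&d_outputs, num * sizeof(float *));
  cudaMalloc(&d_sizes, num * sizeof(size_t));
  cudaMemcpyAsync(d_inputs, h_inputs.data(), num * sizeof(half *), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_outputs, h_outputs.data(), num * sizeof(float *), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_sizes, h_sizes.data(), num * sizeof(size_t), cudaMemcpyHostToDevice, stream);

  CastAllKernel(d_inputs, d_outputs, std::max(n0, n1), num, d_sizes, stream);

  cudaStreamSynchronize(stream);
  cudaFree(d_inputs);
  cudaFree(d_outputs);
  cudaFree(d_sizes);
}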