Compile device plugins independently

2022-07-21 09:28:31 +08:00 · 2022-07-21 09:28:31 +08:00 · 95b5c5b23c
parent 11cee9c0fa
commit 95b5c5b23c
158 changed files with 1102 additions and 762 deletions
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -95,6 +95,22 @@ install(
    COMPONENT mindspore
 )

+if(ENABLE_D)
+    install(
+        TARGETS mindspore_ascend
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
+if(ENABLE_GPU)
+    install(
+            TARGETS mindspore_gpu
+            DESTINATION ${INSTALL_LIB_DIR}
+            COMPONENT mindspore
+    )
+endif()
+
 if(USE_GLOG)
    install(FILES ${glog_LIBPATH}/libmindspore_glog.so.0.4.0
      DESTINATION ${INSTALL_LIB_DIR} RENAME libmindspore_glog.so.0 COMPONENT mindspore)
--- a/cmake/package_tar.cmake
+++ b/cmake/package_tar.cmake
@@ -32,6 +32,22 @@ install(
        COMPONENT mindspore
 )

+if(ENABLE_D)
+    install(
+        TARGETS mindspore_ascend
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
+if(ENABLE_GPU)
+    install(
+        TARGETS mindspore_gpu
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
 if(USE_GLOG)
    file(GLOB_RECURSE GLOG_LIB_LIST ${glog_LIBPATH}/libmindspore_glog*)
    install(
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -14,9 +14,6 @@ set(FBS_FILES
 ms_build_flatbuffers(FBS_FILES ${CMAKE_CURRENT_SOURCE_DIR}../../schema generated_fbs_files ${SERVER_FLATBUFFER_OUTPUT})

 if(ENABLE_D OR ENABLE_ACL)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/lib64)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/lib64)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/lib64)
    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
    set(MINDSPORE_RPATH
@@ -24,9 +21,77 @@ if(ENABLE_D OR ENABLE_ACL)
    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
 endif()

-if(ENABLE_D)
-    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/ascend/kernel/aicpu/aicpu_ops)
-    add_subdirectory(plugin/device/ascend/kernel/aicpu/aicpu_ops)
+if(ENABLE_GPU)
+    find_package(CUDA REQUIRED)
+    find_package(Threads)
+    if(${CUDA_VERSION} VERSION_LESS ${MS_REQUIRE_CUDA_VERSION})
+        message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, \
+              but only CUDA ${CUDA_VERSION} found.")
+    endif()
+    enable_language(CUDA)
+    if(NOT CUDA_PATH OR CUDA_PATH STREQUAL "")
+        if(DEFINED ENV{CUDA_HOME} AND NOT $ENV{CUDA_HOME} STREQUAL "")
+            set(CUDA_PATH $ENV{CUDA_HOME})
+        else()
+            set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
+        endif()
+    endif()
+
+    if(DEFINED ENV{CUDNN_HOME} AND NOT "$ENV{CUDNN_HOME}" STREQUAL "")  # explicit cuDNN root from the environment
+        set(CUDNN_INCLUDE_DIR "$ENV{CUDNN_HOME}/include")
+        set(CUDNN_LIBRARY_DIR "$ENV{CUDNN_HOME}/lib64")
+        find_path(CUDNN_INCLUDE_PATH cudnn.h HINTS "${CUDNN_INCLUDE_DIR}" NO_DEFAULT_PATH)
+        find_library(CUDNN_LIBRARY_PATH "cudnn" HINTS "${CUDNN_LIBRARY_DIR}" NO_DEFAULT_PATH)
+        find_library(CUBLAS_LIBRARY_PATH "cublas" HINTS "${CUDNN_LIBRARY_DIR}")  # default paths are searched too
+        if(NOT CUDNN_INCLUDE_PATH)  # empty or CUDNN_INCLUDE_PATH-NOTFOUND
+            message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to \
+                    cudnn installation position.")
+        endif()
+        if(NOT CUDNN_LIBRARY_PATH)  # empty or CUDNN_LIBRARY_PATH-NOTFOUND
+            message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to \
+                    cudnn installation position.")
+        endif()
+    else()  # CUDNN_HOME unset or empty: probe the CUDA toolkit root plus CMake's default search prefixes
+        list(APPEND CMAKE_PREFIX_PATH ${CUDA_TOOLKIT_ROOT_DIR})
+        find_path(CUDNN_INCLUDE_PATH cudnn.h PATH_SUFFIXES cuda/include include cuda)
+        find_library(CUDNN_LIBRARY_PATH "cudnn" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
+        find_library(CUBLAS_LIBRARY_PATH "cublas" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
+        if(NOT CUDNN_INCLUDE_PATH)
+            message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put \
+                    cudnn header file in cuda include path or user include path(eg. /usr/local/cuda/include; \
+                    /usr/local/include; /usr/include), if cudnn library is installed in other position, please \
+                    set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h \
+                    in {CUDNN_HOME}/include.")
+        endif()
+        if(NOT CUDNN_LIBRARY_PATH)
+            message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put \
+                    cudnn library file in cuda library path or user library path(eg. /usr/local/cuda/lib64; \
+                    /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib), if cudnn library is installed in other \
+                    position, please set environment variable CUDNN_HOME to cudnn installation position, there should \
+                    be cudnn library file in {CUDNN_HOME}/lib64.")
+        endif()
+    endif()
+
+    if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "")
+        set(CUPTI_INCLUDE_DIRS  ${CUDA_PATH}/extras/CUPTI/include)
+    endif()
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${CUDA_PATH}/lib64)
+    message("CUDA_PATH: ${CUDA_PATH}")
+    message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
+    message("CUDNN_INCLUDE_PATH: ${CUDNN_INCLUDE_PATH}")
+    message("CUDNN_LIBRARY_PATH: ${CUDNN_LIBRARY_PATH}")
+    message("CUBLAS_LIBRARY_PATH: ${CUBLAS_LIBRARY_PATH}")
+    message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}")
+    include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
+
+    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
+    if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+        list(APPEND CUDA_NVCC_FLAGS -G)
+        message("CUDA_NVCC_FLAGS" ${CUDA_NVCC_FLAGS})
+    endif()
+    set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
+    add_compile_definitions(ENABLE_GPU)
 endif()

 if(ENABLE_CPU)
@@ -59,111 +124,6 @@ if(ENABLE_MPI)
    add_compile_definitions(ENABLE_MPI)
 endif()

-if(ENABLE_GPU)
-    find_package(CUDA REQUIRED)
-    find_package(Threads)
-    if(${CUDA_VERSION} VERSION_LESS ${MS_REQUIRE_CUDA_VERSION})
-        message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, \
-              but only CUDA ${CUDA_VERSION} found.")
-    endif()
-    enable_language(CUDA)
-    if(NOT CUDA_PATH OR CUDA_PATH STREQUAL "")
-        if(DEFINED ENV{CUDA_HOME} AND NOT $ENV{CUDA_HOME} STREQUAL "")
-            set(CUDA_PATH $ENV{CUDA_HOME})
-        else()
-            set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
-        endif()
-    endif()
-
-    if(DEFINED ENV{CUDNN_HOME} AND NOT $ENV{CUDNN_HOME} STREQUAL "")
-        set(CUDNN_INCLUDE_DIR $ENV{CUDNN_HOME}/include)
-        set(CUDNN_LIBRARY_DIR $ENV{CUDNN_HOME}/lib64)
-        find_path(CUDNN_INCLUDE_PATH cudnn.h HINTS ${CUDNN_INCLUDE_DIR} NO_DEFAULT_PATH)
-        find_library(CUDNN_LIBRARY_PATH "cudnn" HINTS ${CUDNN_LIBRARY_DIR} NO_DEFAULT_PATH)
-        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to \
-                    cudnn installation position.")
-        endif()
-        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to \
-                    cudnn installation position.")
-        endif()
-    else()
-        list(APPEND CMAKE_PREFIX_PATH  ${CUDA_TOOLKIT_ROOT_DIR})
-        find_path(CUDNN_INCLUDE_PATH cudnn.h PATH_SUFFIXES cuda/inclulde include cuda)
-        find_library(CUDNN_LIBRARY_PATH "cudnn" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
-        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put \
-                    cudnn header file in cuda include path or user include path(eg. /usr/local/cuda/include; \
-                    /usr/local/include; /usr/include), if cudnn library is installed in other position, please \
-                    set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h \
-                    in {CUDNN_HOME}/include.")
-        endif()
-        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put \
-                    cudnn library file in cuda library path or user library path(eg. /usr/local/cuda/lib64; \
-                    /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib), if cudnn library is installed in other \
-                    position, please set environment variable CUDNN_HOME to cudnn installation position, there should \
-                    be cudnn library file in {CUDNN_HOME}/lib64.")
-        endif()
-    endif()
-
-    if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "")
-        set(CUPTI_INCLUDE_DIRS  ${CUDA_PATH}/extras/CUPTI/include)
-    endif()
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${CUDA_PATH}/lib64)
-    message("CUDA_PATH: ${CUDA_PATH}")
-    message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
-    message("CUDNN_INCLUDE_PATH: ${CUDNN_INCLUDE_PATH}")
-    message("CUDNN_LIBRARY_PATH: ${CUDNN_LIBRARY_PATH}")
-    message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}")
-    include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
-
-    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-            "plugin/device/gpu/hal/device/*.cc"
-            "plugin/device/gpu/hal/device/*.cu"
-            "plugin/device/gpu/kernel/*.cu"
-            )
-
-    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/kernel/cuda_impl/cuda_ops/*.cu")
-
-    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
-    if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-        list(APPEND CUDA_NVCC_FLAGS -G)
-        message("CUDA_NVCC_FLAGS" ${CUDA_NVCC_FLAGS})
-    endif()
-    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/mpi/mpi_initializer.cc"
-                                  "plugin/device/gpu/hal/device/distribution/collective_wrapper.cc"
-                                  "plugin/device/gpu/hal/device/distribution/mpi_wrapper.cc"
-                                  "plugin/device/gpu/hal/device/distribution/nccl_wrapper.cc"
-                                  "plugin/device/gpu/hal/device/trt_loader.cc")
-
-    if(NOT ${TENSORRT_HOME} STREQUAL "")
-        find_path(TENSORRT_HOME_INCLUDE NvInfer.h HINTS ${TENSORRT_HOME}/include)
-        if(TENSORRT_HOME_INCLUDE STREQUAL TENSORRT_HOME_INCLUDE-NOTFOUND)
-          message(FATAL_ERROR "Tensor-RT dir not exist ${TENSORRT_HOME}")
-        endif()
-        message("Enable GPU inference. Tensor-RT include dir: ${TENSORRT_HOME_INCLUDE}")
-        set(ENABLE_GPU_INFER TRUE)
-        add_compile_definitions(ENABLE_GPU_INFER)
-        include_directories(${TENSORRT_HOME_INCLUDE})
-        list(APPEND GPU_SRC_LIST ${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/gpu/hal/device/trt_loader.cc)
-    endif()
-
-    set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    if(${CUDA_VERSION} VERSION_LESS 11.0)
-        string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    else()
-        string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    endif()
-    set_property(SOURCE ${GPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
-    cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
-    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
-    add_compile_definitions(ENABLE_GPU)
-    add_dependencies(gpu_cuda_lib proto_input)
-    add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops)
-endif()
-
 ## make protobuf files
 file(GLOB ONNX_PROTO "" ${CMAKE_SOURCE_DIR}/third_party/proto/onnx/onnx.proto)
 message("onnx proto path is :" ${ONNX_PROTO})
@@ -328,22 +288,11 @@ set(BACKEND_SUB_COMP
        runtime/hardware
        runtime/pynative
        runtime/data_queue
-        plugin/device/ascend/hal/device
-        plugin/device/ascend/hal/hardware
-        plugin/device/ascend/hal/hccl_adapter
-        plugin/device/ascend/hal/profiler
-        plugin/device/ascend/kernel
-        plugin/device/ascend/optimizer
        plugin/device/cpu/hal/device
        plugin/device/cpu/hal/hardware
        plugin/device/cpu/hal/profiler
        plugin/device/cpu/kernel
        plugin/device/cpu/optimizer
-        plugin/device/gpu/hal/device
-        plugin/device/gpu/hal/hardware
-        plugin/device/gpu/hal/profiler
-        plugin/device/gpu/kernel
-        plugin/device/gpu/optimizer
        transform/graph_ir
        )

@@ -370,14 +319,19 @@ endif()

 set_property(SOURCE ${BACKEND_SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
 add_library(mindspore_backend SHARED ${BACKEND_SUB_OBJECTS_SRC})
+add_library(mindspore_backend_common STATIC ${BACKEND_SUB_OBJECTS_SRC})
+
 if(MODE_ASCEND_ACL)
    add_library(mindspore_backend_static STATIC ${BACKEND_SUB_OBJECTS_SRC})
 endif()
+
 if(CMAKE_SYSTEM_NAME MATCHES "Windows")
    target_link_libraries(mindspore_backend PRIVATE mindspore::pybind11_module)
 endif()
+
 target_link_libraries(mindspore_backend PRIVATE mindspore_core mindspore_common proto_input mindspore::protobuf)
 target_link_libraries(mindspore_backend PRIVATE securec)
+
 if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
    set_target_properties(mindspore_backend PROPERTIES MACOSX_RPATH ON)
    set_target_properties(mindspore_backend PROPERTIES INSTALL_RPATH @loader_path)
@@ -388,17 +342,6 @@ endif()
 if(ENABLE_CPU)
    target_link_libraries(mindspore_backend PRIVATE mindspore::dnnl mindspore::mkldnn nnacl)
 endif()
-if(ENABLE_GPU)
-    message("add gpu lib to mindspore_backend")
-    target_link_libraries(mindspore_backend PRIVATE gpu_cuda_lib cublas cuda_ops
-            ${CUDA_PATH}/lib64/libcurand.so
-            ${CUDNN_LIBRARY_PATH}
-            ${CUDA_PATH}/lib64/libcudart.so
-            ${CUDA_PATH}/lib64/stubs/libcuda.so
-            ${CUDA_PATH}/lib64/libcusolver.so
-            ${CUDA_PATH}/lib64/libcufft.so
-            ${CUDA_PATH}/lib64/libcusparse.so)
-endif()

 if(NOT WIN32)
    target_link_libraries(mindspore_backend PRIVATE mindspore::ssl mindspore::crypto)
@@ -421,28 +364,7 @@ elseif(ENABLE_CPU AND NOT WIN32)
            -Wl,--no-as-needed mindspore::event_core ps_cache)
 endif()

-if(ENABLE_D)
-    find_library(GE_RUNNER ge_runner ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(GRAPH graph ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(HCCL hccl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    target_link_libraries(mindspore_backend PUBLIC ${GE_RUNNER} ${GRAPH} ${HCCL})
-endif()
-
 if(MODE_ASCEND_ALL)
-    MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
-    find_library(ERROR_MANAGER error_manager ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(RUNTIME_LIB runtime ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(TSDCLIENT tsdclient HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(DATATRANSFER datatransfer HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(PROFILING msprofiler ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(ACL ascendcl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(ACL_TDT_CHANNEL acl_tdt_channel ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(PLATFORM platform ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(OPT_FEATURE opt_feature ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(adump_server libadump_server.a ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(OPTILING optiling ${ASCEND_CANN_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH})
-    target_link_libraries(mindspore_backend PUBLIC ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER}
-            -Wl,--no-as-needed ${OPTILING} ${PLATFORM} ${ACL} ${ACL_TDT_CHANNEL} ${OPT_FEATURE} ${PROFILING})
    target_link_libraries(mindspore PUBLIC -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
 elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
    target_link_libraries(mindspore PUBLIC -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece
@@ -473,6 +395,18 @@ else()
 endif()
 set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH})

+if(ENABLE_D)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/ascend)
+    add_subdirectory(plugin/device/ascend)
+    target_link_libraries(mindspore_backend PRIVATE mindspore_ascend)
+endif()
+
+if(ENABLE_GPU)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/gpu)
+    add_subdirectory(plugin/device/gpu)
+    target_link_libraries(mindspore_backend PRIVATE mindspore_gpu)
+endif()
+
 set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})

 if(CMAKE_SYSTEM_NAME MATCHES "Windows")
@@ -526,16 +460,6 @@ if(ENABLE_MINDDATA)
    add_subdirectory(minddata/dataset)
 endif()

-if(MODE_ASCEND_ALL)
-    target_link_libraries(_c_expression PRIVATE ${adump_server})
-endif()
-
-if(ENABLE_D)
-    if(ENABLE_MPI)
-        set_target_properties(_ascend_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
-    endif()
-endif()
-
 if(ENABLE_TEST OR ENABLE_TESTCASES)
    include_directories(${CMAKE_BINARY_DIR})
    list(APPEND STUB_COMMON_SOURCE ${CMAKE_SOURCE_DIR}/tests/ut/cpp/stub/ge/ge_operator_stub.cc)
--- a/mindspore/ccsrc/backend/common/optimizer/helper.cc
+++ b/mindspore/ccsrc/backend/common/optimizer/helper.cc
@@ -734,10 +734,10 @@ AbstractBasePtrList RectifyAbstractFromRegAttr(const PrimitivePtr &primitive,
  MS_EXCEPTION_IF_NULL(ms_context);
  auto device = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  if (device == kGPUDevice) {
-    if (DynamicShapeConstInputToAttrGPU.find(primitive->name()) != DynamicShapeConstInputToAttrGPU.end()) {
+    if (IsOneOfDynamicShapeConstInputToAttrGPU(primitive->name())) {
      return input_abstract;
    }
-  } else if (DynamicShapeConstInputToAttr.find(primitive->name()) != DynamicShapeConstInputToAttr.end()) {
+  } else if (IsOneOfDynamicShapeConstInputToAttr(primitive->name())) {
    return input_abstract;
  }
  auto convert_input_list = reg.GetConstInputAttrInfo();
--- a/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc
+++ b/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc
@@ -133,7 +133,7 @@ const AnfNodePtr CustomOpRegInfoToAttr::Process(const FuncGraphPtr &, const AnfN
  MS_EXCEPTION_IF_NULL(primitive);
  auto func_type = common::AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrFuncType);
  // AKG/AICPU need to process attr, TBE will process later in the json creating phase.
-  if (kCustomTypeAkg.find(func_type) == kCustomTypeAkg.end() || func_type == kCustomTypeAICPU) {
+  if (!IsOneOfCustomAkgType(func_type) || func_type == kCustomTypeAICPU) {
    return nullptr;
  }
  // Early return if current node does not have attr
--- a/mindspore/ccsrc/backend/common/somas/somas.h
+++ b/mindspore/ccsrc/backend/common/somas/somas.h
@@ -35,6 +35,7 @@
 #include "include/common/utils/anfalgo.h"
 #include "backend/common/session/kernel_graph.h"
 #include "runtime/hardware/device_type.h"
+#include "include/backend/visible.h"

 namespace mindspore {
 namespace somas {
@@ -248,7 +249,7 @@ using SomasPtr = std::shared_ptr<Somas>;
 using SomasCreator = std::function<std::shared_ptr<Somas>()>;

 // @todo will delete when old runtime remove
-class SomasManager {
+class BACKEND_EXPORT SomasManager {
 public:
  static SomasManager &Instance() {
    static SomasManager instance{};
--- a/mindspore/ccsrc/backend/graph_compiler/backend.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/backend.cc
@@ -399,7 +399,7 @@ void ClearInputDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *d
 }

 bool OpInBlackList(const session::BackendOpRunInfoPtr &op_run_info) {
-  return kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) != kOpCacheBlackList.end();
+  return IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name);
 }

 int GetExecutionMode() {
--- a/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc
@@ -28,9 +28,6 @@
 #include "utils/ms_context.h"
 #include "ps/ps_context.h"
 #include "utils/anf_utils.h"
-#ifdef ENABLE_D
-#include "include/transform/graph_ir/utils.h"
-#endif
 namespace mindspore {
 const char kMsConvert[] = "ms";
 const char kMsVm[] = "vm";
--- a/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc
@@ -194,12 +194,17 @@ void CallbackImpl::ResetKernelInfo(const AnfNodePtr &node) {
  auto cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (GetTargetFromContext() == kAscendDevice) {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
    auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kAscendDevice);
+    MS_EXCEPTION_IF_NULL(kernel_info_setter);
    kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
  } else if (GetTargetFromContext() == kGPUDevice) {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
    auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kGPUDevice);
+    MS_EXCEPTION_IF_NULL(kernel_info_setter);
    kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
  } else {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
    auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kCPUDevice);
    if (kernel_info_setter != nullptr) {
      kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
--- a/mindspore/ccsrc/common/graph_kernel/model/op_register.h
+++ b/mindspore/ccsrc/common/graph_kernel/model/op_register.h
@@ -21,10 +21,11 @@

 #include "utils/hash_map.h"
 #include "common/graph_kernel/model/op_node.h"
+#include "include/backend/visible.h"

 namespace mindspore::graphkernel::inner {
 using CreatorFunc = std::function<PrimOpPtr(const std::string &)>;
-class OpRegistry {
+class BACKEND_EXPORT OpRegistry {
 public:
  static OpRegistry &Instance() {
    static OpRegistry instance{};
--- a/mindspore/ccsrc/cxx_api/CMakeLists.txt
+++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt
@@ -62,7 +62,13 @@ if(ENABLE_D)
          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/array.cc"
          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/map.cc"
          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/arrangement.cc"
-          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/shape_util.cc")
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/shape_util.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/pattern_engine.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/helper.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/node_pass.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/visit.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/kernel/kernel_build_info.cc"
+          "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/device/kernel_info.cc")
 endif()

 if(NOT ENABLE_TESTCASES AND NOT BUILD_LITE)
@@ -70,11 +76,22 @@ if(NOT ENABLE_TESTCASES AND NOT BUILD_LITE)
    set(MSLIB_SRC ${MSLIB_SRC} ${CMAKE_SOURCE_DIR}/mindspore/core/utils/status.cc)
 endif()

+if(ENABLE_D OR ENABLE_ACL)
+    list(APPEND MSLIB_SRC
+      "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_placeholder_for_dynamic_rnn.cc")
+endif()
+
+if(ENABLE_GPU)
+    list(APPEND MSLIB_SRC "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc")
+endif()
+
 if(BUILD_LITE)
    list(APPEND MSLIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/../../../mindspore/ccsrc/utils/config_manager.cc")
    file(GLOB_RECURSE ACL_REMOVE_SRC ${CMAKE_CURRENT_SOURCE_DIR}
            "model/acl/acl_vm/*.cc"
            )
+    list(REMOVE_ITEM MSLIB_SRC
+      "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_placeholder_for_dynamic_rnn.cc")
    list(REMOVE_ITEM MSLIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/akg_kernel_register.cc"
            "${CMAKE_CURRENT_SOURCE_DIR}/model/acl/acl_model_multi.cc"
            "${CMAKE_CURRENT_SOURCE_DIR}/model/acl/acl_model.cc"
@@ -157,7 +174,8 @@ if(ENABLE_D)
 endif()

 if(ENABLE_GPU)
-    target_link_libraries(mindspore_shared_lib PRIVATE  gpu_cuda_lib cublas cuda_ops
+    target_link_libraries(mindspore_shared_lib PRIVATE cuda_ops
+                          ${CUBLAS_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcurand.so
                          ${CUDNN_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcudart.so
--- a/mindspore/ccsrc/debug/CMakeLists.txt
+++ b/mindspore/ccsrc/debug/CMakeLists.txt
@@ -15,9 +15,6 @@ set(_OFFLINE_SRC_LIST

 if(ENABLE_DUMP_IR)
    file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc")
-    if(NOT ENABLE_D)
-        list(REMOVE_ITEM _RDR_SRC_LIST "rdr/task_debug_info_recorder.cc")
-    endif()
 endif()

 if("${ENABLE_HIDDEN}" STREQUAL "OFF")
--- a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <vector>
 #include <string>
+#include <map>

 #include "frontend/optimizer/optimizer_caller.h"
 #include "frontend/optimizer/anf_visitor.h"
@@ -40,6 +41,8 @@ namespace irpass {
 // {prim::kPrimListGetItem, L, N}
 // {prim::kPrimTupleSetItem, T, N, Z}
 // {prim::kPrimListSetItem, L, N, Z}
+const std::map<std::string, size_t> kSliceAttrToIndex = {{kSliceStart, 1}, {kSliceStop, 2}, {kSliceStep, 3}};
+
 class TupleListConvertItemIndexToPositive : public AnfVisitor {
 public:
  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
--- a/mindspore/ccsrc/include/common/utils/utils.h
+++ b/mindspore/ccsrc/include/common/utils/utils.h
@@ -29,6 +29,7 @@

 #include "utils/log_adapter.h"
 #include "ir/dtype/type.h"
+#include "include/common/visible.h"

 namespace mindspore {
 // op name. Op which not exists in operator/ops.h, so define it's name here
@@ -661,7 +662,6 @@ constexpr auto kCustomTypePyfunc = "pyfunc";
 constexpr auto kCustomTypeTbe = "tbe";
 constexpr auto kCustomTypeAICPU = "aicpu";
 constexpr auto kCustomTypeHybrid = "hybrid";
-const std::set<std::string> kCustomTypeAkg = {"ir_builder", "tvm_compute", "hybrid"};

 // primal attr key name
 constexpr auto kPrimalAttrForwardNodeName = "forward_node_name";
@ -791,145 +791,23 @@ constexpr auto kOpFormat_FRACTAL_Z_3D = "FRACTAL_Z_3D";
 constexpr auto kOpFormat_FRACTAL_ZN_LSTM = "FRACTAL_ZN_LSTM";
 constexpr auto kOpFormat_FRACTAL_ZN_RNN = "FRACTAL_ZN_RNN";
 constexpr auto kOpFormat_ND_RNN_BIAS = "ND_RNN_BIAS";
-
-const std::set<std::string> kOpFormatList = {kOpFormat_DEFAULT,
-                                             kOpFormat_NC1KHKWHWC0,
-                                             kOpFormat_ND,
-                                             kOpFormat_NCHW,
-                                             kOpFormat_NHWC,
-                                             kOpFormat_HWCN,
-                                             kOpFormat_NC1HWC0,
-                                             kOpFormat_FRAC_Z,
-                                             kOpFormat_C1HWNCoC0,
-                                             kOpFormat_FRAC_NZ,
-                                             kOpFormat_NC1HWC0_C04,
-                                             kOpFormat_FRACTAL_Z_C04,
-                                             kOpFormat_NDHWC,
-                                             kOpFormat_FRACTAL_ZN_LSTM,
-                                             kOpFormat_FRACTAL_ZN_RNN,
-                                             kOpFormat_ND_RNN_BIAS,
-                                             kOpFormat_NDC1HWC0,
-                                             kOpFormat_NCDHW,
-                                             kOpFormat_FRACTAL_Z_3D,
-                                             kOpFormat_DHWNC,
-                                             kOpFormat_DHWCN};
-
 constexpr auto kSliceStart = "start";
 constexpr auto kSliceStop = "stop";
 constexpr auto kSliceStep = "step";
-const std::map<std::string, size_t> kSliceAttrToIndex = {{kSliceStart, 1}, {kSliceStop, 2}, {kSliceStep, 3}};

-const std::set<std::string> kDefaultCompatibleFormat = {kOpFormat_ND, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_HWCN,
-                                                        kOpFormat_NCDHW};
-
-const std::set<std::string> kOptOperatorSet = {kMomentumOpName,
-                                               kApplyMomentumOpName,
-                                               kApplyAdadeltaOpName,
-                                               kApplyAdagradOpName,
-                                               kApplyAdagradDAName,
-                                               kApplyAdamOpName,
-                                               kApplyAdaMaxOpName,
-                                               kApplyAddSignOpName,
-                                               kApplyCenteredRMSPOpName,
-                                               kApplyFtrlOpName,
-                                               kApplyFtrlV2OpName,
-                                               kApplyGradientDescentOpName,
-                                               kApplyPowerSignOpName,
-                                               kApplyProximalAdagradOpName,
-                                               kApplyProximalGradientDescentOpName,
-                                               kApplyRMSPropOpName,
-                                               kAdamApplyOneWithDecayOpName,
-                                               kAdamApplyOneWithDecayAssignOpName,
-                                               kFusedAdamWeightDecayName,
-                                               kAdamWeightDecayName,
-                                               kFusedCastAdamWeightDecayName,
-                                               kFusedAdamName,
-                                               kFusedAdaFactorName,
-                                               kFusedAdaFactorWithGlobalNormName,
-                                               kFusedSparseAdamName,
-                                               kFusedMulApplyMomentumOpName,
-                                               kFusedWeightScaleApplyMomentum,
-                                               kFusedScaleApplyMomentum,
-                                               kApplyCenteredRMSPropOpName,
-                                               kFusedSparseFtrlName,
-                                               kFusedSparseProximalAdagradName,
-                                               kFusedSparseLazyAdamName,
-                                               kSparseApplyFtrlName,
-                                               kSparseApplyFtrlV2Name,
-                                               kSGDName,
-                                               kLARSUpdateName,
-                                               kCombineMomentumWeightOpName,
-                                               kCombineMomentumOpName,
-                                               kScatterAddOpName,
-                                               kScatterUpdateOpName,
-                                               kSparseApplyProximalAdagradOpName};
-
-const std::set<std::string> kNodeWithSeedOperators = {kGammaOpName,          kPoissonOpName,    kStandardLaplaceOpName,
-                                                      kStandardNormalOpName, kUniformIntOpName, kUniformRealOpName};
-const std::set<std::string> kPosteriorOperatorSet = {kPullOpName};
-
-const std::set<std::string> kOpCacheBlackList = {kUniformCandidateSamplerOpName, kInitDatasetQueueOpName,
-                                                 kGetNextOpName};
-
-const std::set<std::string> kOpNotSupportMultiThreadExecList = {kAvgPoolOpName, kAvgPoolGradOpName, kMaxPoolOpName,
-                                                                kBatchNorm, kBatchNormGradOpName};
-
-const std::set<std::string> kHWSpecialFormatSet = {
-  kOpFormat_FRACTAL_Z_3D,   kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0,       kOpFormat_FRAC_NZ,
-  kOpFormat_C1HWNCoC0,      kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04, kOpFormat_FRACTAL_ZN_LSTM,
-  kOpFormat_FRACTAL_ZN_RNN, kOpFormat_NDC1HWC0,    kOpFormat_FRAC_Z};
-
-const std::set<TypeId> kFloatDataTypeSet = {kNumberTypeFloat16, kNumberTypeFloat32};
-
-const std::set<std::string> kComputeDepend = {kUniqueOpName,
-                                              kUniqueConsecutiveOpName,
-                                              kComputeAccidentalHitsOpName,
-                                              kSubAndFilterOpName,
-                                              kPadAndShiftOpName,
-                                              kCTCGreedyDecoderOpName,
-                                              kMaskedSelectOpName,
-                                              kDynamicStitchOpName,
-                                              kGetNextOpName,
-                                              kListDiffOpName,
-                                              kNonMaxSuppressionV3OpName,
-                                              kNonMaxSuppressionWithOverlapsOpName,
-                                              kCoalesceOpName,
-                                              kTruncatedNormal,
-                                              kNonDeterministicInts,
-                                              kFractionalAvgPoolGradOpName,
-                                              kDenseToDenseSetOperation,
-                                              kSegmentMaxOpName,
-                                              kCSRSparseMatrixToSparseTensorOpName,
-                                              kSegmentMinOpName,
-                                              kLuUnpackOpName,
-                                              kSegmentSumOpName,
-                                              kResizeBicubicOpName,
-                                              kResizeAreaOpName,
-                                              kSegmentMeanOpName,
-                                              kSegmentProdOpName,
-                                              kNonZeroOpName,
-                                              kSparseSparseMinimumOpName,
-                                              kRpcRecvOpName,
-                                              kAdaptiveMaxPool3DGradOpName};
-
-const std::set<std::string> k3DFormatSet = {kOpFormat_NCDHW, kOpFormat_NDC1HWC0, kOpFormat_FRACTAL_Z_3D,
-                                            kOpFormat_NDHWC, kOpFormat_DHWCN,    kOpFormat_DHWNC};
-
-const std::set<std::string> kNoPaddingFormatSet = {kOpFormat_ChannelLast, kOpFormat_FRAC_NZ, kOpFormat_FRACTAL_ZN_RNN,
-                                                   kOpFormat_ND_RNN_BIAS};
-
-const std::set<std::string> DynamicShapeConstInputToAttr = {
-  kCastOpName,      kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName,
-  kReduceMaxOpName, kReduceAllOpName,  kReduceAnyOpName,       kConcatOpName,    kTransposeOpName};
-
-const std::set<std::string> DynamicShapeConstInputToAttrCPU = {
-  kCastOpName,      kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName, kReduceMaxOpName,
-  kReduceAllOpName, kReduceAnyOpName,  kConcatOpName,          kReduceSumOpName, kTransposeOpName};
-
-const std::set<std::string> DynamicShapeConstInputToAttrGPU = {
-  kCastOpName,      kExpandDimsOpName, kReshapeOpName,    kEmbeddingLookupOpName, kTransposeOpName,
-  kReduceSumOpName, kReduceMinOpName,  kReduceMeanOpName, kReduceMaxOpName,       kReduceAllOpName,
-  kReduceAnyOpName, kConcatOpName,     kScatterNdOpName,  kGatherV2OpName,        kAvgPool3DGradOpName};
+COMMON_EXPORT bool IsOneOfCustomAkgType(const std::string &name);  // replaces kCustomTypeAkg lookups
+COMMON_EXPORT bool IsOneOfOperator(const std::string &name);  // presumably kOptOperatorSet; TODO confirm
+COMMON_EXPORT bool IsOneOfPosteriorOperator(const std::string &name);  // presumably kPosteriorOperatorSet
+COMMON_EXPORT bool IsOneOfCacheBlackList(const std::string &name);  // presumably kOpCacheBlackList
+COMMON_EXPORT bool IsOneOfNotSupportMultiThreadExec(const std::string &name);  // presumably the multi-thread list
+COMMON_EXPORT bool IsOneOf3DFormat(const std::string &format);  // replaces k3DFormatSet lookups
+COMMON_EXPORT bool IsOneOfNoPaddingFormat(const std::string &format);  // replaces kNoPaddingFormatSet lookups
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttr(const std::string &name);  // presumably the same-named set
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttrCPU(const std::string &name);  // presumably the CPU set
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttrGPU(const std::string &name);  // presumably the GPU set
+COMMON_EXPORT bool IsOneOfComputeDepend(const std::string &name);  // replaces kComputeDepend lookups
+COMMON_EXPORT bool IsOneOfHWSpecialFormat(const std::string &format);  // replaces kHWSpecialFormatSet lookups
+COMMON_EXPORT bool IsOneOfFormat(const std::string &format);  // replaces kOpFormatList lookups

 // The map between kernel's output and input ref relationship.
 // Key is the output index while the value is input index which will be used as the reference of output.
--- a/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
+++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
@ -21,12 +21,13 @@
 #include <utility>
 #include <memory>
 #include <string>
+#include "include/backend/visible.h"

 namespace mindspore {
 namespace kernel {
 using AkgKernelBuildCreator = std::function<std::shared_ptr<AkgKernelBuilder>()>;

-class AkgKernelBuildManager {
+class BACKEND_EXPORT AkgKernelBuildManager {
 public:
  static AkgKernelBuildManager &Instance();
  void Register(const std::string &device_type, AkgKernelBuildCreator &&creator);
--- a/mindspore/ccsrc/kernel/graph_kernel_info.h
+++ b/mindspore/ccsrc/kernel/graph_kernel_info.h
@ -25,6 +25,7 @@
 #include "ir/dtype.h"
 #include "ir/kernel_info_dev.h"
 #include "kernel/kernel.h"
+#include "include/backend/visible.h"
 namespace mindspore {
 class GraphKernelInfo {
 public:
@ -35,7 +36,7 @@ class GraphKernelInfo {

 using GraphKernelInfoCreator = std::function<std::shared_ptr<GraphKernelInfo>()>;

-class GraphKernelInfoManager {
+class BACKEND_EXPORT GraphKernelInfoManager {
 public:
  static GraphKernelInfoManager &Instance() {
    static GraphKernelInfoManager instance{};
@ -52,6 +53,7 @@ class GraphKernelInfoManager {
      MS_EXCEPTION_IF_NULL(iter->second);
      return (iter->second)();
    }
+    MS_LOG(WARNING) << "Can not get a graph kernel info ptr on device: " << device_type;
    return nullptr;
  }

--- a/mindspore/ccsrc/kernel/oplib/oplib.cc
+++ b/mindspore/ccsrc/kernel/oplib/oplib.cc
@ -67,7 +67,6 @@ constexpr auto kFormat = "format";
 constexpr auto kNeedCompile = "need_compile";
 constexpr auto kShape = "shape";
 constexpr auto kProcessor = "processor";
-std::multimap<std::string, std::shared_ptr<OpInfo>> OpLib::op_info_;

 static std::string ImplTypeToStr(OpImplyType impl_type) {
  switch (impl_type) {
--- a/mindspore/ccsrc/kernel/oplib/oplib.h
+++ b/mindspore/ccsrc/kernel/oplib/oplib.h
@ -27,16 +27,16 @@

 namespace mindspore {
 namespace kernel {
-class OpLib {
+class BACKEND_EXPORT OpLib {
 public:
  OpLib() = default;
  virtual ~OpLib() = default;
-  BACKEND_EXPORT static bool RegOp(const std::string &json_string, const std::string &impl_path);
+  static bool RegOp(const std::string &json_string, const std::string &impl_path);
  static std::shared_ptr<OpInfo> FindOp(const std::string &op_name, OpImplyType imply_type,
                                        bool is_dynamic_shape = false);

 protected:
-  static std::multimap<std::string, std::shared_ptr<OpInfo>> op_info_;
+  inline static std::multimap<std::string, std::shared_ptr<OpInfo>> op_info_ = {};

 private:
  static bool RegOpFromLocalInfo();
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@ -35,7 +35,7 @@
 #ifdef ENABLE_GPU_COLLECTIVE
 #include "plugin/device/gpu/hal/device/distribution/collective_init.h"
 #else
-#include "plugin/device/gpu/hal/device/distribution/collective_fake_init.h"
+#include "runtime/collective/collective_fake_init.h"
 #endif
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
 #include "ps/util.h"
--- a/mindspore/ccsrc/plugin/device/ascend/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/ascend/CMakeLists.txt
@ -0,0 +1,85 @@
+# Header search path for everything declared in this directory and below:
+# this source directory first, then the top-level build tree.
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_BINARY_DIR}
+)
+
+# Append the Ascend locations under /usr/local/Ascend to MINDSPORE_RPATH.
+# Order is significant and kept as: the three lib64 directories, the top-level
+# op_tiling directory, then the op_tiling directory of each install root.
+if(ENABLE_D OR ENABLE_ACL)
+    set(ascend_install_roots
+            /usr/local/Ascend/nnae/latest
+            /usr/local/Ascend/ascend-toolkit/latest
+            /usr/local/Ascend/latest)
+    set(ascend_op_tiling_suffix opp/op_impl/built-in/ai_core/tbe/op_tiling)
+
+    foreach(ascend_root ${ascend_install_roots})
+        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${ascend_root}/lib64)
+    endforeach()
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/${ascend_op_tiling_suffix})
+    foreach(ascend_root ${ascend_install_roots})
+        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${ascend_root}/${ascend_op_tiling_suffix})
+    endforeach()
+endif()
+
+########### mindspore_ascend.so #####
+# Sub-directories that contribute object files to mindspore_ascend.
+set(ASCEND_SUB_COMP hal/device hal/hardware hal/hccl_adapter hal/profiler kernel optimizer)
+
+# Add every component. A component only contributes when it defines the object library
+# _mindspore_plugin_device_ascend_<path with "/" replaced by "_">_obj; that library is
+# then ordered after proto_input and its objects are queued for the shared library.
+foreach(a_comp IN LISTS ASCEND_SUB_COMP)
+    add_subdirectory(${a_comp})
+    string(REPLACE "/" "_" sub ${a_comp})
+    set(ascend_obj_target _mindspore_plugin_device_ascend_${sub}_obj)
+    if(TARGET ${ascend_obj_target})
+        add_dependencies(${ascend_obj_target} proto_input)
+        list(APPEND ASCEND_SUB_OBJECTS_SRC $<TARGET_OBJECTS:${ascend_obj_target}>)
+    endif()
+endforeach()
+
+# The plugin library is built solely from the objects collected from the components.
+add_library(mindspore_ascend SHARED ${ASCEND_SUB_OBJECTS_SRC})
+# Installed copy resolves its dependencies relative to its own location.
+set_target_properties(mindspore_ascend PROPERTIES INSTALL_RPATH $ORIGIN)
+# Only mindspore_backend_common is propagated to consumers; the rest is link-only.
+# Item order matches the link order and must not be rearranged.
+target_link_libraries(mindspore_ascend
+        PUBLIC
+            mindspore_backend_common
+        PRIVATE
+            mindspore_core
+            mindspore_common
+            proto_input
+            mindspore::protobuf
+            securec
+            mindspore::dnnl
+            mindspore::mkldnn
+            mindspore::ssl
+            mindspore::crypto
+            nnacl)
+
+# debugger: link grpc (Ascend builds only); --no-as-needed keeps the library
+# linked even when no symbol is referenced directly.
+if(ENABLE_DEBUGGER AND ENABLE_D)
+    target_link_libraries(mindspore_ascend PRIVATE -Wl,--no-as-needed mindspore::grpc++)
+endif()
+
+# Ascend build: locate ge_runner, graph and hccl in the CANN/toolkit runtime
+# directories and link them publicly, then add the event libraries and ps_cache.
+if(ENABLE_D)
+    set(ascend_runtime_dirs ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(GE_RUNNER ge_runner ${ascend_runtime_dirs})
+    find_library(GRAPH graph ${ascend_runtime_dirs})
+    find_library(HCCL hccl ${ascend_runtime_dirs})
+    target_link_libraries(mindspore_ascend
+            PUBLIC
+                ${GE_RUNNER}
+                ${GRAPH}
+                ${HCCL}
+            PRIVATE
+                mindspore::event
+                mindspore::event_pthreads
+                mindspore::event_openssl
+                -Wl,--no-as-needed
+                mindspore::event_core
+                ps_cache)
+endif()
+
+# MODE_ASCEND_ALL: locate the remaining Ascend libraries and link them.
+# tsdclient and datatransfer are looked up with HINTS, the others with plain
+# search paths; optiling is searched in the OPP directories instead.
+if(MODE_ASCEND_ALL)
+    message("USE DAV LIB PATH: ${ASCEND_PATH}")
+    set(ascend_runtime_dirs ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    set(ascend_opp_dirs ${ASCEND_CANN_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH})
+
+    find_library(ERROR_MANAGER error_manager ${ascend_runtime_dirs})
+    find_library(RUNTIME_LIB runtime ${ascend_runtime_dirs})
+    find_library(TSDCLIENT tsdclient HINTS ${ascend_runtime_dirs})
+    find_library(DATATRANSFER datatransfer HINTS ${ascend_runtime_dirs})
+    find_library(PROFILING msprofiler ${ascend_runtime_dirs})
+    find_library(ACL ascendcl ${ascend_runtime_dirs})
+    find_library(ACL_TDT_CHANNEL acl_tdt_channel ${ascend_runtime_dirs})
+    find_library(PLATFORM platform ${ascend_runtime_dirs})
+    find_library(OPT_FEATURE opt_feature ${ascend_runtime_dirs})
+    find_library(adump_server libadump_server.a ${ascend_runtime_dirs})
+    find_library(OPTILING optiling ${ascend_opp_dirs})
+
+    # Everything after -Wl,--no-as-needed stays linked even if unreferenced.
+    target_link_libraries(mindspore_ascend
+            PUBLIC
+                ${RUNTIME_LIB}
+                ${TSDCLIENT}
+                ${DATATRANSFER}
+                ${ERROR_MANAGER}
+                -Wl,--no-as-needed
+                ${OPTILING}
+                ${PLATFORM}
+                ${ACL}
+                ${ACL_TDT_CHANNEL}
+                ${OPT_FEATURE}
+                ${PROFILING}
+            PRIVATE
+                ${adump_server})
+endif()
+
+# With both Ascend and MPI enabled, give _ascend_mpi the accumulated
+# MINDSPORE_RPATH as its install-time rpath.
+if(ENABLE_D AND ENABLE_MPI)
+    set_target_properties(_ascend_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
+endif()
+
+# Build the AICPU ops sub-project; its directory is added to the include path
+# first so the sub-directory inherits it.
+set(ascend_aicpu_ops_dir kernel/aicpu/aicpu_ops)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${ascend_aicpu_ops_dir})
+add_subdirectory(${ascend_aicpu_ops_dir})
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h
@ -24,10 +24,11 @@
 #include "runtime/hardware/device_context_manager.h"
 #include "runtime/data_queue/data_queue.h"
 #include "runtime/rt.h"
+#include "include/backend/visible.h"

 namespace mindspore {
 namespace device {
-class AscendDataQueueDynamic : public DataQueue {
+class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
 public:
  explicit AscendDataQueueDynamic(const size_t capacity);
  virtual ~AscendDataQueueDynamic() = default;
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc
@ -83,8 +83,7 @@ string GetPriorityMatchFormat(const CNodePtr &cnode) {
  size_t input_num = common::AnfAlgo::GetInputTensorNum(cnode);
  for (size_t index = 0; index < input_num; ++index) {
    auto pre_output_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, index);
-    if (AnfAlgo::IsFeatureMapInput(cnode, index) &&
-        kHWSpecialFormatSet.find(pre_output_format) != kHWSpecialFormatSet.end()) {
+    if (AnfAlgo::IsFeatureMapInput(cnode, index) && IsOneOfHWSpecialFormat(pre_output_format)) {
      priority_matched_format = !is_init ? pre_output_format : priority_matched_format;
      is_init = true;
    }
@ -494,7 +493,7 @@ KernelSelectStatus SelectCustomKernelInfo(const CNodePtr &kernel_node, KernelTyp
  auto func_type = common::AnfAlgo::GetNodeAttr<std::string>(kernel_node, kAttrFuncType);
  if (func_type == kCustomTypeTbe) {
    *kernel_type = KernelType::TBE_KERNEL;
-  } else if (kCustomTypeAkg.find(func_type) != kCustomTypeAkg.end()) {
+  } else if (IsOneOfCustomAkgType(func_type)) {
    *kernel_type = KernelType::AKG_KERNEL;
  } else if (func_type == kCustomTypeAICPU) {
    *kernel_type = KernelType::AICPU_KERNEL;
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc
@ -27,6 +27,7 @@
 #include "runtime/base.h"
 #include <nlohmann/json.hpp>
 #include "plugin/device/ascend/hal/device/profiling/profiling_utils.h"
+#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"

 using mindspore::device::ascend::ProfilingUtils;

--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc
@ -20,6 +20,7 @@
 #include "plugin/device/ascend/kernel/ascend_kernel_mod.h"
 #include "include/common/utils/utils.h"
 #include "backend/common/session/kernel_graph.h"
+#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"

 namespace mindspore {
 namespace device {
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.cc
@ -14,7 +14,7 @@
 * limitations under the License.
 */

-#include "ps/ps_cache/ascend/ascend_ps_cache.h"
+#include "plugin/device/ascend/hal/device/ps/ascend_ps_cache.h"
 #include <google/protobuf/text_format.h>
 #include <string>
 #include <vector>
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.h
@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
-#define MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_

 #include <string>
 #include <vector>
@ -72,4 +72,4 @@ class AscendPsCache : public PsCacheBasic {
 }  // namespace ascend
 }  // namespace ps
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.cc
@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include "debug/rdr/task_debug_info_recorder.h"
+#include "plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h"
 #include <utility>
 #include "plugin/device/ascend/hal/device/tasksink/task_generator.h"
 #include "include/common/debug/rdr/recorder_manager.h"
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h
@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifndef MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
-#define MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
 #include <vector>
 #include <string>
 #include <memory>
@ -49,4 +49,4 @@ bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
                         const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
 }  // namespace RDR
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc
@ -28,7 +28,7 @@
 #include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
 #endif
 #ifdef ENABLE_DUMP_IR
-#include "debug/rdr/task_debug_info_recorder.h"
+#include "plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h"
 #endif
 #include "mindspore/core/utils/file_utils.h"

--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h
@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_UTILS_H_

 #include <string>
+#include <set>
 #include "plugin/device/ascend/hal/hardware/ascend_device_context.h"
 #include "backend/common/session/kernel_graph.h"

--- a/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h
@ -23,6 +23,7 @@
 #include <vector>
 #include <memory>
 #include "utils/ms_context.h"
+#include "include/backend/visible.h"

 namespace mindspore {
 namespace profiler {
@ -107,8 +108,8 @@ class MemoryProfiling {
    return instance;
  }

-  std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
-  std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id) const;
+  BACKEND_EXPORT std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
+  BACKEND_EXPORT std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id) const;
  void SetDeviceMemSize(uint64_t size) { device_mem_size_ = size; }
  bool MemoryToPB();
  void SaveMemoryProfiling();
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc
@ -439,7 +439,7 @@ void CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_p

  UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
  auto op_name = common::AnfAlgo::GetCNodeName(anf_node);
-  if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+  if (IsOneOfComputeDepend(op_name)) {
    shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
  }
  ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset, shape_type);
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc
@ -39,7 +39,7 @@ DynamicAicpuOpKernelMod::DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr)
  auto cnode = anf_node_ptr->cast<CNodePtr>();
  if (cnode != nullptr) {
    auto op_name = common::AnfAlgo::GetCNodeName(cnode);
-    if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+    if (IsOneOfComputeDepend(op_name)) {
      unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_COMPUTE;
    }
  }
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc
@ -47,7 +47,7 @@ bool AscendKernelMod::IsNeedRetrieveOutputShape() {
  MS_EXCEPTION_IF_NULL(cnode);

  auto op_name = common::AnfAlgo::GetCNodeName(cnode);
-  if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+  if (IsOneOfComputeDepend(op_name)) {
    is_need_retrieve_output_shape_ = true;
  }
  return is_need_retrieve_output_shape_;
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc
@ -166,8 +166,7 @@ void SingleTbeJsonCreator::GenInputDescJson(const AnfNodePtr &anf_node, size_t r
  auto def_format = TbeJsonUtils::IsNeedChangeDefaultFormat(anf_node) ? kOpFormat_NCDHW : kOpFormat_NCHW;
  auto format = AnfAlgo::GetInputFormat(anf_node, real_input_index);
  format = TbeAdapter::FormatPass(format, ori_shape.size());
-  format =
-    (def_format == kOpFormat_NCDHW && k3DFormatSet.find(format) == k3DFormatSet.end()) ? kOpFormat_NCDHW : format;
+  format = (def_format == kOpFormat_NCDHW && !IsOneOf3DFormat(format)) ? kOpFormat_NCDHW : format;
  auto d_type = AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index);
  (*input_desc)[kJDtype] = tbe::TypeIdToString(d_type);
  (*input_desc)[kJDataType] = GetJsonValue<std::string>(*input_desc, kJDtype);
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc
@ -426,8 +426,7 @@ void TbeJsonCreator::GenDescJson(const AnfNodePtr &anf_node, size_t node_out_idx
  auto format = AnfAlgo::GetOutputFormat(anf_node, node_out_idx);
  format = tbe::TbeAdapter::FormatPass(format, ori_shape.size());
  auto def_format = TbeJsonUtils::IsNeedChangeDefaultFormat(anf_node) ? kOpFormat_NCDHW : kOpFormat_NCHW;
-  format =
-    (def_format == kOpFormat_NCDHW && k3DFormatSet.find(format) == k3DFormatSet.end()) ? kOpFormat_NCDHW : format;
+  format = (def_format == kOpFormat_NCDHW && !IsOneOf3DFormat(format)) ? kOpFormat_NCDHW : format;

  (*output_desc)[kJDataType] = tbe::TypeIdToString(AnfAlgo::GetOutputDeviceDataType(anf_node, node_out_idx));
  (*output_desc)[kJDtype] = GetJsonValue<std::string>(*output_desc, kJDataType);
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc
@ -70,12 +70,12 @@ std::vector<int64_t> HostCheck::GetFinalInferShape(const AnfNodePtr &node, const
  }

  auto temp_shape = infer_shape;
-  if (kNoPaddingFormatSet.find(format) == kNoPaddingFormatSet.end() && format != kOpFormat_FRACTAL_ZN_LSTM &&
-      infer_shape.size() < kShape4dDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (!IsOneOfNoPaddingFormat(format) && format != kOpFormat_FRACTAL_ZN_LSTM && infer_shape.size() < kShape4dDims &&
+      !IsOneOf3DFormat(format)) {
    MS_LOG(DEBUG) << "Get Device Shape using a shape size is less than 4 ,should be Padding shape by Default firstly";
    temp_shape = trans::PaddingShapeTo4dDefault(infer_shape, node);
  }
-  if (infer_shape.size() != kNcdhwShapeSize && k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (infer_shape.size() != kNcdhwShapeSize && IsOneOf3DFormat(format)) {
    temp_shape = trans::PaddingShapeTo5dDefault(infer_shape, node);
  }
  return temp_shape;
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc
@ -41,7 +41,6 @@ constexpr auto kPrefixOutput = "output";
 constexpr char kParamTypeDynamic[] = "dynamic";
 constexpr char kParamTypeRequre[] = "required";
 constexpr char kParamTypeOptional[] = "optional";
-mindspore::HashMap<std::string, std::vector<std::shared_ptr<KernelBuildInfo>>> TbeKernelSelect::select_cache_ = {};

 void TbeMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list) {
  auto tbe_selecter = TbeKernelSelect(kernel_node, kernel_info_list);
@ -236,7 +235,7 @@ void TbeKernelSelect::GetAgnosticPatternKernelInfo(const OpInfo &op_info) {
    MS_LOG(EXCEPTION) << "AgnosticPattern only support one input.";
  }
  auto format = AnfAlgo::GetPrevNodeOutputFormat(cnode_ptr_, 0);
-  if (kOpFormatList.find(format) == kOpFormatList.end()) {
+  if (!IsOneOfFormat(format)) {
    MS_LOG(INFO) << "Got the unknown format " << format;
    format = kOpFormat_DEFAULT;
  }
@ -332,7 +331,7 @@ bool TbeKernelSelect::IsShapeMatchFormat(const ShapeVector &shape, const std::st
  }
  static const std::set<std::string> kServerNotSupportFormat = {kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04};
  // if format is default, it remarkes support all format
-  if (kOpFormatList.find(format) == kOpFormatList.end()) {
+  if (!IsOneOfFormat(format)) {
    MS_LOG(EXCEPTION) << "Got the unknown format " << format;
  }
  // server not support format with C04 suffix
@ -346,7 +345,7 @@ bool TbeKernelSelect::IsShapeMatchFormat(const ShapeVector &shape, const std::st
  }
  // not support format:
  // 1 3d formats with shape size > 5
-  if (k3DFormatSet.find(format) != k3DFormatSet.end() && shape.size() > kShape5dDims) {
+  if (IsOneOf3DFormat(format) && shape.size() > kShape5dDims) {
    return false;
  }
  return true;
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h
@ -80,7 +80,7 @@ class TbeKernelSelect {
  nlohmann::json kernel_json;
  std::string kernel_hash_name;
  bool check_cnode;
-  static mindspore::HashMap<std::string, std::vector<std::shared_ptr<KernelBuildInfo>>> select_cache_;
+  inline static mindspore::HashMap<std::string, std::vector<std::shared_ptr<KernelBuildInfo>>> select_cache_ = {};
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc
@ -20,6 +20,7 @@
 #include "include/common/utils/anfalgo.h"
 #include "include/common/utils/comm_manager.h"
 #include "include/common/utils/parallel_context.h"
+#include "runtime/graph_scheduler/graph_compiler.h"
 #include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
 #include "plugin/device/ascend/optimizer/ascend_comm_op_reuse.h"

--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h
@ -29,6 +29,7 @@
 #include "plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h"

 namespace mindspore {
+const std::set<TypeId> kFloatDataTypeSet = {kNumberTypeFloat16, kNumberTypeFloat32};
 namespace opt {
 class KernelSelect {
 public:
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc
@ -32,7 +32,7 @@ const AnfNodePtr AddIoFormatAttrFor3DGraph::Process(const FuncGraphPtr &func_gra
    common::AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
    auto formats = AnfAlgo::GetAllOutputFormats(node);
    if (std::any_of(formats.begin(), formats.end(),
-                    [](const std::string &format) { return k3DFormatSet.find(format) != k3DFormatSet.end(); })) {
+                    [](const std::string &format) { return IsOneOf3DFormat(format); })) {
      common::AnfAlgo::SetNodeAttr(kAttrFormat, MakeValue(kOpFormat_NCDHW), node);
    }
    return node;
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc
@ -28,6 +28,9 @@
 namespace mindspore {
 namespace opt {
 namespace {
+const std::set<std::string> kDefaultCompatibleFormat = {kOpFormat_ND, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_HWCN,
+                                                        kOpFormat_NCDHW};
+
 bool CheckFormatForConsistency(const CNodePtr &node, const size_t input_index) {
  MS_EXCEPTION_IF_NULL(node);
  // get prior node's device output format
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc
@ -15,7 +15,6 @@
 */

 #include "plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.h"
-
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
 #include "kernel/kernel_build_info.h"
@ -67,7 +66,7 @@ void RectifyDoMaskKernelInfo::RectifyKernelInfo(const std::vector<CNodePtr> &do_
  std::string convert_format;
  for (const auto &do_mask : do_mask_node_list) {
    auto do_mask_data_format = AnfAlgo::GetInputFormat(do_mask, 0);
-    if (special_format.empty() && kHWSpecialFormatSet.find(do_mask_data_format) != kHWSpecialFormatSet.end()) {
+    if (special_format.empty() && IsOneOfHWSpecialFormat(do_mask_data_format)) {
      special_format = do_mask_data_format;
    }
    if (format_counter.find(do_mask_data_format) == format_counter.end()) {
@ -99,7 +98,7 @@ std::string RectifyDoMaskKernelInfo::GetConvertFormat(const std::map<std::string
    if (counter < iter.second) {
      convert_format = iter.first;
      counter = iter.second;
-    } else if (counter == iter.second && kHWSpecialFormatSet.find(iter.first) != kHWSpecialFormatSet.end()) {
+    } else if (counter == iter.second && IsOneOfHWSpecialFormat(iter.first)) {
      convert_format = iter.first;
    }
  }
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/trans_op_format_refine.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/trans_op_format_refine.cc
@ -39,13 +39,13 @@ const AnfNodePtr TransOpFormatRefine::Process(const FuncGraphPtr &func_graph, co
    auto builder =
      std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
    MS_EXCEPTION_IF_NULL(builder);
-    if (in_format == kOpFormat_DEFAULT && k3DFormatSet.find(out_format) != k3DFormatSet.end()) {
+    if (in_format == kOpFormat_DEFAULT && IsOneOf3DFormat(out_format)) {
      builder->SetInputsFormat({kOpFormat_NCDHW});
      builder->SetOutputsFormat({out_format});
      AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get());
      common::AnfAlgo::SetNodeAttr(kAttrSrcFormat, MakeValue(kOpFormat_NCDHW), node);
    }
-    if (out_format == kOpFormat_DEFAULT && k3DFormatSet.find(in_format) != k3DFormatSet.end()) {
+    if (out_format == kOpFormat_DEFAULT && IsOneOf3DFormat(in_format)) {
      builder->SetInputsFormat({in_format});
      builder->SetOutputsFormat({kOpFormat_NCDHW});
      AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get());
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc
@ -30,6 +30,8 @@

 namespace mindspore::opt {
 namespace {
+const std::set<std::string> kNodeWithSeedOperators = {kGammaOpName,          kPoissonOpName,    kStandardLaplaceOpName,
+                                                      kStandardNormalOpName, kUniformIntOpName, kUniformRealOpName};
 tensor::TensorPtr CreateTensor(int64_t seed) {
  // 1 create seed tensor
  std::vector<int64_t> indices_shape = {1};
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc
@ -14,10 +14,8 @@
 * limitations under the License.
 */
 #include "plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.h"
-#include "backend/common/optimizer/helper.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
-#include "include/common/utils/anfalgo.h"
-#include "utils/trace_base.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 namespace mindspore {
 namespace opt {
 const BaseRef AdamApplyOneFusion::DefinePattern() const {
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc
@ -15,6 +15,7 @@
 */
 #include "plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 #include "include/common/utils/anfalgo.h"
 #include "ir/primitive.h"
 #include "backend/common/optimizer/helper.h"
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc
@ -21,6 +21,7 @@
 #include "include/common/utils/utils.h"
 #include "backend/common/optimizer/helper.h"
 #include "mindspore/core/ops/core_ops.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"

 namespace mindspore {
 namespace opt {
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc
@ -16,6 +16,7 @@
 #include "plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.h"
 #include <utility>
 #include "backend/common/session/anf_runtime_algorithm.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 #include "include/common/utils/anfalgo.h"
 #include "frontend/optimizer/opt.h"
 #include "utils/trace_base.h"
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc
@ -19,7 +19,7 @@
 #include <string>
 #include <tuple>
 #include <utility>
-
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
 #include "frontend/optimizer/opt.h"
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc
@ -16,6 +16,7 @@
 #include "plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.h"
 #include <vector>
 #include "backend/common/optimizer/helper.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 #include "utils/trace_base.h"
 namespace mindspore {
 namespace opt {
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc
@ -17,7 +17,7 @@

 #include <memory>
 #include <vector>
-
+#include "plugin/device/ascend/optimizer/ascend_helper.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
 #include "ir/primitive.h"
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc
@ -19,6 +19,7 @@
 #include <algorithm>
 #include "include/common/utils/utils.h"
 #include "mindspore/core/ops/core_ops.h"
+#include "plugin/device/ascend/optimizer/ascend_helper.h"

 namespace mindspore {
 namespace opt {
--- a/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc
@ -439,7 +439,7 @@ std::pair<std::string, ExceptionType> SetKernelInfoWithMsg(const CNodePtr &kerne
  const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel_node);
  if (IsPrimitiveCNode(kernel_node, prim::kPrimCustom)) {
    auto tp = common::AnfAlgo::GetNodeAttr<std::string>(kernel_node, kAttrFuncType);
-    if (kCustomTypeAkg.find(tp) != kCustomTypeAkg.end()) {
+    if (IsOneOfCustomAkgType(tp)) {
      UpdateCustomKernelBuildInfo(kernel_node, true);
      return {};
    }
--- a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
@ -345,8 +345,7 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<A

  // Some CPU kernels can't initialize kernel and launch kernel in different thread, so reinitialize the kernels before
  // launch.
-  if (kOpNotSupportMultiThreadExecList.find(common::AnfAlgo::GetCNodeName(kernel)) !=
-      kOpNotSupportMultiThreadExecList.end()) {
+  if (IsOneOfNotSupportMultiThreadExec(common::AnfAlgo::GetCNodeName(kernel))) {
    auto cpu_kernel_mod = dynamic_cast<kernel::DeprecatedNativeCpuKernelMod *>(kernel_mod);
    MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
    cpu_kernel_mod->InitKernel(kernel);
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc
@ -89,8 +89,6 @@ std::vector<KernelAttr> NativeCpuKernelMod::GetSupportFromOpLib(const std::strin
  return support_kernel_attrs;
 }

-mindspore::HashMap<std::string, std::vector<KernelAttr>> NativeCpuKernelMod::support_map_{};
-
 int DeprecatedNativeCpuKernelMod::Resize(const BaseOperatorPtr &base_operator,
                                         const std::vector<KernelTensorPtr> &inputs,
                                         const std::vector<KernelTensorPtr> &outputs,
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h
@ -169,7 +169,7 @@ class BACKEND_EXPORT NativeCpuKernelMod : public CpuKernelMod {
 private:
  std::vector<KernelAttr> GetAllSupportedList(const std::string &kernel_name);
  std::vector<KernelAttr> GetSupportFromOpLib(const std::string &kernel_name) const;
-  static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_;
+  inline static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_;
 };

 class BACKEND_EXPORT DeprecatedNativeCpuKernelMod : public NativeCpuKernelMod {
--- a/mindspore/ccsrc/plugin/device/gpu/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/CMakeLists.txt
@ -0,0 +1,62 @@
+# Directory-scoped header search paths, followed by the kernel/cuda_impl subdirectory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_BINARY_DIR})
+include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
+add_subdirectory(kernel/cuda_impl)
+
+########### mindspore_gpu.so #####
+# Component directories (relative to this file) that are added and scanned for object libraries.
+set(GPU_SUB_COMP
+        hal/device
+        hal/hardware
+        hal/profiler
+        kernel
+        optimizer)
+
+if(ENABLE_GPU)
+    # Add every component directory; when it defines the matching object library,
+    # collect its objects for mindspore_gpu and make it depend on proto_input.
+    foreach(g_comp ${GPU_SUB_COMP})
+        add_subdirectory(${g_comp})
+        # "hal/device" -> "hal_device", so the path can be embedded in a target name.
+        string(REPLACE "/" "_" sub ${g_comp})
+        set(gpu_comp_obj _mindspore_plugin_device_gpu_${sub}_obj)
+        if(TARGET ${gpu_comp_obj})
+            add_dependencies(${gpu_comp_obj} proto_input)
+            list(APPEND GPU_SUB_OBJECTS_SRC $<TARGET_OBJECTS:${gpu_comp_obj}>)
+        endif()
+    endforeach()
+endif()
+
+# Shared library assembled from the component objects gathered in GPU_SUB_OBJECTS_SRC.
+add_library(mindspore_gpu SHARED ${GPU_SUB_OBJECTS_SRC})
+set_property(TARGET mindspore_gpu PROPERTY INSTALL_RPATH $ORIGIN)
+# Only mindspore_backend_common is PUBLIC; everything after PRIVATE is an implementation dependency.
+# -Wl,--no-as-needed sits directly ahead of mindspore::event_core and ps_cache.
+target_link_libraries(mindspore_gpu
+        PUBLIC mindspore_backend_common
+        PRIVATE mindspore_core mindspore_common proto_input mindspore::protobuf
+        securec
+        mindspore::dnnl mindspore::mkldnn nnacl
+        mindspore::ssl mindspore::crypto
+        mindspore::event mindspore::event_pthreads mindspore::event_openssl
+        -Wl,--no-as-needed mindspore::event_core ps_cache)
+
+if(ENABLE_GPU)
+    message("add gpu lib to mindspore_gpu")
+    # CUDA-side dependencies: the cuda_ops target, the libraries named by the
+    # CUBLAS/CUDNN path variables, and shared objects located under CUDA_PATH.
+    target_link_libraries(mindspore_gpu
+            PRIVATE
+            cuda_ops
+            ${CUBLAS_LIBRARY_PATH}
+            ${CUDA_PATH}/lib64/libcurand.so
+            ${CUDNN_LIBRARY_PATH}
+            ${CUDA_PATH}/lib64/libcudart.so
+            ${CUDA_PATH}/lib64/stubs/libcuda.so
+            ${CUDA_PATH}/lib64/libcusolver.so
+            ${CUDA_PATH}/lib64/libcufft.so
+            ${CUDA_PATH}/lib64/libcusparse.so
+    )
+endif()
+
+# debugger: link grpc, only when both the debugger and the GPU backend are enabled
+if(ENABLE_DEBUGGER AND ENABLE_GPU)
+    target_link_libraries(mindspore_gpu PRIVATE -Wl,--no-as-needed mindspore::grpc++)
+endif()
+
+# Install RPATH of the MPI-related targets, set only when both GPU and MPI are enabled.
+if(ENABLE_GPU AND ENABLE_MPI)
+    set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
+    set_target_properties(nvidia_collective gpu_collective PROPERTIES INSTALL_RPATH ${ORIGIN_PATH})
+endif()
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt
@ -3,11 +3,29 @@ if("${ENABLE_HIDDEN}" STREQUAL "OFF")
    string(REPLACE " -fvisibility=hidden" " -fvisibility=default" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()

+list(APPEND DEVICE_SRC_LIST "ps/gpu_ps_cache.cc")
 if(ENABLE_GPU)
-    list(APPEND DEVICE_SRC_LIST "distribution/collective_init.cc")
-    list(APPEND DEVICE_SRC_LIST "gpu_comm_manager.cc")
-else()
-    list(APPEND DEVICE_SRC_LIST "distribution/collective_fake_init.cc")
+    list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/common/mem_reuse/mem_reuse.cc)
+    list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/common/mem_reuse/mem_swap_manager.cc)
+    list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/data_queue/data_queue.h)
+    file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")  # NOTE(review): file(GLOB_RECURSE) assigns DEVICE_SRC_LIST, so entries appended before this line are discarded - confirm intent
+    list(REMOVE_ITEM DEVICE_SRC_LIST
+            "mpi/mpi_initializer.cc"
+            "distribution/collective_wrapper.cc"
+            "distribution/mpi_wrapper.cc"
+            "distribution/nccl_wrapper.cc"
+            "trt_loader.cc")  # trt_loader.cc is appended again below when TENSORRT_HOME is non-empty
+    if(NOT ${TENSORRT_HOME} STREQUAL "")  # NOTE(review): unquoted expansion; "${TENSORRT_HOME}" would be safer when the variable is empty - confirm
+        find_path(TENSORRT_HOME_INCLUDE NvInfer.h HINTS ${TENSORRT_HOME}/include)
+        if(TENSORRT_HOME_INCLUDE STREQUAL TENSORRT_HOME_INCLUDE-NOTFOUND)
+            message(FATAL_ERROR "Tensor-RT dir not exist ${TENSORRT_HOME}")
+        endif()
+        message("Enable GPU inference. Tensor-RT include dir: ${TENSORRT_HOME_INCLUDE}")
+        set(ENABLE_GPU_INFER TRUE)
+        add_compile_definitions(ENABLE_GPU_INFER)  # directory-scoped definition
+        include_directories(${TENSORRT_HOME_INCLUDE})
+        list(APPEND DEVICE_SRC_LIST ${CMAKE_CURRENT_SOURCE_DIR}/trt_loader.cc)
+    endif()
 endif()

 if(ENABLE_GPU)
@ -19,8 +37,6 @@ if(ENABLE_GPU)
    endif()

    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.cu")
-
-    #set(GPU_QUEUE_SRCS "blocking_queue.cc" "gpu_buffer_mgr.cc" "data_queue.cc")
    set(GPU_COLLECTIVE_SRCS "distribution/collective_wrapper.cc"
                            "distribution/mpi_wrapper.cc"
                            "distribution/nccl_wrapper.cc")
@ -36,8 +52,6 @@ if(ENABLE_GPU)
        target_link_libraries(gpu_collective PRIVATE mindspore::ompi mindspore::nccl)
        target_link_libraries(_ms_mpi PRIVATE gpu_collective)
    endif()
-
-    # add_library(_mindspore_device_cuda_obj OBJECT ${CUDA_SRC_LIST})
 endif()

 set_property(SOURCE ${DEVICE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h
@ -24,10 +24,11 @@
 #include <functional>
 #include "runtime/data_queue/data_queue.h"
 #include "runtime/hardware/device_context_manager.h"
+#include "include/backend/visible.h"

 namespace mindspore {
 namespace device {
-class GpuDataQueueDynamic : public DataQueue {
+class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue {
 public:
  explicit GpuDataQueueDynamic(const size_t capacity);
  virtual ~GpuDataQueueDynamic() = default;
@ -49,7 +50,7 @@ class GpuDataQueueDynamic : public DataQueue {
  std::unique_ptr<NodeInfo[]> node_info_;
 };

-class GpuQueue : public DataQueue {
+class BACKEND_EXPORT GpuQueue : public DataQueue {
 public:
  GpuQueue(void *addr, const std::vector<size_t> &shape, const size_t &capacity);
  virtual ~GpuQueue();
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
@ -193,7 +193,7 @@ bool SelectCustomKernel(const CNodePtr &kernel_node, const std::shared_ptr<Kerne
      kernel::Factory<kernel::NativeGpuKernelMod>::Instance().Register(
        op_name, []() { return std::make_shared<kernel::CustomAOTGpuKernelMod>(); });
    }
-  } else if (kCustomTypeAkg.find(func_type) != kCustomTypeAkg.end()) {
+  } else if (IsOneOfCustomAkgType(func_type)) {
    *kernel_type = KernelType::AKG_KERNEL;
  } else {
    MS_LOG(EXCEPTION) << "Unsupported func type [" << func_type << "] for Custom op [" << op_name << "] on GPU";
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.cc
@ -14,7 +14,7 @@
 * limitations under the License.
 */

-#include "ps/ps_cache/gpu/gpu_ps_cache.h"
+#include "plugin/device/gpu/hal/device/ps/gpu_ps_cache.h"
 #include "ps/ps_cache/ps_cache_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.h
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.h
@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_
-#define MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_

 #include <cuda_runtime_api.h>
 #include <memory>
@ -47,4 +47,4 @@ class GPUPsCache : public PsCacheBasic {
 }  // namespace gpu
 }  // namespace ps
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_
--- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc
@ -659,8 +659,7 @@ KernelGraphPtr GPUSession::BuildOpImpl(const BackendOpRunInfoPtr &op_run_info, c
                                       const std::vector<int64_t> &tensors_mask) {
  // Check if the graph cache exists.
  auto it = run_op_graphs_.find(graph_info);
-  if (it != run_op_graphs_.end() &&
-      kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) == kOpCacheBlackList.end()) {
+  if (it != run_op_graphs_.end() && !IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name)) {
    return it->second;
  }

@ -718,7 +717,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, const BackendOpRunInfoPt
    UpdateOutputAbstract(kernel_graph, op_run_info);
  }
  RunOpClearMemory(kernel_graph.get());
-  if (kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) != kOpCacheBlackList.end()) {
+  if (IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name)) {
    run_op_graphs_.erase(graph_info);
  }
 }
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt
@ -3,8 +3,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
 endif()

 if(ENABLE_GPU)
-    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cu")
-
    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
    file(GLOB_RECURSE _AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "akg/*.cc")
    list(REMOVE_ITEM GPU_SRC_LIST ${_AKG_SRC_LIST})
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
@ -12,11 +12,12 @@ endif()

 set_property(SOURCE ${CUDA_OPS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
 if(ENABLE_GPU)
-    add_library(cuda_common_obj OBJECT cuda_common.cc  cuda_device_info.cc)
+    add_library(cuda_common_obj OBJECT cuda_ops/cuda_common.cc  cuda_ops/cuda_device_info.cc)
    target_compile_options(cuda_common_obj PRIVATE "-std=c++17")
    cuda_add_library(cuda_ops SHARED ${CUDA_OPS_SRC_LIST} $<TARGET_OBJECTS:cuda_common_obj>)
    message("add gpu lib to cuda_ops")
-    target_link_libraries(cuda_ops mindspore_core cublas
+    target_link_libraries(cuda_ops mindspore_core
+            ${CUBLAS_LIBRARY_PATH}
            ${CUDA_PATH}/lib64/libcurand.so
            ${CUDNN_LIBRARY_PATH}
            ${CUDA_PATH}/lib64/libcudart.so
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu
@ -69,10 +69,11 @@ __global__ void NormCalHighPrecisionKernel(const float *middle_output, T *output
 }

 template <>
-void CalLpNorm<float>(const float *input, const size_t *input_shape, size_t input_shape_length, size_t input_elements,
-                      const size_t *output_axis, const size_t *output_stride, size_t output_shape_length,
-                      size_t output_elements, float p, float eps, float *middle_output, float *output,
-                      const uint32_t &device_id, cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void CalLpNorm<float>(const float *input, const size_t *input_shape, size_t input_shape_length,
+                                      size_t input_elements, const size_t *output_axis, const size_t *output_stride,
+                                      size_t output_shape_length, size_t output_elements, float p, float eps,
+                                      float *middle_output, float *output, const uint32_t &device_id,
+                                      cudaStream_t cuda_stream) {
  LpCalKernel<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_shape, input_shape_length, input_elements, output_axis, output_stride, output_shape_length, p, eps,
    output);
@ -81,10 +82,11 @@ void CalLpNorm<float>(const float *input, const size_t *input_shape, size_t inpu
 }

 template <>
-void CalLpNorm<half>(const half *input, const size_t *input_shape, size_t input_shape_length, size_t input_elements,
-                     const size_t *output_axis, const size_t *output_stride, size_t output_shape_length,
-                     size_t output_elements, float p, float eps, float *middle_output, half *output,
-                     const uint32_t &device_id, cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void CalLpNorm<half>(const half *input, const size_t *input_shape, size_t input_shape_length,
+                                     size_t input_elements, const size_t *output_axis, const size_t *output_stride,
+                                     size_t output_shape_length, size_t output_elements, float p, float eps,
+                                     float *middle_output, half *output, const uint32_t &device_id,
+                                     cudaStream_t cuda_stream) {
  LpCalKernel<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_shape, input_shape_length, input_elements, output_axis, output_stride, output_shape_length, p, eps,
    middle_output);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu
@ -133,9 +133,9 @@ __global__ void MultiMarginLoss_backward_kernel_half(half *gradInput, const half

 // namespace str
 template <typename T>
-void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *output_grad,
-                         const T *input, const int64_t *target, const T *weight, T *output, const uint32_t &device_id,
-                         cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim,
+                                         const T *output_grad, const T *input, const int64_t *target, const T *weight,
+                                         T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
  dim3 blocks1(nframe);
  dim3 threads1(MULTIMARGIN_THREADS);
  bool reduce = false;
@ -158,9 +158,10 @@ void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe,

 // namespace str
 template <>
-void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *output_grad,
-                         const half *input, const int64_t *target, const half *weight, half *output,
-                         const uint32_t &device_id, cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim,
+                                         const half *output_grad, const half *input, const int64_t *target,
+                                         const half *weight, half *output, const uint32_t &device_id,
+                                         cudaStream_t cuda_stream) {
  dim3 blocks1(nframe);
  dim3 threads1(MULTIMARGIN_THREADS);
  bool reduce = false;
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu
@ -286,9 +286,9 @@ __global__ void MultiMarginLossReduceKernel(int dim, T *output) {

 // namespace str
 template <typename T>
-void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *input,
-                     const int64_t *target, const T *weight, T *output, const uint32_t &device_id,
-                     cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *input,
+                                     const int64_t *target, const T *weight, T *output, const uint32_t &device_id,
+                                     cudaStream_t cuda_stream) {
  dim3 blocks(nframe);
  dim3 threads(MULTIMARGIN_THREADS);
  bool sizeAverage = false;
@ -311,9 +311,9 @@ void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int

 // namespace str
 template <>
-void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *input,
-                     const int64_t *target, const half *weight, half *output, const uint32_t &device_id,
-                     cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *input,
+                                     const int64_t *target, const half *weight, half *output, const uint32_t &device_id,
+                                     cudaStream_t cuda_stream) {
  dim3 blocks(nframe);
  dim3 threads(128);
  bool sizeAverage = false;
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu
@ -58,8 +58,7 @@ __global__ void CalNormValFun1(const Complex<double> *input, size_t input_elemen
 }

 __global__ void CalNormValFun2(float *norm_value, int p, size_t axis_size, float max_norm) {
-  for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (axis_size);
-       index += blockDim.x * gridDim.x) {
+  for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (axis_size); index += blockDim.x * gridDim.x) {
    float temp = pow(norm_value[index], static_cast<float>(1.0 / p));
    if (temp > max_norm) {
      norm_value[index] = max_norm / temp;
@ -84,61 +83,61 @@ __global__ void CalNormValFun3(const T *input, size_t inner_size, size_t axis_si
 }

 template <>
-void CalRenorm<half>(const half *input, size_t input_elements, size_t inner_size, size_t axis_size, int p,
-                     float *norm_value, half *output, const uint32_t &device_id, cudaStream_t cuda_stream,
-                     float max_norm) {
+CUDA_LIB_EXPORT void CalRenorm<half>(const half *input, size_t input_elements, size_t inner_size, size_t axis_size,
+                                     int p, float *norm_value, half *output, const uint32_t &device_id,
+                                     cudaStream_t cuda_stream, float max_norm) {
  CalNormValFun1<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_elements, inner_size, axis_size, p, norm_value);
-  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
-    norm_value, p, axis_size, max_norm);
+  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(norm_value, p,
+                                                                                                 axis_size, max_norm);
  CalNormValFun3<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, inner_size, axis_size, input_elements, output, norm_value);
 }

 template <>
-void CalRenorm<float>(const float *input, size_t input_elements, size_t inner_size, size_t axis_size, int p,
-                      float *norm_value, float *output, const uint32_t &device_id, cudaStream_t cuda_stream,
-                      float max_norm) {
+CUDA_LIB_EXPORT void CalRenorm<float>(const float *input, size_t input_elements, size_t inner_size, size_t axis_size,
+                                      int p, float *norm_value, float *output, const uint32_t &device_id,
+                                      cudaStream_t cuda_stream, float max_norm) {
  CalNormValFun1<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_elements, inner_size, axis_size, p, norm_value);
-  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
-    norm_value, p, axis_size, max_norm);
+  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(norm_value, p,
+                                                                                                 axis_size, max_norm);
  CalNormValFun3<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, inner_size, axis_size, input_elements, output, norm_value);
 }

 template <>
-void CalRenorm<double>(const double *input, size_t input_elements, size_t inner_size, size_t axis_size, int p,
-                       float *norm_value, double *output, const uint32_t &device_id, cudaStream_t cuda_stream,
-                       float max_norm) {
+CUDA_LIB_EXPORT void CalRenorm<double>(const double *input, size_t input_elements, size_t inner_size, size_t axis_size,
+                                       int p, float *norm_value, double *output, const uint32_t &device_id,
+                                       cudaStream_t cuda_stream, float max_norm) {
  CalNormValFun1<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_elements, inner_size, axis_size, p, norm_value);
-  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
-    norm_value, p, axis_size, max_norm);
+  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(norm_value, p,
+                                                                                                 axis_size, max_norm);
  CalNormValFun3<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, inner_size, axis_size, input_elements, output, norm_value);
 }

 template <>
-void CalRenorm<Complex<float>>(const Complex<float> *input, size_t input_elements, size_t inner_size, size_t axis_size,
-                               int p, float *norm_value, Complex<float> *output, const uint32_t &device_id,
-                               cudaStream_t cuda_stream, float max_norm) {
+CUDA_LIB_EXPORT void CalRenorm<Complex<float>>(const Complex<float> *input, size_t input_elements, size_t inner_size,
+                                               size_t axis_size, int p, float *norm_value, Complex<float> *output,
+                                               const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) {
  CalNormValFun1<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_elements, inner_size, axis_size, p, norm_value);
-  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
-    norm_value, p, axis_size, max_norm);
+  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(norm_value, p,
+                                                                                                 axis_size, max_norm);
  CalNormValFun3<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, inner_size, axis_size, input_elements, output, norm_value);
 }

 template <>
-void CalRenorm<Complex<double>>(const Complex<double> *input, size_t input_elements, size_t inner_size,
-                                size_t axis_size, int p, float *norm_value, Complex<double> *output,
-                                const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) {
+CUDA_LIB_EXPORT void CalRenorm<Complex<double>>(const Complex<double> *input, size_t input_elements, size_t inner_size,
+                                                size_t axis_size, int p, float *norm_value, Complex<double> *output,
+                                                const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) {
  CalNormValFun1<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, input_elements, inner_size, axis_size, p, norm_value);
-  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
-    norm_value, p, axis_size, max_norm);
+  CalNormValFun2<<<CUDA_BLOCKS(device_id, axis_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(norm_value, p,
+                                                                                                 axis_size, max_norm);
  CalNormValFun3<<<CUDA_BLOCKS(device_id, input_elements), CUDA_THREADS(device_id), 0, cuda_stream>>>(
    input, inner_size, axis_size, input_elements, output, norm_value);
 }
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu
@ -28,16 +28,16 @@ __global__ void ScaleGrad(const int nums, const T *x0, const S &x1, T *y) {
 }

 template <typename T, typename S>
-void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream) {
+CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream) {
  ScaleGrad<<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
  return;
 }

-template void ScaleGradKernel<float, float>(const int &nums, const float *x0, const float &x1, float *y,
-                                            cudaStream_t stream);
-template void ScaleGradKernel<float, half>(const int &nums, const float *x0, const half &x1, float *y,
-                                           cudaStream_t stream);
-template void ScaleGradKernel<half, float>(const int &nums, const half *x0, const float &x1, half *y,
-                                           cudaStream_t stream);
-template void ScaleGradKernel<half, half>(const int &nums, const half *x0, const half &x1, half *y,
-                                          cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScaleGradKernel<float, float>(const int &nums, const float *x0, const float &x1, float *y,
+                                                            cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScaleGradKernel<float, half>(const int &nums, const float *x0, const half &x1, float *y,
+                                                           cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScaleGradKernel<half, float>(const int &nums, const half *x0, const float &x1, half *y,
+                                                           cudaStream_t stream);
+template CUDA_LIB_EXPORT void ScaleGradKernel<half, half>(const int &nums, const half *x0, const half &x1, half *y,
+                                                          cudaStream_t stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh
@ -18,6 +18,8 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_SCALE_GRAD_IMPL_H_

 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+// Declared CUDA_LIB_EXPORT to agree with the definition and explicit instantiations in scale_grad_impl.cu.
 template <typename T, typename S>
-void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream);
+CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_SCALE_GRAD_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu
@ -39,9 +39,9 @@ __global__ void DiscountedReturnKernel(const int timestep, const int num_env, co
 }

 template <typename T>
-void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma,
-                      const T *reward, const bool *done, const T *last_value, T *discouted_return,
-                      cudaStream_t stream) {
+CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element,
+                                      const float &gamma, const T *reward, const bool *done, const T *last_value,
+                                      T *discouted_return, cudaStream_t stream) {
  // Every block process M element, 256 is a common tile size.
  const int element_per_step = num_env * num_element;
  const int element_per_block = std::min(256, element_per_step);
@ -51,9 +51,9 @@ void DiscountedReturn(const int &timestep, const int &num_env, const int &num_el
                                                                     done, last_value, discouted_return);
 }

-template void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma,
-                               const float *reward, const bool *done, const float *last_value, float *discouted_return,
-                               cudaStream_t stream);
-template void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma,
-                               const half *reward, const bool *done, const half *last_value, half *discouted_return,
-                               cudaStream_t stream);
+template CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element,
+                                               const float &gamma, const float *reward, const bool *done,
+                                               const float *last_value, float *discouted_return, cudaStream_t stream);
+template CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element,
+                                               const float &gamma, const half *reward, const bool *done,
+                                               const half *last_value, half *discouted_return, cudaStream_t stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh
@ -16,8 +16,9 @@

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_
-
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 template <typename T>
-void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma,
-                      const T *reward, const bool *done, const T *last_value, T *discouted_return, cudaStream_t stream);
+CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element,
+                                      const float &gamma, const T *reward, const bool *done, const T *last_value,
+                                      T *discouted_return, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh
@ -18,21 +18,22 @@
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMP_PRIORITY_REPLAY_BUFFER_IMPL_H_

 #include <curand_kernel.h>
-
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 struct SumTree {
  float sum;
  float min;
 };

-void SumTreeInit(SumTree *tree, float *max_priority, const size_t &capacity, cudaStream_t stream);
-void InitRandState(const size_t &batch_size, const uint64_t &seed, curandState *state, cudaStream_t stream);
-void SumTreePush(SumTree *tree, const float &alpha, const size_t &idx, const size_t &capacity, float *priority,
-                 float *max_priority, cudaStream_t stream);
-void SumTreeSample(SumTree *tree, curandState *state, const size_t &capacity, float *beta, const size_t &batch_size,
-                   size_t *indices, float *weights, cudaStream_t stream);
-void SumTreeUpdate(SumTree *tree, const size_t &capacity, const float &alpha, float *max_priority, size_t *indices,
-                   float *priorities, const size_t &batch_size, cudaStream_t stream);
-void FifoSlice(const uint8_t *input, const size_t *indice, uint8_t *output, size_t batch_size, size_t column,
-               cudaStream_t stream);
+CUDA_LIB_EXPORT void SumTreeInit(SumTree *tree, float *max_priority, const size_t &capacity, cudaStream_t stream);
+CUDA_LIB_EXPORT void InitRandState(const size_t &batch_size, const uint64_t &seed, curandState *state,
+                                   cudaStream_t stream);
+CUDA_LIB_EXPORT void SumTreePush(SumTree *tree, const float &alpha, const size_t &idx, const size_t &capacity,
+                                 float *priority, float *max_priority, cudaStream_t stream);
+CUDA_LIB_EXPORT void SumTreeSample(SumTree *tree, curandState *state, const size_t &capacity, float *beta,
+                                   const size_t &batch_size, size_t *indices, float *weights, cudaStream_t stream);
+CUDA_LIB_EXPORT void SumTreeUpdate(SumTree *tree, const size_t &capacity, const float &alpha, float *max_priority,
+                                   size_t *indices, float *priorities, const size_t &batch_size, cudaStream_t stream);
+CUDA_LIB_EXPORT void FifoSlice(const uint8_t *input, const size_t *indice, uint8_t *output, size_t batch_size,
+                               size_t column, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMP_PRIORITY_REPLAY_BUFFER_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh
@ -18,21 +18,24 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_
 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
-void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch,
-                  unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream);
-void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index,
-                   cudaStream_t cuda_stream);
-void ReMappingIndex(const int *count, const int *head, const int *origin_index, int *index, cudaStream_t cuda_stream);
-void BufferGetItem(const size_t size, const int *index, const size_t one_exp_len, const unsigned char *buffer,
-                   unsigned char *out, cudaStream_t cuda_stream);
-void CheckBatchSize(const int *count, const int *head, const size_t batch_size, const int64_t capacity,
-                    cudaStream_t cuda_stream);
-void BufferSample(const size_t size, const size_t one_element, const unsigned int *index, const unsigned char *buffer,
-                  unsigned char *out, cudaStream_t cuda_stream);
-void RandomGen(const int size, curandState *globalState, unsigned int *value, unsigned int *key, cudaStream_t stream);
-void RandInit(const int size, const int seed, curandState *state, cudaStream_t stream);
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+CUDA_LIB_EXPORT void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch,
+                                  unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index,
+                                   cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void ReMappingIndex(const int *count, const int *head, const int *origin_index, int *index,
+                                    cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void BufferGetItem(const size_t size, const int *index, const size_t one_exp_len,
+                                   const unsigned char *buffer, unsigned char *out, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void CheckBatchSize(const int *count, const int *head, const size_t batch_size, const int64_t capacity,
+                                    cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void BufferSample(const size_t size, const size_t one_element, const unsigned int *index,
+                                  const unsigned char *buffer, unsigned char *out, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void RandomGen(const int size, curandState *globalState, unsigned int *value, unsigned int *key,
+                               cudaStream_t stream);
+CUDA_LIB_EXPORT void RandInit(const int size, const int seed, curandState *state, cudaStream_t stream);

 template <typename T>
-void RandomGenUniform(const int size, curandState *globalState, const int up_bound, T *indexes,
-                      cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT void RandomGenUniform(const int size, curandState *globalState, const int up_bound, T *indexes,
+                                      cudaStream_t cuda_stream);
-#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh
@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_

 #include <curand_kernel.h>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

 constexpr int kFeatureNum = 4;
 constexpr int kPartiallyObsFeatureNum = 6;
@ -46,14 +47,16 @@ struct AgentState {
  int *time_step;
 };

-void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state,
-             cudaStream_t stream);
-void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, float *state,
-              cudaStream_t stream);
-void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
-                   const int *action, float *state, float *reward, bool *done, cudaStream_t stream);
-void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
-                    const int *action, float *state, float *reward, bool *done, float *team_reward, int *distance,
-                    cudaStream_t stream);
-void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src, cudaStream_t stream);
+CUDA_LIB_EXPORT void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state,
+                             cudaStream_t stream);
+CUDA_LIB_EXPORT void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting,
+                              AgentState *agent_state, float *state, cudaStream_t stream);
+CUDA_LIB_EXPORT void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting,
+                                   AgentState *agent_state, const int *action, float *state, float *reward, bool *done,
+                                   cudaStream_t stream);
+CUDA_LIB_EXPORT void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting,
+                                    AgentState *agent_state, const int *action, float *state, float *reward, bool *done,
+                                    float *team_reward, int *distance, cudaStream_t stream);
+CUDA_LIB_EXPORT void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src,
+                                    cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
-                     const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
-                     const float *angle_theta0, float *ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
+                                     const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
+                                     const float *angle_theta0, float *ene, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh
@ -19,7 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b,
-                 const int *atom_c, const float *angle_k, const float *angle_theta0, float *ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
+                                 const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0,
+                                 float *ene, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
-                const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, float *frc_f,
-                cudaStream_t stream);
+CUDA_LIB_EXPORT void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
+                                const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
+                                const float *angle_theta0, float *frc_f, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
-                              const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k,
-                              const float *angle_theta0, float *frc_f, float *ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f,
+                                              const float *scaler_f, const int *atom_a, const int *atom_b,
+                                              const int *atom_c, const float *angle_k, const float *angle_theta0,
+                                              float *frc_f, float *ene, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
-                    const int *atom_b, const float *bond_k, const float *bond_r0, float *atom_ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
+                                    const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
+                                    float *atom_ene, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_ATOM_ENERGY_GPU_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh
@ -19,9 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f,
-                const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
-                float *bond_ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f,
+                                const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k,
+                                const float *bond_r0, float *bond_ene, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_ENERGY_CUDA_GPU_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f,
-               const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
-               float *frc_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f,
+                               const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k,
+                               const float *bond_r0, float *frc_f, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_FORCE_CUDA_GPU_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh
@ -19,9 +19,11 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f,
-                                      const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k,
-                                      const float *bond_r0, float *frc_f, float *atom_energy, float *atom_v,
-                                      cudaStream_t stream);
+CUDA_LIB_EXPORT void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers,
+                                                      const unsigned int *uint_crd_f, const float *scaler_f,
+                                                      const int *atom_a, const int *atom_b, const float *bond_k,
+                                                      const float *bond_r0, float *frc_f, float *atom_energy,
+                                                      float *atom_v, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
-                             const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
-                             float *frc_f, float *atom_e, cudaStream_t stream);
+CUDA_LIB_EXPORT void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f,
+                                             const float *scaler_f, const int *atom_a, const int *atom_b,
+                                             const float *bond_k, const float *bond_r0, float *frc_f, float *atom_e,
+                                             cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
-                             const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0,
-                             float *frc_f, float *atom_v, cudaStream_t stream);
+CUDA_LIB_EXPORT void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f,
+                                             const float *scaler_f, const int *atom_a, const int *atom_b,
+                                             const float *bond_k, const float *bond_r0, float *frc_f, float *atom_v,
+                                             cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh
@ -18,9 +18,10 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_

 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f,
-                 const float *old_crd, float *nowarp_crd, int *box_map_times, float *box, float *g_radial,
-                 float *g_angular, cudaStream_t stream);
+CUDA_LIB_EXPORT void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f,
+                                 const float *old_crd, float *nowarp_crd, int *box_map_times, float *box,
+                                 float *g_radial, float *g_angular, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
-                  unsigned int *uint_crd_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
+                                  unsigned int *uint_crd_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
-                         unsigned int *uint_crd_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
+                                         unsigned int *uint_crd_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_QUARTER_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh
@ -19,8 +19,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass,
-                     float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass,
+                                     float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms,
-                         const float *crd_f, float *center_of_geometry_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse,
+                                         const int *center_atoms, const float *crd_f, float *center_of_geometry_f,
+                                         cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f,
-                     float *box_length_f, float *no_wrap_crd_f, float *crd_f, float* scaler, cudaStream_t stream);
+CUDA_LIB_EXPORT void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f,
+                                     float *box_length_f, float *no_wrap_crd_f, float *crd_f, float *scaler,
+                                     cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh
@ -19,7 +19,8 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
-                   const float *atom_mass, float *ek, cudaStream_t stream);
+CUDA_LIB_EXPORT void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
+                                   const float *atom_mass, float *ek, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh
@ -18,7 +18,9 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_

 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor, cudaStream_t stream);
+CUDA_LIB_EXPORT void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor,
+                                  cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh
@ -20,8 +20,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f, float *nowrap_crd_f,
-                        cudaStream_t stream);
+CUDA_LIB_EXPORT void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f,
+                                        float *nowrap_crd_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_CAL_NO_WRAP_CRD_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh
@ -20,8 +20,9 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f,
-                         int *box_map_times_f, cudaStream_t stream);
+CUDA_LIB_EXPORT void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f,
+                                         int *box_map_times_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_REFRESH_BOXMAPTIMES_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh
@ -19,9 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f,
-                        const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn,
-                        const float *pk, const float *gamc, const float *gams, const float *pn, float *ene,
-                        cudaStream_t stream);
+CUDA_LIB_EXPORT void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f,
+                                        const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c,
+                                        const int *atom_d, const int *ipn, const float *pk, const float *gamc,
+                                        const float *gams, const float *pn, float *ene, cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh
@ -19,8 +19,10 @@

 #include <curand_kernel.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"

-void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a,
-                    const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, const float *pk,
-                    const float *gamc, const float *gams, const float *pn, float *ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f,
+                                    const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d,
+                                    const int *ipn, const float *pk, const float *gamc, const float *gams,
+                                    const float *pn, float *ene, cudaStream_t stream);
 #endif
--- a/Show More
+++ b/Show More