diff --git a/cmake/package.cmake b/cmake/package.cmake
index 979339e0065..dff5ccf3e80 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -95,6 +95,22 @@ install(
     COMPONENT mindspore
 )
+if(ENABLE_D)
+    install(
+        TARGETS mindspore_ascend
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
+if(ENABLE_GPU)
+    install(
+        TARGETS mindspore_gpu
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
 if(USE_GLOG)
     install(FILES ${glog_LIBPATH}/libmindspore_glog.so.0.4.0 DESTINATION ${INSTALL_LIB_DIR}
             RENAME libmindspore_glog.so.0 COMPONENT mindspore)
diff --git a/cmake/package_tar.cmake b/cmake/package_tar.cmake
index df4aeeff80a..d18150d120c 100644
--- a/cmake/package_tar.cmake
+++ b/cmake/package_tar.cmake
@@ -32,6 +32,22 @@ install(
     COMPONENT mindspore
 )
+if(ENABLE_D)
+    install(
+        TARGETS mindspore_ascend
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
+if(ENABLE_GPU)
+    install(
+        TARGETS mindspore_gpu
+        DESTINATION ${INSTALL_LIB_DIR}
+        COMPONENT mindspore
+    )
+endif()
+
 if(USE_GLOG)
     file(GLOB_RECURSE GLOG_LIB_LIST ${glog_LIBPATH}/libmindspore_glog*)
     install(
diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 62fe1e0ee74..d47e0f6b6be 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -14,9 +14,6 @@ set(FBS_FILES
 ms_build_flatbuffers(FBS_FILES ${CMAKE_CURRENT_SOURCE_DIR}../../schema generated_fbs_files ${SERVER_FLATBUFFER_OUTPUT})
 
 if(ENABLE_D OR ENABLE_ACL)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/lib64)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/lib64)
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/lib64)
     set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
     set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
     set(MINDSPORE_RPATH
@@ -24,9 +21,77 @@ if(ENABLE_D OR ENABLE_ACL)
     set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
 endif()
 
-if(ENABLE_D)
-    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/ascend/kernel/aicpu/aicpu_ops)
-    add_subdirectory(plugin/device/ascend/kernel/aicpu/aicpu_ops)
+if(ENABLE_GPU)
+    find_package(CUDA REQUIRED)
+    find_package(Threads)
+    if(${CUDA_VERSION} VERSION_LESS ${MS_REQUIRE_CUDA_VERSION})
+        message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, \
+                but only CUDA ${CUDA_VERSION} found.")
+    endif()
+    enable_language(CUDA)
+    if(NOT CUDA_PATH OR CUDA_PATH STREQUAL "")
+        if(DEFINED ENV{CUDA_HOME} AND NOT $ENV{CUDA_HOME} STREQUAL "")
+            set(CUDA_PATH $ENV{CUDA_HOME})
+        else()
+            set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
+        endif()
+    endif()
+
+    if(DEFINED ENV{CUDNN_HOME} AND NOT $ENV{CUDNN_HOME} STREQUAL "")
+        set(CUDNN_INCLUDE_DIR $ENV{CUDNN_HOME}/include)
+        set(CUDNN_LIBRARY_DIR $ENV{CUDNN_HOME}/lib64)
+        find_path(CUDNN_INCLUDE_PATH cudnn.h HINTS ${CUDNN_INCLUDE_DIR} NO_DEFAULT_PATH)
+        find_library(CUDNN_LIBRARY_PATH "cudnn" HINTS ${CUDNN_LIBRARY_DIR} NO_DEFAULT_PATH)
+        find_library(CUBLAS_LIBRARY_PATH "cublas" HINTS ${CUDNN_LIBRARY_DIR})
+        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
+            message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to \
+                cudnn installation position.")
+        endif()
+        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
+            message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to \
+                cudnn installation position.")
+        endif()
+    else()
+        list(APPEND CMAKE_PREFIX_PATH ${CUDA_TOOLKIT_ROOT_DIR})
+        find_path(CUDNN_INCLUDE_PATH cudnn.h PATH_SUFFIXES cuda/include include cuda)
+        find_library(CUDNN_LIBRARY_PATH "cudnn" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
+        find_library(CUBLAS_LIBRARY_PATH "cublas" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
+        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
+            message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put \
+                cudnn header file in cuda include path or user include path(eg. /usr/local/cuda/include; \
+                /usr/local/include; /usr/include), if cudnn library is installed in other position, please \
+                set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h \
+                in {CUDNN_HOME}/include.")
+        endif()
+        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
+            message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put \
+                cudnn library file in cuda library path or user library path(eg. /usr/local/cuda/lib64; \
+                /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib), if cudnn library is installed in other \
+                position, please set environment variable CUDNN_HOME to cudnn installation position, there should \
+                be cudnn library file in {CUDNN_HOME}/lib64.")
+        endif()
+    endif()
+
+    if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "")
+        set(CUPTI_INCLUDE_DIRS ${CUDA_PATH}/extras/CUPTI/include)
+    endif()
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${CUDA_PATH}/lib64)
+    message("CUDA_PATH: ${CUDA_PATH}")
+    message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
+    message("CUDNN_INCLUDE_PATH: ${CUDNN_INCLUDE_PATH}")
+    message("CUDNN_LIBRARY_PATH: ${CUDNN_LIBRARY_PATH}")
+    message("CUBLAS_LIBRARY_PATH: ${CUBLAS_LIBRARY_PATH}")
+    message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}")
+    include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
+
+    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
+    if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+        list(APPEND CUDA_NVCC_FLAGS -G)
+        message("CUDA_NVCC_FLAGS" ${CUDA_NVCC_FLAGS})
+    endif()
+    set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
+    add_compile_definitions(ENABLE_GPU)
 endif()
 
 if(ENABLE_CPU)
@@ -59,111 +124,6 @@ if(ENABLE_MPI)
     add_compile_definitions(ENABLE_MPI)
 endif()
 
-if(ENABLE_GPU)
-    find_package(CUDA REQUIRED)
-    find_package(Threads)
-    if(${CUDA_VERSION} VERSION_LESS ${MS_REQUIRE_CUDA_VERSION})
-        message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, \
-            but only CUDA ${CUDA_VERSION} found.")
-    endif()
-    enable_language(CUDA)
-    if(NOT CUDA_PATH OR CUDA_PATH STREQUAL "")
-        if(DEFINED ENV{CUDA_HOME} AND NOT $ENV{CUDA_HOME} STREQUAL "")
-            set(CUDA_PATH $ENV{CUDA_HOME})
-        else()
-            set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
-        endif()
-    endif()
-
-    if(DEFINED ENV{CUDNN_HOME} AND NOT $ENV{CUDNN_HOME} STREQUAL "")
-        set(CUDNN_INCLUDE_DIR $ENV{CUDNN_HOME}/include)
-        set(CUDNN_LIBRARY_DIR $ENV{CUDNN_HOME}/lib64)
-        find_path(CUDNN_INCLUDE_PATH cudnn.h HINTS ${CUDNN_INCLUDE_DIR} NO_DEFAULT_PATH)
-        find_library(CUDNN_LIBRARY_PATH "cudnn" HINTS ${CUDNN_LIBRARY_DIR} NO_DEFAULT_PATH)
-        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to \
-                cudnn installation position.")
-        endif()
-        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to \
-                cudnn installation position.")
-        endif()
-    else()
-        list(APPEND CMAKE_PREFIX_PATH ${CUDA_TOOLKIT_ROOT_DIR})
-        find_path(CUDNN_INCLUDE_PATH cudnn.h PATH_SUFFIXES cuda/inclulde include cuda)
-        find_library(CUDNN_LIBRARY_PATH "cudnn" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu)
-        if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put \
-                cudnn header file in cuda include path or user include path(eg. /usr/local/cuda/include; \
-                /usr/local/include; /usr/include), if cudnn library is installed in other position, please \
-                set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h \
-                in {CUDNN_HOME}/include.")
-        endif()
-        if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND)
-            message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put \
-                cudnn library file in cuda library path or user library path(eg. /usr/local/cuda/lib64; \
-                /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib), if cudnn library is installed in other \
-                position, please set environment variable CUDNN_HOME to cudnn installation position, there should \
-                be cudnn library file in {CUDNN_HOME}/lib64.")
-        endif()
-    endif()
-
-    if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "")
-        set(CUPTI_INCLUDE_DIRS ${CUDA_PATH}/extras/CUPTI/include)
-    endif()
-    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:${CUDA_PATH}/lib64)
-    message("CUDA_PATH: ${CUDA_PATH}")
-    message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
-    message("CUDNN_INCLUDE_PATH: ${CUDNN_INCLUDE_PATH}")
-    message("CUDNN_LIBRARY_PATH: ${CUDNN_LIBRARY_PATH}")
-    message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}")
-    include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
-
-    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-        "plugin/device/gpu/hal/device/*.cc"
-        "plugin/device/gpu/hal/device/*.cu"
-        "plugin/device/gpu/kernel/*.cu"
-    )
-
-    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/kernel/cuda_impl/cuda_ops/*.cu")
-
-    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
-    if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-        list(APPEND CUDA_NVCC_FLAGS -G)
-        message("CUDA_NVCC_FLAGS" ${CUDA_NVCC_FLAGS})
-    endif()
-    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/mpi/mpi_initializer.cc"
-        "plugin/device/gpu/hal/device/distribution/collective_wrapper.cc"
-        "plugin/device/gpu/hal/device/distribution/mpi_wrapper.cc"
-        "plugin/device/gpu/hal/device/distribution/nccl_wrapper.cc"
-        "plugin/device/gpu/hal/device/trt_loader.cc")
-
-    if(NOT ${TENSORRT_HOME} STREQUAL "")
-        find_path(TENSORRT_HOME_INCLUDE NvInfer.h HINTS ${TENSORRT_HOME}/include)
-        if(TENSORRT_HOME_INCLUDE STREQUAL TENSORRT_HOME_INCLUDE-NOTFOUND)
-            message(FATAL_ERROR "Tensor-RT dir not exist ${TENSORRT_HOME}")
-        endif()
-        message("Enable GPU inference. Tensor-RT include dir: ${TENSORRT_HOME_INCLUDE}")
-        set(ENABLE_GPU_INFER TRUE)
-        add_compile_definitions(ENABLE_GPU_INFER)
-        include_directories(${TENSORRT_HOME_INCLUDE})
-        list(APPEND GPU_SRC_LIST ${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/gpu/hal/device/trt_loader.cc)
-    endif()
-
-    set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    if(${CUDA_VERSION} VERSION_LESS 11.0)
-        string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    else()
-        string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    endif()
-    set_property(SOURCE ${GPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
-    cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
-    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
-    add_compile_definitions(ENABLE_GPU)
-    add_dependencies(gpu_cuda_lib proto_input)
-    add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops)
-endif()
-
 ## make protobuf files
 file(GLOB ONNX_PROTO "" ${CMAKE_SOURCE_DIR}/third_party/proto/onnx/onnx.proto)
 message("onnx proto path is :" ${ONNX_PROTO})
@@ -328,22 +288,11 @@ set(BACKEND_SUB_COMP
     runtime/hardware
     runtime/pynative
     runtime/data_queue
-    plugin/device/ascend/hal/device
-    plugin/device/ascend/hal/hardware
-    plugin/device/ascend/hal/hccl_adapter
-    plugin/device/ascend/hal/profiler
-    plugin/device/ascend/kernel
-    plugin/device/ascend/optimizer
     plugin/device/cpu/hal/device
     plugin/device/cpu/hal/hardware
     plugin/device/cpu/hal/profiler
     plugin/device/cpu/kernel
    plugin/device/cpu/optimizer
-    plugin/device/gpu/hal/device
-    plugin/device/gpu/hal/hardware
-    plugin/device/gpu/hal/profiler
-    plugin/device/gpu/kernel
-    plugin/device/gpu/optimizer
     transform/graph_ir
 )
 
@@ -370,14 +319,19 @@ endif()
 set_property(SOURCE ${BACKEND_SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
 add_library(mindspore_backend SHARED ${BACKEND_SUB_OBJECTS_SRC})
+add_library(mindspore_backend_common STATIC ${BACKEND_SUB_OBJECTS_SRC})
+
 if(MODE_ASCEND_ACL)
     add_library(mindspore_backend_static STATIC ${BACKEND_SUB_OBJECTS_SRC})
 endif()
+
 if(CMAKE_SYSTEM_NAME MATCHES "Windows")
     target_link_libraries(mindspore_backend PRIVATE mindspore::pybind11_module)
 endif()
+
 target_link_libraries(mindspore_backend PRIVATE mindspore_core mindspore_common proto_input mindspore::protobuf)
 target_link_libraries(mindspore_backend PRIVATE securec)
+
 if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
     set_target_properties(mindspore_backend PROPERTIES MACOSX_RPATH ON)
     set_target_properties(mindspore_backend PROPERTIES INSTALL_RPATH @loader_path)
@@ -388,17 +342,6 @@ endif()
 if(ENABLE_CPU)
     target_link_libraries(mindspore_backend PRIVATE mindspore::dnnl mindspore::mkldnn nnacl)
 endif()
-if(ENABLE_GPU)
-    message("add gpu lib to mindspore_backend")
-    target_link_libraries(mindspore_backend PRIVATE gpu_cuda_lib cublas cuda_ops
-        ${CUDA_PATH}/lib64/libcurand.so
-        ${CUDNN_LIBRARY_PATH}
-        ${CUDA_PATH}/lib64/libcudart.so
-        ${CUDA_PATH}/lib64/stubs/libcuda.so
-        ${CUDA_PATH}/lib64/libcusolver.so
-        ${CUDA_PATH}/lib64/libcufft.so
-        ${CUDA_PATH}/lib64/libcusparse.so)
-endif()
 
 if(NOT WIN32)
     target_link_libraries(mindspore_backend PRIVATE mindspore::ssl mindspore::crypto)
@@ -421,28 +364,7 @@ elseif(ENABLE_CPU AND NOT WIN32)
                           -Wl,--no-as-needed mindspore::event_core ps_cache)
 endif()
 
-if(ENABLE_D)
-    find_library(GE_RUNNER ge_runner ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(GRAPH graph ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(HCCL hccl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    target_link_libraries(mindspore_backend PUBLIC ${GE_RUNNER} ${GRAPH} ${HCCL})
-endif()
-
 if(MODE_ASCEND_ALL)
-    MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
-    find_library(ERROR_MANAGER error_manager ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(RUNTIME_LIB runtime ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(TSDCLIENT tsdclient HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(DATATRANSFER datatransfer HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(PROFILING msprofiler ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(ACL ascendcl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(ACL_TDT_CHANNEL acl_tdt_channel ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(PLATFORM platform ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(OPT_FEATURE opt_feature ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(adump_server libadump_server.a ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
-    find_library(OPTILING optiling ${ASCEND_CANN_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH})
-    target_link_libraries(mindspore_backend PUBLIC ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER}
-        -Wl,--no-as-needed ${OPTILING} ${PLATFORM} ${ACL} ${ACL_TDT_CHANNEL} ${OPT_FEATURE} ${PROFILING})
     target_link_libraries(mindspore PUBLIC -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
 elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
     target_link_libraries(mindspore PUBLIC -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece
@@ -473,6 +395,18 @@ else()
 endif()
 set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH})
 
+if(ENABLE_D)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/ascend)
+    add_subdirectory(plugin/device/ascend)
+    target_link_libraries(mindspore_backend PRIVATE mindspore_ascend)
+endif()
+
+if(ENABLE_GPU)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/plugin/device/gpu)
+    add_subdirectory(plugin/device/gpu)
+    target_link_libraries(mindspore_backend PRIVATE mindspore_gpu)
+endif()
+
 set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
 
 if(CMAKE_SYSTEM_NAME MATCHES "Windows")
@@ -526,16 +460,6 @@ if(ENABLE_MINDDATA)
     add_subdirectory(minddata/dataset)
 endif()
 
-if(MODE_ASCEND_ALL)
-    target_link_libraries(_c_expression PRIVATE ${adump_server})
-endif()
-
-if(ENABLE_D)
-    if(ENABLE_MPI)
-        set_target_properties(_ascend_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
-    endif()
-endif()
-
 if(ENABLE_TEST OR ENABLE_TESTCASES)
     include_directories(${CMAKE_BINARY_DIR})
     list(APPEND STUB_COMMON_SOURCE ${CMAKE_SOURCE_DIR}/tests/ut/cpp/stub/ge/ge_operator_stub.cc)
diff --git a/mindspore/ccsrc/backend/common/optimizer/helper.cc b/mindspore/ccsrc/backend/common/optimizer/helper.cc
index 763434dbb17..6957f2f7696 100644
--- a/mindspore/ccsrc/backend/common/optimizer/helper.cc
+++ b/mindspore/ccsrc/backend/common/optimizer/helper.cc
@@ -734,10 +734,10 @@ AbstractBasePtrList RectifyAbstractFromRegAttr(const PrimitivePtr &primitive,
   MS_EXCEPTION_IF_NULL(ms_context);
   auto device = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
   if (device == kGPUDevice) {
-    if (DynamicShapeConstInputToAttrGPU.find(primitive->name()) != DynamicShapeConstInputToAttrGPU.end()) {
+    if (IsOneOfDynamicShapeConstInputToAttrGPU(primitive->name())) {
       return input_abstract;
     }
-  } else if (DynamicShapeConstInputToAttr.find(primitive->name()) != DynamicShapeConstInputToAttr.end()) {
+  } else if (IsOneOfDynamicShapeConstInputToAttr(primitive->name())) {
     return input_abstract;
   }
   auto convert_input_list = reg.GetConstInputAttrInfo();
diff --git a/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc b/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc
index e5748b34213..1de58be3148 100644
--- a/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc
+++ b/mindspore/ccsrc/backend/common/pass/custom_op_reg_info_to_attr.cc
@@ -133,7 +133,7 @@ const AnfNodePtr CustomOpRegInfoToAttr::Process(const FuncGraphPtr &, const AnfN
   MS_EXCEPTION_IF_NULL(primitive);
   auto func_type = common::AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrFuncType);
   // AKG/AICPU need to process attr, TBE will process later in the json creating phase.
-  if (kCustomTypeAkg.find(func_type) == kCustomTypeAkg.end() || func_type == kCustomTypeAICPU) {
+  if (!IsOneOfCustomAkgType(func_type) || func_type == kCustomTypeAICPU) {
     return nullptr;
   }
   // Early return if current node does not have attr
diff --git a/mindspore/ccsrc/backend/common/somas/somas.h b/mindspore/ccsrc/backend/common/somas/somas.h
index abaaeacac21..3a63647b2c7 100644
--- a/mindspore/ccsrc/backend/common/somas/somas.h
+++ b/mindspore/ccsrc/backend/common/somas/somas.h
@@ -35,6 +35,7 @@
 #include "include/common/utils/anfalgo.h"
 #include "backend/common/session/kernel_graph.h"
 #include "runtime/hardware/device_type.h"
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace somas {
@@ -248,7 +249,7 @@ using SomasPtr = std::shared_ptr<Somas>;
 using SomasCreator = std::function<std::shared_ptr<Somas>()>;
 
 // @todo will delete when old runtime remove
-class SomasManager {
+class BACKEND_EXPORT SomasManager {
  public:
   static SomasManager &Instance() {
     static SomasManager instance{};
diff --git a/mindspore/ccsrc/backend/graph_compiler/backend.cc b/mindspore/ccsrc/backend/graph_compiler/backend.cc
index 4a4389de041..a9385f3ac5b 100644
--- a/mindspore/ccsrc/backend/graph_compiler/backend.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/backend.cc
@@ -399,7 +399,7 @@ void ClearInputDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *d
 }
 
 bool OpInBlackList(const session::BackendOpRunInfoPtr &op_run_info) {
-  return kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) != kOpCacheBlackList.end();
+  return IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name);
 }
 
 int GetExecutionMode() {
diff --git a/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc b/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc
index 8b9c78dfad2..202ae5048b3 100644
--- a/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/graph_partition.cc
@@ -28,9 +28,6 @@
 #include "utils/ms_context.h"
 #include "ps/ps_context.h"
 #include "utils/anf_utils.h"
-#ifdef ENABLE_D
-#include "include/transform/graph_ir/utils.h"
-#endif
 namespace mindspore {
 const char kMsConvert[] = "ms";
 const char kMsVm[] = "vm";
diff --git a/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc b/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc
index 5ecf1755939..d0461729db5 100644
--- a/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.cc
@@ -194,12 +194,17 @@ void CallbackImpl::ResetKernelInfo(const AnfNodePtr &node) {
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   if (GetTargetFromContext() == kAscendDevice) {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
     auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kAscendDevice);
+    MS_EXCEPTION_IF_NULL(kernel_info_setter);
     kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
   } else if (GetTargetFromContext() == kGPUDevice) {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
     auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kGPUDevice);
+    MS_EXCEPTION_IF_NULL(kernel_info_setter);
     kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
   } else {
+    cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
     auto kernel_info_setter = GraphKernelInfoManager::Instance().GetGraphKernelInfo(kCPUDevice);
     if (kernel_info_setter != nullptr) {
       kernel_info_setter->SetKernelInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
diff --git a/mindspore/ccsrc/common/graph_kernel/model/op_register.h b/mindspore/ccsrc/common/graph_kernel/model/op_register.h
index fd2be0880e4..d3f5ea4d585 100644
--- a/mindspore/ccsrc/common/graph_kernel/model/op_register.h
+++ b/mindspore/ccsrc/common/graph_kernel/model/op_register.h
@@ -21,10 +21,11 @@
 
 #include "utils/hash_map.h"
 #include "common/graph_kernel/model/op_node.h"
+#include "include/backend/visible.h"
 
 namespace mindspore::graphkernel::inner {
 using CreatorFunc = std::function<PrimOpPtr(const std::string &)>;
-class OpRegistry {
+class BACKEND_EXPORT OpRegistry {
  public:
   static OpRegistry &Instance() {
     static OpRegistry instance{};
diff --git a/mindspore/ccsrc/cxx_api/CMakeLists.txt b/mindspore/ccsrc/cxx_api/CMakeLists.txt
index 8641d2bedcd..a3dabc68549 100644
--- a/mindspore/ccsrc/cxx_api/CMakeLists.txt
+++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt
@@ -62,7 +62,13 @@ if(ENABLE_D)
         "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/array.cc"
         "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/map.cc"
         "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/arrangement.cc"
-        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/shape_util.cc")
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/frontend/parallel/tensor_layout/shape_util.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/pattern_engine.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/helper.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/node_pass.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/backend/common/optimizer/visit.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/kernel/kernel_build_info.cc"
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/device/kernel_info.cc")
 endif()
 
 if(NOT ENABLE_TESTCASES AND NOT BUILD_LITE)
@@ -70,11 +76,22 @@ if(NOT ENABLE_TESTCASES AND NOT BUILD_LITE)
     set(MSLIB_SRC ${MSLIB_SRC} ${CMAKE_SOURCE_DIR}/mindspore/core/utils/status.cc)
 endif()
 
+if(ENABLE_D OR ENABLE_ACL)
+    list(APPEND MSLIB_SRC
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_placeholder_for_dynamic_rnn.cc")
+endif()
+
+if(ENABLE_GPU)
+    list(APPEND MSLIB_SRC "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc")
+endif()
+
 if(BUILD_LITE)
     list(APPEND MSLIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/../../../mindspore/ccsrc/utils/config_manager.cc")
     file(GLOB_RECURSE ACL_REMOVE_SRC ${CMAKE_CURRENT_SOURCE_DIR}
         "model/acl/acl_vm/*.cc"
         )
+    list(REMOVE_ITEM MSLIB_SRC
+        "${CMAKE_SOURCE_DIR}/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_placeholder_for_dynamic_rnn.cc")
     list(REMOVE_ITEM MSLIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/akg_kernel_register.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/model/acl/acl_model_multi.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/model/acl/acl_model.cc"
@@ -157,7 +174,8 @@ if(ENABLE_D)
 endif()
 
 if(ENABLE_GPU)
-    target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib cublas cuda_ops
+    target_link_libraries(mindspore_shared_lib PRIVATE cuda_ops
+        ${CUBLAS_LIBRARY_PATH}
         ${CUDA_PATH}/lib64/libcurand.so
         ${CUDNN_LIBRARY_PATH}
         ${CUDA_PATH}/lib64/libcudart.so
diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt
index 7052db5798d..14ed40fa420 100644
--- a/mindspore/ccsrc/debug/CMakeLists.txt
+++ b/mindspore/ccsrc/debug/CMakeLists.txt
@@ -15,9 +15,6 @@ set(_OFFLINE_SRC_LIST
 
 if(ENABLE_DUMP_IR)
     file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc")
-    if(NOT ENABLE_D)
-        list(REMOVE_ITEM _RDR_SRC_LIST "rdr/task_debug_info_recorder.cc")
-    endif()
 endif()
 
 if("${ENABLE_HIDDEN}" STREQUAL "OFF")
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h
index 42844a747b2..b794b147a6b 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_or_list_eliminate.h
@@ -21,6 +21,7 @@
 #include <algorithm>
 #include <memory>
 #include <vector>
+#include <map>
 
 #include "frontend/optimizer/optimizer_caller.h"
 #include "frontend/optimizer/anf_visitor.h"
@@ -40,6 +41,8 @@ namespace irpass {
 // {prim::kPrimListGetItem, L, N}
 // {prim::kPrimTupleSetItem, T, N, Z}
 // {prim::kPrimListSetItem, L, N, Z}
+const std::map<std::string, size_t> kSliceAttrToIndex = {{kSliceStart, 1}, {kSliceStop, 2}, {kSliceStep, 3}};
+
 class TupleListConvertItemIndexToPositive : public AnfVisitor {
  public:
   AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
diff --git a/mindspore/ccsrc/include/common/utils/utils.h b/mindspore/ccsrc/include/common/utils/utils.h
index 625ecca0e5f..59906a1f542 100644
--- a/mindspore/ccsrc/include/common/utils/utils.h
+++ b/mindspore/ccsrc/include/common/utils/utils.h
@@ -29,6 +29,7 @@
 
 #include "utils/log_adapter.h"
 #include "ir/dtype/type.h"
+#include "include/common/visible.h"
 
 namespace mindspore {
 // op name. Op which not exists in operator/ops.h, so define it's name here
@@ -661,7 +662,6 @@ constexpr auto kCustomTypePyfunc = "pyfunc";
 constexpr auto kCustomTypeTbe = "tbe";
 constexpr auto kCustomTypeAICPU = "aicpu";
 constexpr auto kCustomTypeHybrid = "hybrid";
-const std::set<std::string> kCustomTypeAkg = {"ir_builder", "tvm_compute", "hybrid"};
 
 // primal attr key name
 constexpr auto kPrimalAttrForwardNodeName = "forward_node_name";
@@ -791,145 +791,23 @@ constexpr auto kOpFormat_FRACTAL_Z_3D = "FRACTAL_Z_3D";
 constexpr auto kOpFormat_FRACTAL_ZN_LSTM = "FRACTAL_ZN_LSTM";
 constexpr auto kOpFormat_FRACTAL_ZN_RNN = "FRACTAL_ZN_RNN";
 constexpr auto kOpFormat_ND_RNN_BIAS = "ND_RNN_BIAS";
-
-const std::set<std::string> kOpFormatList = {kOpFormat_DEFAULT,
-                                             kOpFormat_NC1KHKWHWC0,
-                                             kOpFormat_ND,
-                                             kOpFormat_NCHW,
-                                             kOpFormat_NHWC,
-                                             kOpFormat_HWCN,
-                                             kOpFormat_NC1HWC0,
-                                             kOpFormat_FRAC_Z,
-                                             kOpFormat_C1HWNCoC0,
-                                             kOpFormat_FRAC_NZ,
-                                             kOpFormat_NC1HWC0_C04,
-                                             kOpFormat_FRACTAL_Z_C04,
-                                             kOpFormat_NDHWC,
-                                             kOpFormat_FRACTAL_ZN_LSTM,
-                                             kOpFormat_FRACTAL_ZN_RNN,
-                                             kOpFormat_ND_RNN_BIAS,
-                                             kOpFormat_NDC1HWC0,
-                                             kOpFormat_NCDHW,
-                                             kOpFormat_FRACTAL_Z_3D,
-                                             kOpFormat_DHWNC,
-                                             kOpFormat_DHWCN};
-
 constexpr auto kSliceStart = "start";
 constexpr auto kSliceStop = "stop";
 constexpr auto kSliceStep = "step";
-const std::map<std::string, size_t> kSliceAttrToIndex = {{kSliceStart, 1}, {kSliceStop, 2}, {kSliceStep, 3}};
 
-const std::set<std::string> kDefaultCompatibleFormat = {kOpFormat_ND, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_HWCN,
-                                                        kOpFormat_NCDHW};
-
-const std::set<std::string> kOptOperatorSet = {kMomentumOpName,
-                                               kApplyMomentumOpName,
-                                               kApplyAdadeltaOpName,
-                                               kApplyAdagradOpName,
-                                               kApplyAdagradDAName,
-                                               kApplyAdamOpName,
-                                               kApplyAdaMaxOpName,
-                                               kApplyAddSignOpName,
-                                               kApplyCenteredRMSPOpName,
-                                               kApplyFtrlOpName,
-                                               kApplyFtrlV2OpName,
-                                               kApplyGradientDescentOpName,
-                                               kApplyPowerSignOpName,
-                                               kApplyProximalAdagradOpName,
-                                               kApplyProximalGradientDescentOpName,
-                                               kApplyRMSPropOpName,
-                                               kAdamApplyOneWithDecayOpName,
-                                               kAdamApplyOneWithDecayAssignOpName,
-                                               kFusedAdamWeightDecayName,
-                                               kAdamWeightDecayName,
-                                               kFusedCastAdamWeightDecayName,
-                                               kFusedAdamName,
-                                               kFusedAdaFactorName,
-                                               kFusedAdaFactorWithGlobalNormName,
-                                               kFusedSparseAdamName,
-                                               kFusedMulApplyMomentumOpName,
-                                               kFusedWeightScaleApplyMomentum,
-                                               kFusedScaleApplyMomentum,
-                                               kApplyCenteredRMSPropOpName,
-                                               kFusedSparseFtrlName,
-                                               kFusedSparseProximalAdagradName,
-                                               kFusedSparseLazyAdamName,
-                                               kSparseApplyFtrlName,
-                                               kSparseApplyFtrlV2Name,
-                                               kSGDName,
-                                               kLARSUpdateName,
-                                               kCombineMomentumWeightOpName,
-                                               kCombineMomentumOpName,
-                                               kScatterAddOpName,
-                                               kScatterUpdateOpName,
-                                               kSparseApplyProximalAdagradOpName};
-
-const std::set<std::string> kNodeWithSeedOperators = {kGammaOpName, kPoissonOpName, kStandardLaplaceOpName,
-                                                      kStandardNormalOpName, kUniformIntOpName, kUniformRealOpName};
-const std::set<std::string> kPosteriorOperatorSet = {kPullOpName};
-
-const std::set<std::string> kOpCacheBlackList = {kUniformCandidateSamplerOpName, kInitDatasetQueueOpName,
-                                                 kGetNextOpName};
-
-const std::set<std::string> kOpNotSupportMultiThreadExecList = {kAvgPoolOpName, kAvgPoolGradOpName, kMaxPoolOpName,
-                                                                kBatchNorm, kBatchNormGradOpName};
-
-const std::set<std::string> kHWSpecialFormatSet = {
-  kOpFormat_FRACTAL_Z_3D,   kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0,       kOpFormat_FRAC_NZ,
-  kOpFormat_C1HWNCoC0,      kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04, kOpFormat_FRACTAL_ZN_LSTM,
-  kOpFormat_FRACTAL_ZN_RNN, kOpFormat_NDC1HWC0,    kOpFormat_FRAC_Z};
-
-const std::set<TypeId> kFloatDataTypeSet = {kNumberTypeFloat16, kNumberTypeFloat32};
-
-const std::set<std::string> kComputeDepend = {kUniqueOpName,
-                                              kUniqueConsecutiveOpName,
-                                              kComputeAccidentalHitsOpName,
-                                              kSubAndFilterOpName,
-                                              kPadAndShiftOpName,
-                                              kCTCGreedyDecoderOpName,
-                                              kMaskedSelectOpName,
-                                              kDynamicStitchOpName,
-                                              kGetNextOpName,
-                                              kListDiffOpName,
-                                              kNonMaxSuppressionV3OpName,
-                                              kNonMaxSuppressionWithOverlapsOpName,
-                                              kCoalesceOpName,
-                                              kTruncatedNormal,
-                                              kNonDeterministicInts,
-                                              kFractionalAvgPoolGradOpName,
-                                              kDenseToDenseSetOperation,
-                                              kSegmentMaxOpName,
-                                              kCSRSparseMatrixToSparseTensorOpName,
-                                              kSegmentMinOpName,
-                                              kLuUnpackOpName,
-                                              kSegmentSumOpName,
-                                              kResizeBicubicOpName,
-                                              kResizeAreaOpName,
-                                              kSegmentMeanOpName,
-                                              kSegmentProdOpName,
-                                              kNonZeroOpName,
-                                              kSparseSparseMinimumOpName,
-                                              kRpcRecvOpName,
-                                              kAdaptiveMaxPool3DGradOpName};
-
-const std::set<std::string> k3DFormatSet = {kOpFormat_NCDHW, kOpFormat_NDC1HWC0, kOpFormat_FRACTAL_Z_3D,
-                                            kOpFormat_NDHWC, kOpFormat_DHWCN, kOpFormat_DHWNC};
-
-const std::set<std::string> kNoPaddingFormatSet = {kOpFormat_ChannelLast, kOpFormat_FRAC_NZ, kOpFormat_FRACTAL_ZN_RNN,
-                                                   kOpFormat_ND_RNN_BIAS};
-
-const std::set<std::string> DynamicShapeConstInputToAttr = {
-  kCastOpName,      kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName,
-  kReduceMaxOpName, kReduceAllOpName,  kReduceAnyOpName,       kConcatOpName,    kTransposeOpName};
-
-const std::set<std::string> DynamicShapeConstInputToAttrCPU = {
-  kCastOpName,      kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName, kReduceMaxOpName,
-  kReduceAllOpName, kReduceAnyOpName,  kConcatOpName,          kReduceSumOpName, kTransposeOpName};
-
-const std::set<std::string> DynamicShapeConstInputToAttrGPU = {
-  kCastOpName,      kExpandDimsOpName, kReshapeOpName,    kEmbeddingLookupOpName, kTransposeOpName,
-  kReduceSumOpName, kReduceMinOpName,  kReduceMeanOpName, kReduceMaxOpName,       kReduceAllOpName,
-  kReduceAnyOpName, kConcatOpName,     kScatterNdOpName,  kGatherV2OpName,        kAvgPool3DGradOpName};
+COMMON_EXPORT bool IsOneOfCustomAkgType(const std::string &name);
+COMMON_EXPORT bool IsOneOfOperator(const std::string &name);
+COMMON_EXPORT bool IsOneOfPosteriorOperator(const std::string &name);
+COMMON_EXPORT bool IsOneOfCacheBlackList(const std::string &name);
+COMMON_EXPORT bool IsOneOfNotSupportMultiThreadExec(const std::string &name);
+COMMON_EXPORT bool IsOneOf3DFormat(const std::string &format);
+COMMON_EXPORT bool IsOneOfNoPaddingFormat(const std::string &format);
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttr(const std::string &name);
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttrCPU(const std::string &name);
+COMMON_EXPORT bool IsOneOfDynamicShapeConstInputToAttrGPU(const std::string &name);
+COMMON_EXPORT bool IsOneOfComputeDepend(const std::string &name);
+COMMON_EXPORT bool IsOneOfHWSpecialFormat(const std::string &format);
+COMMON_EXPORT bool IsOneOfFormat(const std::string &format);
 
 // The map between kernel's output and input ref relationship.
 // Key is the output index while the value is input index which will be used as the reference of output.
diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h b/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
index cdfe5983b96..61374fbdb46 100644
--- a/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
+++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
@@ -21,12 +21,13 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace kernel {
 using AkgKernelBuildCreator = std::function<std::shared_ptr<AkgKernelBuilder>()>;
 
-class AkgKernelBuildManager {
+class BACKEND_EXPORT AkgKernelBuildManager {
  public:
   static AkgKernelBuildManager &Instance();
   void Register(const std::string &device_type, AkgKernelBuildCreator &&creator);
diff --git a/mindspore/ccsrc/kernel/graph_kernel_info.h b/mindspore/ccsrc/kernel/graph_kernel_info.h
index fe721fa3f63..7710d5aa5e0 100644
--- a/mindspore/ccsrc/kernel/graph_kernel_info.h
+++ b/mindspore/ccsrc/kernel/graph_kernel_info.h
@@ -25,6 +25,7 @@
 #include "ir/dtype.h"
 #include "ir/kernel_info_dev.h"
 #include "kernel/kernel.h"
+#include "include/backend/visible.h"
 namespace mindspore {
 class GraphKernelInfo {
  public:
@@ -35,7 +36,7 @@ class GraphKernelInfo {
 
 using GraphKernelInfoCreator = std::function<std::shared_ptr<GraphKernelInfo>()>;
 
-class GraphKernelInfoManager {
+class BACKEND_EXPORT GraphKernelInfoManager {
  public:
   static GraphKernelInfoManager &Instance() {
     static GraphKernelInfoManager instance{};
@@ -52,6 +53,7 @@ class GraphKernelInfoManager {
       MS_EXCEPTION_IF_NULL(iter->second);
       return (iter->second)();
     }
+    MS_LOG(WARNING) << "Can not get a graph kernel info ptr on device: " << device_type;
     return nullptr;
   }
diff --git a/mindspore/ccsrc/kernel/oplib/oplib.cc b/mindspore/ccsrc/kernel/oplib/oplib.cc
index 7ea5a693087..80b57723ff8 100644
--- a/mindspore/ccsrc/kernel/oplib/oplib.cc
+++ b/mindspore/ccsrc/kernel/oplib/oplib.cc
@@ -67,7 +67,6 @@ constexpr auto kFormat = "format";
 constexpr auto kNeedCompile = "need_compile";
 constexpr auto kShape = "shape";
 constexpr auto kProcessor = "processor";
-std::multimap<std::string, std::shared_ptr<OpInfo>> OpLib::op_info_;
 
 static std::string ImplTypeToStr(OpImplyType impl_type) {
   switch (impl_type) {
diff --git a/mindspore/ccsrc/kernel/oplib/oplib.h b/mindspore/ccsrc/kernel/oplib/oplib.h
index bc96f589a06..9df44f5473d 100644
--- a/mindspore/ccsrc/kernel/oplib/oplib.h
+++ b/mindspore/ccsrc/kernel/oplib/oplib.h
@@ -27,16 +27,16 @@
 
 namespace mindspore {
 namespace kernel {
-class OpLib {
+class BACKEND_EXPORT OpLib {
  public:
   OpLib() = default;
   virtual ~OpLib() = default;
-  BACKEND_EXPORT static bool RegOp(const std::string &json_string, const std::string &impl_path);
+  static bool RegOp(const std::string &json_string, const std::string &impl_path);
   static std::shared_ptr<OpInfo> FindOp(const std::string &op_name, OpImplyType imply_type,
                                         bool is_dynamic_shape = false);
 
  protected:
-  static std::multimap<std::string, std::shared_ptr<OpInfo>> op_info_;
+  inline static std::multimap<std::string, std::shared_ptr<OpInfo>> op_info_ = {};
 
 private:
   static bool RegOpFromLocalInfo();
diff --git a/mindspore/ccsrc/pipeline/jit/init.cc b/mindspore/ccsrc/pipeline/jit/init.cc
index b9e498cc0a2..6318b50888b 100644
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@@ -35,7 +35,7 @@
 #ifdef ENABLE_GPU_COLLECTIVE
 #include "plugin/device/gpu/hal/device/distribution/collective_init.h"
 #else
-#include "plugin/device/gpu/hal/device/distribution/collective_fake_init.h"
+#include "runtime/collective/collective_fake_init.h"
 #endif
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
 #include "ps/util.h"
diff --git a/mindspore/ccsrc/plugin/device/ascend/CMakeLists.txt b/mindspore/ccsrc/plugin/device/ascend/CMakeLists.txt
new file mode 100644
index 00000000000..e4c23cd344a
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/CMakeLists.txt
@@ -0,0 +1,85 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_BINARY_DIR})
+
+if(ENABLE_D OR ENABLE_ACL)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/lib64)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/lib64)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/lib64)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+    set(MINDSPORE_RPATH
+        ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+    set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+endif()
+
+########### mindspore_ascend.so #####
+set(ASCEND_SUB_COMP
+    hal/device
+    hal/hardware
+    hal/hccl_adapter
+    hal/profiler
+    kernel
+    optimizer
+    )
+
+foreach(a_comp ${ASCEND_SUB_COMP})
+    add_subdirectory(${a_comp})
+    string(REPLACE "/" "_" sub ${a_comp})
+    if(TARGET _mindspore_plugin_device_ascend_${sub}_obj)
+        list(APPEND ASCEND_SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_plugin_device_ascend_${sub}_obj>)
+        add_dependencies(_mindspore_plugin_device_ascend_${sub}_obj proto_input)
+    endif()
+endforeach()
+
+add_library(mindspore_ascend SHARED ${ASCEND_SUB_OBJECTS_SRC})
+target_link_libraries(mindspore_ascend PUBLIC mindspore_backend_common)
+target_link_libraries(mindspore_ascend PRIVATE mindspore_core mindspore_common proto_input mindspore::protobuf)
+target_link_libraries(mindspore_ascend PRIVATE securec)
+
+set_target_properties(mindspore_ascend PROPERTIES INSTALL_RPATH $ORIGIN)
+target_link_libraries(mindspore_ascend PRIVATE mindspore::dnnl mindspore::mkldnn mindspore::ssl
+        mindspore::crypto nnacl)
+
+if(ENABLE_DEBUGGER)
+    # debugger: link grpc
+    if(ENABLE_D)
+        target_link_libraries(mindspore_ascend PRIVATE -Wl,--no-as-needed mindspore::grpc++)
+    endif()
+endif()
+
+if(ENABLE_D)
+    find_library(GE_RUNNER ge_runner ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(GRAPH graph ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(HCCL hccl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    target_link_libraries(mindspore_ascend PUBLIC ${GE_RUNNER} ${GRAPH} ${HCCL})
+    target_link_libraries(mindspore_ascend PRIVATE mindspore::event mindspore::event_pthreads
+        mindspore::event_openssl -Wl,--no-as-needed mindspore::event_core ps_cache)
+endif()
+
+if(MODE_ASCEND_ALL)
+    MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
+    find_library(ERROR_MANAGER error_manager ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(RUNTIME_LIB runtime ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(TSDCLIENT tsdclient HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(DATATRANSFER datatransfer HINTS ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(PROFILING msprofiler ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(ACL ascendcl ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(ACL_TDT_CHANNEL acl_tdt_channel ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(PLATFORM platform ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(OPT_FEATURE opt_feature ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(adump_server libadump_server.a ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(OPTILING optiling ${ASCEND_CANN_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH})
+
+    target_link_libraries(mindspore_ascend PUBLIC ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER}
+        -Wl,--no-as-needed ${OPTILING} ${PLATFORM} ${ACL} ${ACL_TDT_CHANNEL} ${OPT_FEATURE} ${PROFILING})
+    target_link_libraries(mindspore_ascend PRIVATE ${adump_server})
+endif()
+
+if(ENABLE_D)
+    if(ENABLE_MPI)
+        set_target_properties(_ascend_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
+    endif()
+endif()
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kernel/aicpu/aicpu_ops)
+add_subdirectory(kernel/aicpu/aicpu_ops)
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h
index e241d4ee458..3724265f9f8 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_data_queue.h
@@ -24,10 +24,11 @@
 #include "runtime/hardware/device_context_manager.h"
 #include "runtime/data_queue/data_queue.h"
 #include "runtime/rt.h"
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace device {
-class AscendDataQueueDynamic : public DataQueue {
+class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
 public:
   explicit AscendDataQueueDynamic(const size_t capacity);
   virtual ~AscendDataQueueDynamic() = default;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc
index 706c19b2b83..abea399fa1a 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_select_ascend.cc
@@ -83,8 +83,7 @@ string GetPriorityMatchFormat(const CNodePtr &cnode) {
   size_t input_num = common::AnfAlgo::GetInputTensorNum(cnode);
   for (size_t index = 0; index < input_num; ++index) {
     auto pre_output_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, index);
-    if (AnfAlgo::IsFeatureMapInput(cnode, index) &&
-        kHWSpecialFormatSet.find(pre_output_format) != kHWSpecialFormatSet.end()) {
+    if (AnfAlgo::IsFeatureMapInput(cnode, index) && IsOneOfHWSpecialFormat(pre_output_format)) {
       priority_matched_format = !is_init ? pre_output_format : priority_matched_format;
       is_init = true;
     }
@@ -494,7 +493,7 @@ KernelSelectStatus SelectCustomKernelInfo(const CNodePtr &kernel_node, KernelTyp
   auto func_type = common::AnfAlgo::GetNodeAttr<std::string>(kernel_node, kAttrFuncType);
   if (func_type == kCustomTypeTbe) {
     *kernel_type = KernelType::TBE_KERNEL;
-  } else if (kCustomTypeAkg.find(func_type) != kCustomTypeAkg.end()) {
+  } else if (IsOneOfCustomAkgType(func_type)) {
     *kernel_type = KernelType::AKG_KERNEL;
   } else if (func_type == kCustomTypeAICPU) {
     *kernel_type = KernelType::AICPU_KERNEL;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc
index de2f2ab4baa..12a35a180ff 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.cc
@@ -27,6 +27,7 @@
 #include "runtime/base.h"
 #include <nlohmann/json.hpp>
 #include "plugin/device/ascend/hal/device/profiling/profiling_utils.h"
+#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
 
 using mindspore::device::ascend::ProfilingUtils;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc
index 68f62412116..94363853e43 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.cc
@@ -20,6 +20,7 @@
 #include "plugin/device/ascend/kernel/ascend_kernel_mod.h"
 #include "include/common/utils/utils.h"
 #include "backend/common/session/kernel_graph.h"
+#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
 
 namespace mindspore {
 namespace device {
diff --git a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.cc
similarity index 99%
rename from mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc
rename to mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.cc
index 9fa4b5c18ca..9117a8b307d 100644
--- a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ps/ps_cache/ascend/ascend_ps_cache.h"
+#include "plugin/device/ascend/hal/device/ps/ascend_ps_cache.h"
 #include <google/protobuf/text_format.h>
 #include <string>
 #include <vector>
diff --git a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.h
similarity index 92%
rename from mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h
rename to mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.h
index 9762dd9cc86..19de1157679 100644
--- a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ps/ascend_ps_cache.h
@@ -14,8 +14,8 @@
  * limitations under the License.
 */
-#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
-#define MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_
 
 #include <memory>
 #include <vector>
@@ -72,4 +72,4 @@ class AscendPsCache : public PsCacheBasic {
 }  // namespace ascend
 }  // namespace ps
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_PS_PS_CACHE_ASCEND_ASCEND_PS_CACHE_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_PS_ASCEND_PS_CACHE_H_
diff --git a/mindspore/ccsrc/debug/rdr/task_debug_info_recorder.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.cc
similarity index 95%
rename from mindspore/ccsrc/debug/rdr/task_debug_info_recorder.cc
rename to mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.cc
index 54b4decd5c9..f8641b7de9c 100644
--- a/mindspore/ccsrc/debug/rdr/task_debug_info_recorder.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.cc
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include "debug/rdr/task_debug_info_recorder.h"
+#include "plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h"
 #include <utility>
 #include "plugin/device/ascend/hal/device/tasksink/task_generator.h"
 #include "include/common/debug/rdr/recorder_manager.h"
diff --git a/mindspore/ccsrc/debug/rdr/task_debug_info_recorder.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h
similarity index 85%
rename from mindspore/ccsrc/debug/rdr/task_debug_info_recorder.h
rename to mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h
index be538823885..e7cf5f0eaf4 100644
--- a/mindspore/ccsrc/debug/rdr/task_debug_info_recorder.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifndef MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
-#define MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
 #include <memory>
 #include <string>
 #include <vector>
@@ -49,4 +49,4 @@ bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
                          const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
 }  // namespace RDR
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_DEBUG_RDR_TASK_DEBUG_INFO_RECORDER_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_TASK_DEBUG_INFO_RECORDER_H_
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc
index 946a2311dd6..dd30bebd991 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/task_generator.cc
@@ -28,7 +28,7 @@
 #include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
 #endif
 #ifdef ENABLE_DUMP_IR
-#include "debug/rdr/task_debug_info_recorder.h"
+#include "plugin/device/ascend/hal/device/tasksink/task_debug_info_recorder.h"
 #endif
 #include "mindspore/core/utils/file_utils.h"
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h
index 3a15351592c..65ebba5fa61 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_utils.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_UTILS_H_
 
 #include <string>
+#include <vector>
 
 #include "plugin/device/ascend/hal/hardware/ascend_device_context.h"
 #include "backend/common/session/kernel_graph.h"
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h b/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h
index 1aa2d2cbad5..a2986a62d73 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/profiler/memory_profiling.h
@@ -23,6 +23,7 @@
 #include <string>
 #include <memory>
 #include <mutex>
 #include "utils/ms_context.h"
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace profiler {
@@ -107,8 +108,8 @@ class MemoryProfiling {
     return instance;
   }
 
-  std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
-  std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id) const;
+  BACKEND_EXPORT std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
+  BACKEND_EXPORT std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id) const;
   void SetDeviceMemSize(uint64_t size) { device_mem_size_ = size; }
   bool MemoryToPB();
   void SaveMemoryProfiling();
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc
index 9f01e6e4677..20e07e833e2 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_kernel_build.cc
@@ -439,7 +439,7 @@ void CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_p
 
   UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
   auto op_name = common::AnfAlgo::GetCNodeName(anf_node);
-  if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+  if (IsOneOfComputeDepend(op_name)) {
     shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
   }
   ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset, shape_type);
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc
index dd9d94f3611..82d96ed7fe5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/dynamic_aicpu_kernel_mod.cc
@@ -39,7 +39,7 @@ DynamicAicpuOpKernelMod::DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr)
   auto cnode = anf_node_ptr->cast<CNodePtr>();
   if (cnode != nullptr) {
     auto op_name = common::AnfAlgo::GetCNodeName(cnode);
-    if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+    if (IsOneOfComputeDepend(op_name)) {
       unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_COMPUTE;
     }
   }
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc
index 5682bd16d5a..72b52cd74a5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/ascend_kernel_mod.cc
@@ -47,7 +47,7 @@ bool AscendKernelMod::IsNeedRetrieveOutputShape() {
   MS_EXCEPTION_IF_NULL(cnode);
 
   auto op_name = common::AnfAlgo::GetCNodeName(cnode);
-  if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+  if (IsOneOfComputeDepend(op_name)) {
     is_need_retrieve_output_shape_ = true;
   }
   return is_need_retrieve_output_shape_;
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc
index 693e008ba0e..f2c249d8add 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc
@@ -166,8 +166,7 @@ void SingleTbeJsonCreator::GenInputDescJson(const AnfNodePtr &anf_node, size_t r
   auto def_format = TbeJsonUtils::IsNeedChangeDefaultFormat(anf_node) ? kOpFormat_NCDHW : kOpFormat_NCHW;
   auto format = AnfAlgo::GetInputFormat(anf_node, real_input_index);
   format = TbeAdapter::FormatPass(format, ori_shape.size());
-  format =
-    (def_format == kOpFormat_NCDHW && k3DFormatSet.find(format) == k3DFormatSet.end()) ? kOpFormat_NCDHW : format;
+  format = (def_format == kOpFormat_NCDHW && !IsOneOf3DFormat(format)) ? kOpFormat_NCDHW : format;
   auto d_type = AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index);
   (*input_desc)[kJDtype] = tbe::TypeIdToString(d_type);
   (*input_desc)[kJDataType] = GetJsonValue<std::string>(*input_desc, kJDtype);
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc
index 24c892e7df9..c246a4ea914 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/tbe_json_creator.cc
@@ -426,8 +426,7 @@ void TbeJsonCreator::GenDescJson(const AnfNodePtr &anf_node, size_t node_out_idx
   auto format = AnfAlgo::GetOutputFormat(anf_node, node_out_idx);
   format = tbe::TbeAdapter::FormatPass(format, ori_shape.size());
   auto def_format = TbeJsonUtils::IsNeedChangeDefaultFormat(anf_node) ? kOpFormat_NCDHW : kOpFormat_NCHW;
-  format =
-    (def_format == kOpFormat_NCDHW && k3DFormatSet.find(format) == k3DFormatSet.end()) ? kOpFormat_NCDHW : format;
+  format = (def_format == kOpFormat_NCDHW && !IsOneOf3DFormat(format)) ? kOpFormat_NCDHW : format;
 
   (*output_desc)[kJDataType] = tbe::TypeIdToString(AnfAlgo::GetOutputDeviceDataType(anf_node, node_out_idx));
   (*output_desc)[kJDtype] = GetJsonValue<std::string>(*output_desc, kJDataType);
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc
index 5117a30b470..64d9262a133 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/common_utils.cc
@@ -70,12 +70,12 @@ std::vector<int64_t> HostCheck::GetFinalInferShape(const AnfNodePtr &node, const
   }
 
   auto temp_shape = infer_shape;
-  if (kNoPaddingFormatSet.find(format) == kNoPaddingFormatSet.end() && format != kOpFormat_FRACTAL_ZN_LSTM &&
-      infer_shape.size() < kShape4dDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (!IsOneOfNoPaddingFormat(format) && format != kOpFormat_FRACTAL_ZN_LSTM && infer_shape.size() < kShape4dDims &&
+      !IsOneOf3DFormat(format)) {
     MS_LOG(DEBUG) << "Get Device Shape using a shape size is less than 4 ,should be Padding shape by Default firstly";
     temp_shape = trans::PaddingShapeTo4dDefault(infer_shape, node);
   }
-  if (infer_shape.size() != kNcdhwShapeSize && k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (infer_shape.size() != kNcdhwShapeSize && IsOneOf3DFormat(format)) {
     temp_shape = trans::PaddingShapeTo5dDefault(infer_shape, node);
   }
   return temp_shape;
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc
index e75a007705c..ab082e84aca 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc
@@ -41,7 +41,6 @@ constexpr auto kPrefixOutput = "output";
 constexpr char kParamTypeDynamic[] = "dynamic";
 constexpr char kParamTypeRequre[] = "required";
 constexpr char kParamTypeOptional[] = "optional";
-mindspore::HashMap<std::string, std::vector<std::shared_ptr<KernelBuildInfo>>> TbeKernelSelect::select_cache_ = {};
 
 void TbeMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list) {
   auto tbe_selecter = TbeKernelSelect(kernel_node, kernel_info_list);
@@ -236,7 +235,7 @@ void TbeKernelSelect::GetAgnosticPatternKernelInfo(const OpInfo &op_info) {
     MS_LOG(EXCEPTION) << "AgnosticPattern only support one input.";
   }
   auto format = AnfAlgo::GetPrevNodeOutputFormat(cnode_ptr_, 0);
-  if (kOpFormatList.find(format) == kOpFormatList.end()) {
+  if (!IsOneOfFormat(format)) {
     MS_LOG(INFO) << "Got the unknown format " << format;
     format = kOpFormat_DEFAULT;
   }
@@ -332,7 +331,7 @@ bool TbeKernelSelect::IsShapeMatchFormat(const ShapeVector &shape, const std::st
   }
   static const std::set<std::string> kServerNotSupportFormat = {kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04};
   // if format is default, it remarkes support all format
-  if (kOpFormatList.find(format) == kOpFormatList.end()) {
+  if (!IsOneOfFormat(format)) {
     MS_LOG(EXCEPTION) << "Got the unknown format " << format;
   }
   // server not support format with C04 suffix
@@ -346,7 +345,7 @@ bool TbeKernelSelect::IsShapeMatchFormat(const ShapeVector &shape, const std::st
   }
   // not support format:
   // 1 3d formats with shape size > 5
-  if (k3DFormatSet.find(format) != k3DFormatSet.end() && shape.size() > kShape5dDims) {
+  if (IsOneOf3DFormat(format) && shape.size() > kShape5dDims) {
     return false;
   }
   return true;
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h index f8948762d00..2c66bf966a6 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h @@ -80,7 +80,7 @@ class TbeKernelSelect { nlohmann::json kernel_json; std::string kernel_hash_name; bool check_cnode; - static mindspore::HashMap>> select_cache_; + inline static mindspore::HashMap>> select_cache_ = {}; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc index f30892cd97e..4558dd3ba77 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_comm_op_reuse.cc @@ -20,6 +20,7 @@ #include "include/common/utils/anfalgo.h" #include "include/common/utils/comm_manager.h" #include "include/common/utils/parallel_context.h" +#include "runtime/graph_scheduler/graph_compiler.h" #include "plugin/device/ascend/hal/device/ascend_stream_assign.h" #include "plugin/device/ascend/optimizer/ascend_comm_op_reuse.h" diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h index b2d8703ce45..71b1270d275 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ascend_helper.h @@ -29,6 +29,7 @@ #include "plugin/device/ascend/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h" namespace mindspore { +const std::set<TypeId> kFloatDataTypeSet = {kNumberTypeFloat16, kNumberTypeFloat32}; namespace opt { class KernelSelect { public: diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc index 485929e21be..12a8017c47b 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/enhancer/add_attr_for_3d_graph.cc @@ -32,7 +32,7 @@ const AnfNodePtr AddIoFormatAttrFor3DGraph::Process(const FuncGraphPtr &func_gra common::AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); auto formats = AnfAlgo::GetAllOutputFormats(node); if (std::any_of(formats.begin(), formats.end(), - [](const std::string &format) { return k3DFormatSet.find(format) != k3DFormatSet.end(); })) { + [](const std::string &format) { return IsOneOf3DFormat(format); })) { common::AnfAlgo::SetNodeAttr(kAttrFormat, MakeValue(kOpFormat_NCDHW), node); } return node; diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc index fbc23a25bef..31e299cf09c 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/check_consistency.cc @@ -28,6 +28,9 @@ namespace mindspore { namespace opt { namespace { +const std::set<std::string> kDefaultCompatibleFormat = {kOpFormat_ND, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_HWCN, + kOpFormat_NCDHW}; + bool CheckFormatForConsistency(const CNodePtr &node, const size_t input_index) { MS_EXCEPTION_IF_NULL(node); // get prior node's device output format
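The new kDefaultCompatibleFormat constant suggests the rule the consistency check applies: a prior node's output format and the selected input format presumably pass when they are equal, or when both are merely different spellings of the default layout. A hedged reconstruction of that predicate (the body of CheckFormatForConsistency is not shown in the hunk, so this is illustrative only):

#include <set>
#include <string>

// Illustrative sketch; the real check in check_consistency.cc may differ.
bool FormatsConsistent(const std::string &pre_output_format, const std::string &selected_input_format) {
  static const std::set<std::string> kDefaultCompatibleFormat = {"ND", "NCHW", "NHWC", "HWCN", "NCDHW"};
  if (pre_output_format == selected_input_format) {
    return true;
  }
  // Default-like layouts are treated as interchangeable.
  return kDefaultCompatibleFormat.count(pre_output_format) > 0 &&
         kDefaultCompatibleFormat.count(selected_input_format) > 0;
}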
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc index d1dba89d08b..6523b3bfd93 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.cc @@ -15,7 +15,6 @@ */ #include "plugin/device/ascend/optimizer/format_type/rectify_do_mask_kernel_info.h" - #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" #include "kernel/kernel_build_info.h" @@ -67,7 +66,7 @@ void RectifyDoMaskKernelInfo::RectifyKernelInfo(const std::vector<CNodePtr> &do_ std::string convert_format; for (const auto &do_mask : do_mask_node_list) { auto do_mask_data_format = AnfAlgo::GetInputFormat(do_mask, 0); - if (special_format.empty() && kHWSpecialFormatSet.find(do_mask_data_format) != kHWSpecialFormatSet.end()) { + if (special_format.empty() && IsOneOfHWSpecialFormat(do_mask_data_format)) { special_format = do_mask_data_format; } if (format_counter.find(do_mask_data_format) == format_counter.end()) { @@ -99,7 +98,7 @@ std::string RectifyDoMaskKernelInfo::GetConvertFormat(const std::map(AnfAlgo::GetSelectKernelBuildInfo(node)); MS_EXCEPTION_IF_NULL(builder); - if (in_format == kOpFormat_DEFAULT && k3DFormatSet.find(out_format) != k3DFormatSet.end()) { + if (in_format == kOpFormat_DEFAULT && IsOneOf3DFormat(out_format)) { builder->SetInputsFormat({kOpFormat_NCDHW}); builder->SetOutputsFormat({out_format}); AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get()); common::AnfAlgo::SetNodeAttr(kAttrSrcFormat, MakeValue(kOpFormat_NCDHW), node); } - if (out_format == kOpFormat_DEFAULT && k3DFormatSet.find(in_format) != k3DFormatSet.end()) { + if (out_format == kOpFormat_DEFAULT && IsOneOf3DFormat(in_format)) { builder->SetInputsFormat({in_format}); builder->SetOutputsFormat({kOpFormat_NCDHW}); AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get()); diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc index a76f36a826e..56392894f29 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fission/seed_adapter.cc @@ -30,6 +30,8 @@ namespace mindspore::opt { namespace { +const std::set<std::string> kNodeWithSeedOperators = {kGammaOpName, kPoissonOpName, kStandardLaplaceOpName, + kStandardNormalOpName, kUniformIntOpName, kUniformRealOpName}; tensor::TensorPtr CreateTensor(int64_t seed) { // 1 create seed tensor std::vector<int64_t> indices_shape = {1}; diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc index a7fef2bdd67..c0b863b81f7 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.cc @@ -14,10 +14,8 @@ * limitations under the License.
*/ #include "plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_fusion.h" -#include "backend/common/optimizer/helper.h" #include "backend/common/session/anf_runtime_algorithm.h" -#include "include/common/utils/anfalgo.h" -#include "utils/trace_base.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" namespace mindspore { namespace opt { const BaseRef AdamApplyOneFusion::DefinePattern() const { diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc index 59751e935e7..dcb79ee6dc6 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.cc @@ -15,6 +15,7 @@ */ #include "plugin/device/ascend/optimizer/ir_fusion/adam_apply_one_with_decay_rule.h" #include "backend/common/session/anf_runtime_algorithm.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" #include "include/common/utils/anfalgo.h" #include "ir/primitive.h" #include "backend/common/optimizer/helper.h" diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc index ea1be4bc33f..c5eb3df8682 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_rule.cc @@ -21,6 +21,7 @@ #include "include/common/utils/utils.h" #include "backend/common/optimizer/helper.h" #include "mindspore/core/ops/core_ops.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" namespace mindspore { namespace opt { diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc index 2e299e03b19..560a4180f49 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.cc @@ -16,6 +16,7 @@ #include "plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_rule.h" #include #include "backend/common/session/anf_runtime_algorithm.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" #include "include/common/utils/anfalgo.h" #include "frontend/optimizer/opt.h" #include "utils/trace_base.h" diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc index ee435cfabcb..b3db5cd4b3f 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc @@ -19,7 +19,7 @@ #include #include #include - +#include "plugin/device/ascend/optimizer/ascend_helper.h" #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" #include "frontend/optimizer/opt.h" diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc index c5fbec4d5bd..9a7ddfe7b26 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc +++ 
b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.cc @@ -16,6 +16,7 @@ #include "plugin/device/ascend/optimizer/ir_fusion/lamb_next_right_rule.h" #include #include "backend/common/optimizer/helper.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" #include "utils/trace_base.h" namespace mindspore { namespace opt { diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc index ecf0c70661e..1373171cadc 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_rule_fusion.cc @@ -17,7 +17,7 @@ #include #include - +#include "plugin/device/ascend/optimizer/ascend_helper.h" #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" #include "ir/primitive.h" diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc index acf8a26196d..93eecbc536d 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/lamb_update_with_lr_v2.cc @@ -19,6 +19,7 @@ #include #include "include/common/utils/utils.h" #include "mindspore/core/ops/core_ops.h" +#include "plugin/device/ascend/optimizer/ascend_helper.h" namespace mindspore { namespace opt { diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc b/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc index 3d44519feec..643928b47d7 100644 --- a/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc +++ b/mindspore/ccsrc/plugin/device/cpu/hal/device/kernel_select_cpu.cc @@ -439,7 +439,7 @@ std::pair SetKernelInfoWithMsg(const CNodePtr &kerne const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel_node); if (IsPrimitiveCNode(kernel_node, prim::kPrimCustom)) { auto tp = common::AnfAlgo::GetNodeAttr(kernel_node, kAttrFuncType); - if (kCustomTypeAkg.find(tp) != kCustomTypeAkg.end()) { + if (IsOneOfCustomAkgType(tp)) { UpdateCustomKernelBuildInfo(kernel_node, true); return {}; } diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc index 60ef76c475f..6e4f46115d9 100644 --- a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc +++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc @@ -345,8 +345,7 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector(kernel_mod); MS_EXCEPTION_IF_NULL(cpu_kernel_mod); cpu_kernel_mod->InitKernel(kernel); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc index b2b23fc04a3..264ef7d76d5 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.cc @@ -89,8 +89,6 @@ std::vector NativeCpuKernelMod::GetSupportFromOpLib(const std::strin return support_kernel_attrs; } -mindspore::HashMap> NativeCpuKernelMod::support_map_{}; - int DeprecatedNativeCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, const std::vector &outputs, diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h index b2c3e1e79e6..5a248e62f12 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h @@ -169,7 +169,7 @@ class BACKEND_EXPORT NativeCpuKernelMod : public CpuKernelMod { private: std::vector GetAllSupportedList(const std::string &kernel_name); std::vector GetSupportFromOpLib(const std::string &kernel_name) const; - static mindspore::HashMap> support_map_; + inline static mindspore::HashMap> support_map_; }; class BACKEND_EXPORT DeprecatedNativeCpuKernelMod : public NativeCpuKernelMod { diff --git a/mindspore/ccsrc/plugin/device/gpu/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/CMakeLists.txt new file mode 100644 index 00000000000..258e6e624e6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/CMakeLists.txt @@ -0,0 +1,62 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_BINARY_DIR}) +include_directories(${CUDNN_INCLUDE_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS}) +add_subdirectory(kernel/cuda_impl) + +########### mindspore_gpu.so ##### +set(GPU_SUB_COMP + hal/device + hal/hardware + hal/profiler + kernel + optimizer + ) + +if(ENABLE_GPU) + foreach(g_comp ${GPU_SUB_COMP}) + add_subdirectory(${g_comp}) + string(REPLACE "/" "_" sub ${g_comp}) + if(TARGET _mindspore_plugin_device_gpu_${sub}_obj) + list(APPEND GPU_SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_plugin_device_gpu_${sub}_obj>) + add_dependencies(_mindspore_plugin_device_gpu_${sub}_obj proto_input) + endif() + endforeach() +endif() + +add_library(mindspore_gpu SHARED ${GPU_SUB_OBJECTS_SRC}) +target_link_libraries(mindspore_gpu PUBLIC mindspore_backend_common) +target_link_libraries(mindspore_gpu PRIVATE mindspore_core mindspore_common proto_input mindspore::protobuf) +target_link_libraries(mindspore_gpu PRIVATE securec) +set_target_properties(mindspore_gpu PROPERTIES INSTALL_RPATH $ORIGIN) +target_link_libraries(mindspore_gpu PRIVATE mindspore::dnnl mindspore::mkldnn nnacl) +target_link_libraries(mindspore_gpu PRIVATE mindspore::ssl mindspore::crypto) +target_link_libraries(mindspore_gpu PRIVATE mindspore::event mindspore::event_pthreads + mindspore::event_openssl -Wl,--no-as-needed mindspore::event_core ps_cache) + +if(ENABLE_GPU) + message("add gpu lib to mindspore_gpu") + target_link_libraries(mindspore_gpu PRIVATE cuda_ops + ${CUBLAS_LIBRARY_PATH} + ${CUDA_PATH}/lib64/libcurand.so + ${CUDNN_LIBRARY_PATH} + ${CUDA_PATH}/lib64/libcudart.so + ${CUDA_PATH}/lib64/stubs/libcuda.so + ${CUDA_PATH}/lib64/libcusolver.so + ${CUDA_PATH}/lib64/libcufft.so + ${CUDA_PATH}/lib64/libcusparse.so) +endif() + +if(ENABLE_DEBUGGER) + # debugger: link grpc + if(ENABLE_GPU) + target_link_libraries(mindspore_gpu PRIVATE -Wl,--no-as-needed mindspore::grpc++) + endif() +endif() + +if(ENABLE_GPU) + if(ENABLE_MPI) + set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH}) + set_target_properties(nvidia_collective PROPERTIES INSTALL_RPATH ${ORIGIN_PATH}) + set_target_properties(gpu_collective PROPERTIES INSTALL_RPATH ${ORIGIN_PATH}) + endif() +endif() diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt index 014bdf67d06..6889c852764 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt @@ -3,11 +3,29 @@ if("${ENABLE_HIDDEN}" STREQUAL "OFF") string(REPLACE " -fvisibility=hidden" " 
-fvisibility=default" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() +list(APPEND DEVICE_SRC_LIST "ps/gpu_ps_cache.cc") if(ENABLE_GPU) - list(APPEND DEVICE_SRC_LIST "distribution/collective_init.cc") - list(APPEND DEVICE_SRC_LIST "gpu_comm_manager.cc") -else() - list(APPEND DEVICE_SRC_LIST "distribution/collective_fake_init.cc") + list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/common/mem_reuse/mem_reuse.cc) + list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/common/mem_reuse/mem_swap_manager.cc) + list(APPEND DEVICE_SRC_LIST ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/data_queue/data_queue.h) + file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") + list(REMOVE_ITEM DEVICE_SRC_LIST + "mpi/mpi_initializer.cc" + "distribution/collective_wrapper.cc" + "distribution/mpi_wrapper.cc" + "distribution/nccl_wrapper.cc" + "trt_loader.cc") + if(NOT ${TENSORRT_HOME} STREQUAL "") + find_path(TENSORRT_HOME_INCLUDE NvInfer.h HINTS ${TENSORRT_HOME}/include) + if(TENSORRT_HOME_INCLUDE STREQUAL TENSORRT_HOME_INCLUDE-NOTFOUND) + message(FATAL_ERROR "Tensor-RT dir not exist ${TENSORRT_HOME}") + endif() + message("Enable GPU inference. Tensor-RT include dir: ${TENSORRT_HOME_INCLUDE}") + set(ENABLE_GPU_INFER TRUE) + add_compile_definitions(ENABLE_GPU_INFER) + include_directories(${TENSORRT_HOME_INCLUDE}) + list(APPEND DEVICE_SRC_LIST ${CMAKE_CURRENT_SOURCE_DIR}/trt_loader.cc) + endif() endif() if(ENABLE_GPU) @@ -19,8 +37,6 @@ if(ENABLE_GPU) endif() file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.cu") - - #set(GPU_QUEUE_SRCS "blocking_queue.cc" "gpu_buffer_mgr.cc" "data_queue.cc") set(GPU_COLLECTIVE_SRCS "distribution/collective_wrapper.cc" "distribution/mpi_wrapper.cc" "distribution/nccl_wrapper.cc") @@ -36,8 +52,6 @@ if(ENABLE_GPU) target_link_libraries(gpu_collective PRIVATE mindspore::ompi mindspore::nccl) target_link_libraries(_ms_mpi PRIVATE gpu_collective) endif() - - # add_library(_mindspore_device_cuda_obj OBJECT ${CUDA_SRC_LIST}) endif() set_property(SOURCE ${DEVICE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h index e8ec407f4d9..a673740f42d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_data_queue.h @@ -24,10 +24,11 @@ #include #include "runtime/data_queue/data_queue.h" #include "runtime/hardware/device_context_manager.h" +#include "include/backend/visible.h" namespace mindspore { namespace device { -class GpuDataQueueDynamic : public DataQueue { +class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue { public: explicit GpuDataQueueDynamic(const size_t capacity); virtual ~GpuDataQueueDynamic() = default; @@ -49,7 +50,7 @@ class GpuDataQueueDynamic : public DataQueue { std::unique_ptr node_info_; }; -class GpuQueue : public DataQueue { +class BACKEND_EXPORT GpuQueue : public DataQueue { public: GpuQueue(void *addr, const std::vector &shape, const size_t &capacity); virtual ~GpuQueue(); diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc index fb2f03dc56b..a47b1761060 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc @@ -193,7 +193,7 @@ bool 
SelectCustomKernel(const CNodePtr &kernel_node, const std::shared_ptr::Instance().Register( op_name, []() { return std::make_shared(); }); } - } else if (kCustomTypeAkg.find(func_type) != kCustomTypeAkg.end()) { + } else if (IsOneOfCustomAkgType(func_type)) { *kernel_type = KernelType::AKG_KERNEL; } else { MS_LOG(EXCEPTION) << "Unsupported func type [" << func_type << "] for Custom op [" << op_name << "] on GPU"; diff --git a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.cc similarity index 98% rename from mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc rename to mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.cc index cd4632e1c9c..0a7853bffd3 100644 --- a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "ps/ps_cache/gpu/gpu_ps_cache.h" +#include "plugin/device/gpu/hal/device/ps/gpu_ps_cache.h" #include "ps/ps_cache/ps_cache_factory.h" #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh" #include "plugin/device/gpu/hal/device/gpu_common.h" diff --git a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.h b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.h similarity index 89% rename from mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.h rename to mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.h index 6ef18becada..b3442c2c9db 100644 --- a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.h +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/ps/gpu_ps_cache.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_ -#define MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_ #include #include @@ -47,4 +47,4 @@ class GPUPsCache : public PsCacheBasic { } // namespace gpu } // namespace ps } // namespace mindspore -#endif // MINDSPORE_CCSRC_PS_PS_CACHE_GPU_GPU_PS_CACHE_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_DEVICE_PS_GPU_PS_CACHE_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc index dc60c74c488..4d3fe57e347 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_session.cc @@ -659,8 +659,7 @@ KernelGraphPtr GPUSession::BuildOpImpl(const BackendOpRunInfoPtr &op_run_info, c const std::vector &tensors_mask) { // Check if the graph cache exists. 
auto it = run_op_graphs_.find(graph_info); - if (it != run_op_graphs_.end() && - kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) == kOpCacheBlackList.end()) { + if (it != run_op_graphs_.end() && !IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name)) { return it->second; } @@ -718,7 +717,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, const BackendOpRunInfoPt UpdateOutputAbstract(kernel_graph, op_run_info); } RunOpClearMemory(kernel_graph.get()); - if (kOpCacheBlackList.find(op_run_info->base_op_run_info.op_name) != kOpCacheBlackList.end()) { + if (IsOneOfCacheBlackList(op_run_info->base_op_run_info.op_name)) { run_op_graphs_.erase(graph_info); } } diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt index e95c5ed0716..4f2f6f821fa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/CMakeLists.txt @@ -3,8 +3,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin") endif() if(ENABLE_GPU) - file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cu") - file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") file(GLOB_RECURSE _AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "akg/*.cc") list(REMOVE_ITEM GPU_SRC_LIST ${_AKG_SRC_LIST}) diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/CMakeLists.txt similarity index 84% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/CMakeLists.txt index 90edfd081ab..386b2acd466 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/CMakeLists.txt @@ -12,11 +12,12 @@ endif() set_property(SOURCE ${CUDA_OPS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL) if(ENABLE_GPU) - add_library(cuda_common_obj OBJECT cuda_common.cc cuda_device_info.cc) + add_library(cuda_common_obj OBJECT cuda_ops/cuda_common.cc cuda_ops/cuda_device_info.cc) target_compile_options(cuda_common_obj PRIVATE "-std=c++17") cuda_add_library(cuda_ops SHARED ${CUDA_OPS_SRC_LIST} $<TARGET_OBJECTS:cuda_common_obj>) message("add gpu lib to cuda_ops") - target_link_libraries(cuda_ops mindspore_core cublas + target_link_libraries(cuda_ops mindspore_core + ${CUBLAS_LIBRARY_PATH} ${CUDA_PATH}/lib64/libcurand.so ${CUDNN_LIBRARY_PATH} ${CUDA_PATH}/lib64/libcudart.so
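From here on, the .cu and .cuh hunks stamp CUDA_LIB_EXPORT on the host-side launcher functions so their symbols stay visible once cuda_ops is built as a standalone shared library, typically compiled with hidden default visibility. The macro definition itself is not in this patch; presumably it lives in cuda_device_info.h, which these hunks start including. A conventional shape for such a macro, offered as an assumption rather than the actual definition:

#include <cuda_runtime.h>

// Assumed shape of the export macro; not taken from this patch.
#if defined(_WIN32)
#define CUDA_LIB_EXPORT __declspec(dllexport)
#else
#define CUDA_LIB_EXPORT __attribute__((visibility("default")))
#endif

// Usage mirrors the hunks below: the host-callable launcher declaration is
// exported, while the __global__ kernels behind it stay internal to the library.
template <typename T>
CUDA_LIB_EXPORT void CalSomethingHypothetical(const T *input, T *output, cudaStream_t stream);

Note the companion build change above: cuda_common_obj is now built from cuda_ops/cuda_common.cc and cuda_ops/cuda_device_info.cc, matching the directory rename.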
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu index e5f19aaecbb..354c35e030e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/lp_norm_impl.cu @@ -69,10 +69,11 @@ __global__ void NormCalHighPrecisionKernel(const float *middle_output, T *output} } template <> -void CalLpNorm(const float *input, const size_t *input_shape, size_t input_shape_length, size_t input_elements, - const size_t *output_axis, const size_t *output_stride, size_t output_shape_length, - size_t output_elements, float p, float eps, float *middle_output, float *output, - const uint32_t &device_id, cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void CalLpNorm(const float *input, const size_t *input_shape, size_t input_shape_length, + size_t input_elements, const size_t *output_axis, const size_t *output_stride, + size_t output_shape_length, size_t output_elements, float p, float eps, + float *middle_output, float *output, const uint32_t &device_id, + cudaStream_t cuda_stream) { LpCalKernel<<>>( input, input_shape, input_shape_length, input_elements, output_axis, output_stride, output_shape_length, p, eps, output); @@ -81,10 +82,11 @@ void CalLpNorm(const float *input, const size_t *input_shape, size_t inpu } template <> -void CalLpNorm(const half *input, const size_t *input_shape, size_t input_shape_length, size_t input_elements, - const size_t *output_axis, const size_t *output_stride, size_t output_shape_length, - size_t output_elements, float p, float eps, float *middle_output, half *output, - const uint32_t &device_id, cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void CalLpNorm(const half *input, const size_t *input_shape, size_t input_shape_length, + size_t input_elements, const size_t *output_axis, const size_t *output_stride, + size_t output_shape_length, size_t output_elements, float p, float eps, + float *middle_output, half *output, const uint32_t &device_id, + cudaStream_t cuda_stream) { LpCalKernel<<>>( input, input_shape, input_shape_length, input_elements, output_axis, output_stride, output_shape_length, p, eps, middle_output); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu index 55ea8d59db6..e273cae4ce5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_grad_impl.cu @@ -133,9 +133,9 @@ __global__ void MultiMarginLoss_backward_kernel_half(half *gradInput, const half // namespace str template <typename T> -void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *output_grad, - const T *input, const int64_t *target, const T *weight, T *output, const uint32_t &device_id, - cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, + const T *output_grad, const T *input, const int64_t *target, const T *weight, + T *output, const uint32_t &device_id, cudaStream_t cuda_stream) { dim3 blocks1(nframe); dim3 threads1(MULTIMARGIN_THREADS); bool reduce = false; @@ -158,9 +158,10 @@ void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, // namespace str template <> -void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *output_grad, - const half *input, const int64_t *target, const half *weight, half *output, - const uint32_t &device_id, cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void MultiMarginLossGrad(int64_t p, float margin, int64_t reduction, int nframe, int dim, + const half *output_grad, const half *input, const int64_t *target, + const half *weight, half *output, const uint32_t &device_id, + cudaStream_t cuda_stream) { dim3 blocks1(nframe); dim3 threads1(MULTIMARGIN_THREADS); bool reduce = false; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu index aae43eddd8e..d5c86daaeb1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multi_margin_loss_impl.cu @@ 
-286,9 +286,9 @@ __global__ void MultiMarginLossReduceKernel(int dim, T *output) { // namespace str template -void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *input, - const int64_t *target, const T *weight, T *output, const uint32_t &device_id, - cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const T *input, + const int64_t *target, const T *weight, T *output, const uint32_t &device_id, + cudaStream_t cuda_stream) { dim3 blocks(nframe); dim3 threads(MULTIMARGIN_THREADS); bool sizeAverage = false; @@ -311,9 +311,9 @@ void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int // namespace str template <> -void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *input, - const int64_t *target, const half *weight, half *output, const uint32_t &device_id, - cudaStream_t cuda_stream) { +CUDA_LIB_EXPORT void MultiMarginLoss(int64_t p, float margin, int64_t reduction, int nframe, int dim, const half *input, + const int64_t *target, const half *weight, half *output, const uint32_t &device_id, + cudaStream_t cuda_stream) { dim3 blocks(nframe); dim3 threads(128); bool sizeAverage = false; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu index e5acf51726f..932c01ec278 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/renorm_impl.cu @@ -58,8 +58,7 @@ __global__ void CalNormValFun1(const Complex *input, size_t input_elemen } __global__ void CalNormValFun2(float *norm_value, int p, size_t axis_size, float max_norm) { - for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (axis_size); - index += blockDim.x * gridDim.x) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (axis_size); index += blockDim.x * gridDim.x) { float temp = pow(norm_value[index], static_cast(1.0 / p)); if (temp > max_norm) { norm_value[index] = max_norm / temp; @@ -84,61 +83,61 @@ __global__ void CalNormValFun3(const T *input, size_t inner_size, size_t axis_si } template <> -void CalRenorm(const half *input, size_t input_elements, size_t inner_size, size_t axis_size, int p, - float *norm_value, half *output, const uint32_t &device_id, cudaStream_t cuda_stream, - float max_norm) { +CUDA_LIB_EXPORT void CalRenorm(const half *input, size_t input_elements, size_t inner_size, size_t axis_size, + int p, float *norm_value, half *output, const uint32_t &device_id, + cudaStream_t cuda_stream, float max_norm) { CalNormValFun1<<>>( input, input_elements, inner_size, axis_size, p, norm_value); - CalNormValFun2<<>>( - norm_value, p, axis_size, max_norm); + CalNormValFun2<<>>(norm_value, p, + axis_size, max_norm); CalNormValFun3<<>>( input, inner_size, axis_size, input_elements, output, norm_value); } template <> -void CalRenorm(const float *input, size_t input_elements, size_t inner_size, size_t axis_size, int p, - float *norm_value, float *output, const uint32_t &device_id, cudaStream_t cuda_stream, - float max_norm) { +CUDA_LIB_EXPORT void CalRenorm(const float *input, size_t input_elements, size_t inner_size, size_t axis_size, + int p, float *norm_value, float *output, const uint32_t &device_id, + cudaStream_t cuda_stream, float max_norm) { CalNormValFun1<<>>( input, input_elements, inner_size, 
axis_size, p, norm_value); - CalNormValFun2<<>>( - norm_value, p, axis_size, max_norm); + CalNormValFun2<<>>(norm_value, p, + axis_size, max_norm); CalNormValFun3<<>>( input, inner_size, axis_size, input_elements, output, norm_value); } template <> -void CalRenorm(const double *input, size_t input_elements, size_t inner_size, size_t axis_size, int p, - float *norm_value, double *output, const uint32_t &device_id, cudaStream_t cuda_stream, - float max_norm) { +CUDA_LIB_EXPORT void CalRenorm(const double *input, size_t input_elements, size_t inner_size, size_t axis_size, + int p, float *norm_value, double *output, const uint32_t &device_id, + cudaStream_t cuda_stream, float max_norm) { CalNormValFun1<<>>( input, input_elements, inner_size, axis_size, p, norm_value); - CalNormValFun2<<>>( - norm_value, p, axis_size, max_norm); + CalNormValFun2<<>>(norm_value, p, + axis_size, max_norm); CalNormValFun3<<>>( input, inner_size, axis_size, input_elements, output, norm_value); } template <> -void CalRenorm>(const Complex *input, size_t input_elements, size_t inner_size, size_t axis_size, - int p, float *norm_value, Complex *output, const uint32_t &device_id, - cudaStream_t cuda_stream, float max_norm) { +CUDA_LIB_EXPORT void CalRenorm>(const Complex *input, size_t input_elements, size_t inner_size, + size_t axis_size, int p, float *norm_value, Complex *output, + const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) { CalNormValFun1<<>>( input, input_elements, inner_size, axis_size, p, norm_value); - CalNormValFun2<<>>( - norm_value, p, axis_size, max_norm); + CalNormValFun2<<>>(norm_value, p, + axis_size, max_norm); CalNormValFun3<<>>( input, inner_size, axis_size, input_elements, output, norm_value); } template <> -void CalRenorm>(const Complex *input, size_t input_elements, size_t inner_size, - size_t axis_size, int p, float *norm_value, Complex *output, - const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) { +CUDA_LIB_EXPORT void CalRenorm>(const Complex *input, size_t input_elements, size_t inner_size, + size_t axis_size, int p, float *norm_value, Complex *output, + const uint32_t &device_id, cudaStream_t cuda_stream, float max_norm) { CalNormValFun1<<>>( input, input_elements, inner_size, axis_size, p, norm_value); - CalNormValFun2<<>>( - norm_value, p, axis_size, max_norm); + CalNormValFun2<<>>(norm_value, p, + axis_size, max_norm); CalNormValFun3<<>>( input, inner_size, axis_size, input_elements, output, norm_value); } diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu index 50694ecade4..c872e183395 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cu @@ -28,16 +28,16 @@ __global__ void ScaleGrad(const int nums, const T *x0, const S &x1, T *y) { } template -void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream) { +CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream) { ScaleGrad<<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y); return; } -template void ScaleGradKernel(const int &nums, const float *x0, const float &x1, float *y, - cudaStream_t stream); -template void ScaleGradKernel(const int &nums, const float *x0, const half &x1, float *y, - cudaStream_t stream); -template void ScaleGradKernel(const int &nums, const half 
*x0, const float &x1, half *y, - cudaStream_t stream); -template void ScaleGradKernel(const int &nums, const half *x0, const half &x1, half *y, - cudaStream_t stream); +template CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const float *x0, const float &x1, float *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const float *x0, const half &x1, float *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const half *x0, const float &x1, half *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void ScaleGradKernel(const int &nums, const half *x0, const half &x1, half *y, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh index 172337c3566..8f9151dc803 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scale_grad_impl.cuh @@ -18,6 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_SCALE_GRAD_IMPL_H_ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" template <typename T, typename S> void ScaleGradKernel(const int &nums, const T *x0, const S &x1, T *y, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
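The scale_grad hunks above show the subtler half of the visibility change: for templates, it is the explicit instantiations, not just the primary declaration, that carry CUDA_LIB_EXPORT, because those are the concrete symbols other libraries link against. A self-contained sketch of the pattern with a hypothetical axpy launcher (not from the patch); the (n + 255) / 256 launch shape copies the ScaleGrad hunk:

#include <cuda_runtime.h>

#ifndef CUDA_LIB_EXPORT
#define CUDA_LIB_EXPORT __attribute__((visibility("default")))  // assumption, as sketched earlier
#endif

template <typename T>
__global__ void AxpyKernel(int n, T a, const T *x, T *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = a * x[i] + y[i];
  }
}

// Host-side launcher: the symbol callers outside the shared object actually use.
template <typename T>
CUDA_LIB_EXPORT void Axpy(int n, T a, const T *x, T *y, cudaStream_t stream) {
  AxpyKernel<<<(n + 255) / 256, 256, 0, stream>>>(n, a, x, y);
}

// Explicit instantiations repeat CUDA_LIB_EXPORT so the concrete symbols
// keep default visibility even when the library is built with -fvisibility=hidden.
template CUDA_LIB_EXPORT void Axpy<float>(int n, float a, const float *x, float *y, cudaStream_t stream);
template CUDA_LIB_EXPORT void Axpy<double>(int n, double a, const double *x, double *y, cudaStream_t stream);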
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu index 88e621dfdb5..117fdbdf050 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cu @@ -39,9 +39,9 @@ __global__ void DiscountedReturnKernel(const int timestep, const int num_env, co } template <typename T> -void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma, - const T *reward, const bool *done, const T *last_value, T *discouted_return, - cudaStream_t stream) { +CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, + const float &gamma, const T *reward, const bool *done, const T *last_value, + T *discouted_return, cudaStream_t stream) { // Every block process M element, 256 is a common tile size. const int element_per_step = num_env * num_element; const int element_per_block = std::min(256, element_per_step); @@ -51,9 +51,9 @@ void DiscountedReturn(const int &timestep, const int &num_env, const int &num_el done, last_value, discouted_return); } -template void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma, - const float *reward, const bool *done, const float *last_value, float *discouted_return, - cudaStream_t stream); -template void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma, - const half *reward, const bool *done, const half *last_value, half *discouted_return, - cudaStream_t stream); +template CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, + const float &gamma, const float *reward, const bool *done, + const float *last_value, float *discouted_return, cudaStream_t stream); +template CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, + const float &gamma, const half *reward, const bool *done, + const half *last_value, half *discouted_return, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh index 2e69f5d12b8..eab53e03634 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/discounted_return_impl.cuh @@ -16,8 +16,9 @@ #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_ - +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" template <typename T> -void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, const float &gamma, - const T *reward, const bool *done, const T *last_value, T *discouted_return, cudaStream_t stream); +CUDA_LIB_EXPORT void DiscountedReturn(const int &timestep, const int &num_env, const int &num_element, + const float &gamma, const T *reward, const bool *done, const T *last_value, + T *discouted_return, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_DISCONTED_RETURN_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh index 6c6be21ad44..c8b32ef88a7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/priority_replay_buffer.cuh @@ -18,21 +18,22 @@ #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMP_PRIORITY_REPLAY_BUFFER_IMPL_H_ #include - struct SumTree { float sum; float min; }; -void SumTreeInit(SumTree *tree, float *max_priority, const size_t &capacity, cudaStream_t stream); -void InitRandState(const size_t &batch_size, const uint64_t &seed, curandState *state, cudaStream_t stream); -void SumTreePush(SumTree *tree, const float &alpha, const size_t &idx, const size_t &capacity, float *priority, - float *max_priority, cudaStream_t stream); -void SumTreeSample(SumTree *tree, curandState *state, const size_t &capacity, float *beta, const size_t &batch_size, - size_t *indices, float *weights, cudaStream_t stream); -void SumTreeUpdate(SumTree *tree, const size_t &capacity, const float 
&alpha, float *max_priority, size_t *indices, - float *priorities, const size_t &batch_size, cudaStream_t stream); -void FifoSlice(const uint8_t *input, const size_t *indice, uint8_t *output, size_t batch_size, size_t column, - cudaStream_t stream); +CUDA_LIB_EXPORT void SumTreeInit(SumTree *tree, float *max_priority, const size_t &capacity, cudaStream_t stream); +CUDA_LIB_EXPORT void InitRandState(const size_t &batch_size, const uint64_t &seed, curandState *state, + cudaStream_t stream); +CUDA_LIB_EXPORT void SumTreePush(SumTree *tree, const float &alpha, const size_t &idx, const size_t &capacity, + float *priority, float *max_priority, cudaStream_t stream); +CUDA_LIB_EXPORT void SumTreeSample(SumTree *tree, curandState *state, const size_t &capacity, float *beta, + const size_t &batch_size, size_t *indices, float *weights, cudaStream_t stream); +CUDA_LIB_EXPORT void SumTreeUpdate(SumTree *tree, const size_t &capacity, const float &alpha, float *max_priority, + size_t *indices, float *priorities, const size_t &batch_size, cudaStream_t stream); +CUDA_LIB_EXPORT void FifoSlice(const uint8_t *input, const size_t *indice, uint8_t *output, size_t batch_size, + size_t column, cudaStream_t stream); #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMP_PRIORITY_REPLAY_BUFFER_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh index 9ffcec4e663..1f719bfe120 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh @@ -18,21 +18,24 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch, - unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream); -void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index, - cudaStream_t cuda_stream); -void ReMappingIndex(const int *count, const int *head, const int *origin_index, int *index, cudaStream_t cuda_stream); -void BufferGetItem(const size_t size, const int *index, const size_t one_exp_len, const unsigned char *buffer, - unsigned char *out, cudaStream_t cuda_stream); -void CheckBatchSize(const int *count, const int *head, const size_t batch_size, const int64_t capacity, - cudaStream_t cuda_stream); -void BufferSample(const size_t size, const size_t one_element, const unsigned int *index, const unsigned char *buffer, - unsigned char *out, cudaStream_t cuda_stream); -void RandomGen(const int size, curandState *globalState, unsigned int *value, unsigned int *key, cudaStream_t stream); -void RandInit(const int size, const int seed, curandState *state, cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" +CUDA_LIB_EXPORT void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch, + unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index, + cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void ReMappingIndex(const int *count, const int *head, const int *origin_index, int *index, + cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void BufferGetItem(const size_t size, const int *index, const 
size_t one_exp_len, + const unsigned char *buffer, unsigned char *out, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CheckBatchSize(const int *count, const int *head, const size_t batch_size, const int64_t capacity, + cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void BufferSample(const size_t size, const size_t one_element, const unsigned int *index, + const unsigned char *buffer, unsigned char *out, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void RandomGen(const int size, curandState *globalState, unsigned int *value, unsigned int *key, + cudaStream_t stream); +CUDA_LIB_EXPORT void RandInit(const int size, const int seed, curandState *state, cudaStream_t stream); template -void RandomGenUniform(const int size, curandState *globalState, const int up_bound, T *indexes, - cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void RandomGenUniform(const int size, curandState *globalState, const int up_bound, T *indexes, + cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh index e4b61b62184..bcd1c9b0ec4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/tag_env_impl.cuh @@ -18,6 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_ #include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" constexpr int kFeatureNum = 4; constexpr int kPartiallyObsFeatureNum = 6; @@ -46,14 +47,16 @@ struct AgentState { int *time_step; }; -void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state, - cudaStream_t stream); -void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, float *state, - cudaStream_t stream); -void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, - const int *action, float *state, float *reward, bool *done, cudaStream_t stream); -void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, - const int *action, float *state, float *reward, bool *done, float *team_reward, int *distance, - cudaStream_t stream); -void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src, cudaStream_t stream); +CUDA_LIB_EXPORT void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state, + cudaStream_t stream); +CUDA_LIB_EXPORT void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting, + AgentState *agent_state, float *state, cudaStream_t stream); +CUDA_LIB_EXPORT void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting, + AgentState *agent_state, const int *action, float *state, float *reward, bool *done, + cudaStream_t stream); +CUDA_LIB_EXPORT void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting, + AgentState *agent_state, const int *action, float *state, float *reward, bool *done, + float *team_reward, int *distance, cudaStream_t stream); +CUDA_LIB_EXPORT void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh index f09f72de66a..538652a1bc2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, - const float *angle_theta0, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, + const float *angle_theta0, float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh index be75db9a29c..b1eeb53a72b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh @@ -19,7 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, - const int *atom_c, const float *angle_k, const float *angle_theta0, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, + const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, + float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh index 24276a23860..15314403924 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, - const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, float *frc_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, + const float *angle_theta0, float *frc_f, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh index c8f647fc1e8..ac2445b52dc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, - const float *angle_theta0, float *frc_f, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, + const int *atom_c, const float *angle_k, const float *angle_theta0, + float *frc_f, float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh index 832eea77397..96f97609e87 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, - const int *atom_b, const float *bond_k, const float *bond_r0, float *atom_ene, cudaStream_t stream); +CUDA_LIB_EXPORT void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, + float *atom_ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_ATOM_ENERGY_GPU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh index fbba36e5d38..cd728d86d4c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh @@ -19,9 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, - float *bond_ene, cudaStream_t stream); +CUDA_LIB_EXPORT void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, + const float *bond_r0, float *bond_ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_ENERGY_CUDA_GPU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh index e401f9deba4..5e9899a8cb9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, - float *frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, + const float *bond_r0, float *frc_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BOND_FORCE_CUDA_GPU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh index 98319adde34..fa5f0408729 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh @@ -19,9 +19,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, - const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, - const float *bond_r0, float *frc_f, float *atom_energy, float *atom_v, - cudaStream_t stream); +CUDA_LIB_EXPORT void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, + const unsigned int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const float *bond_k, + const float *bond_r0, float *frc_f, float *atom_energy, + float *atom_v, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh index 4b11524afc9..4841de06336 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, - float *frc_f, float *atom_e, cudaStream_t stream); +CUDA_LIB_EXPORT void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, + const float *bond_k, const float *bond_r0, float *frc_f, float *atom_e, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh index 9757c081ea1..0ebe0dbb4a6 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, - float *frc_f, float *atom_v, cudaStream_t stream); +CUDA_LIB_EXPORT void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, + const float *bond_k, const float *bond_r0, float *frc_f, float *atom_v, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh index 0c235d4fd9d..85338ade301 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh @@ -18,9 +18,10 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f, - const float *old_crd, float *nowarp_crd, int *box_map_times, float *box, float *g_radial, - float *g_angular, cudaStream_t stream); +CUDA_LIB_EXPORT void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f, + const float *old_crd, float *nowarp_crd, int *box_map_times, float *box, + float *g_radial, float *g_angular, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh index 5ab563f8e1c..99f13cdeac7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, - unsigned int *uint_crd_f, cudaStream_t stream); +CUDA_LIB_EXPORT void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, + unsigned int *uint_crd_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh index 18b531fc58d..e2360b42fc8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, - unsigned int *uint_crd_f, cudaStream_t stream); +CUDA_LIB_EXPORT void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, + unsigned int *uint_crd_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_QUARTER_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh index e3fcfb3bcce..75bd130e5ac 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass, - float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream); +CUDA_LIB_EXPORT void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass, + float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh index 6355344e09f..df366b2fede 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms, - const float *crd_f, float *center_of_geometry_f, cudaStream_t stream); +CUDA_LIB_EXPORT void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, + const int *center_atoms, const float *crd_f, float *center_of_geometry_f, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh index c60c9a4df4c..151ba37eb15 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f, - float *box_length_f, float *no_wrap_crd_f, float *crd_f, float* scaler, cudaStream_t 
stream); +CUDA_LIB_EXPORT void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f, + float *box_length_f, float *no_wrap_crd_f, float *crd_f, float *scaler, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh index 1a41f87f92e..b2b08648f3f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh @@ -19,7 +19,8 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f, - const float *atom_mass, float *ek, cudaStream_t stream); +CUDA_LIB_EXPORT void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f, + const float *atom_mass, float *ek, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh index 1f2bdb4251e..76d582b7ae5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh @@ -18,7 +18,9 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor, cudaStream_t stream); +CUDA_LIB_EXPORT void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh index 27116d8aa26..3e925124f70 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh @@ -20,8 +20,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f, float *nowrap_crd_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f, + float *nowrap_crd_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_CAL_NO_WRAP_CRD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh index 3218bd5fba5..cbd8402bf35 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh @@ -20,8 +20,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f, - int *box_map_times_f, cudaStream_t stream); +CUDA_LIB_EXPORT void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f, + int *box_map_times_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_REFRESH_BOXMAPTIMES_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh index 56d1baba7b9..adc6b16ad9e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh @@ -19,9 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, - const float *pk, const float *gamc, const float *gams, const float *pn, float *ene, - cudaStream_t stream); +CUDA_LIB_EXPORT void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, + const int *atom_d, const int *ipn, const float *pk, const float *gamc, + const float *gams, const float *pn, float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh index eb8dc46e75c..f8fcb5dfe72 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, - const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, const float *pk, - const float *gamc, const float *gams, const float *pn, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, + const int *ipn, const float *pk, const float *gamc, const float *gams, + const float *pn, float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh index 5804e3b825b..33038fac47a 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh @@ -19,9 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void DihedralForce(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, - const float *pk, const float *gamc, const float *gams, const float *pn, float *frc_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void DihedralForce(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, + const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, + const int *ipn, const float *pk, const float *gamc, const float *gams, + const float *pn, float *frc_f, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh index dc9ca6ea818..82dc3e26bb6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh @@ -19,9 +19,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void DihedralForceWithAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, - const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, - const int *ipn, const float *pk, const float *gamc, const float *gams, const float *pn, - float *frc_f, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void DihedralForceWithAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, + const float *scaler_f, const int *atom_a, const int *atom_b, + const int *atom_c, const int *atom_d, const int *ipn, const float *pk, + const float *gamc, const float *gams, const float *pn, float *frc_f, + float *ene, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh index b51a52ef1a4..5ad9f2f4609 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh @@ -23,8 +23,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy( +CUDA_LIB_EXPORT void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy( const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_lj_virial, float 
*atom_energy, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh index 25045a5ebed..a97a462aad8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh @@ -19,9 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype, - const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, - int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *d_LJ_energy_atom, - cudaStream_t stream); +CUDA_LIB_EXPORT void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, + const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, + int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A, + const float *d_LJ_B, float *d_LJ_energy_atom, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh index c37fdcdea97..5c8462144b6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh @@ -19,9 +19,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype, - const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, - int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, + const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, + int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A, + const float *d_LJ_B, float *frc_f, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh index 2bd59ee88e0..22c603c9642 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh @@ -19,10 +19,12 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f, - const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, - int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A, - const float *d_LJ_B, float *frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, + const int 
*uint_crd_f, const int *LJtype, const float *charge, + const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, + int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, + float *frc_f, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh index 0e22abe4352..e23c5de34b6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh @@ -19,11 +19,13 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta, - const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, - float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial, int *nl, - const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_energy, - cudaStream_t stream); +CUDA_LIB_EXPORT void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta, + const int *uint_crd_f, const int *LJtype, const float *charge, + const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, + int *nl_atom_serial, int *nl, const float *d_LJ_A, + const float *d_LJ_B, float *frc_f, float *atom_energy, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh index 616ce19b6d8..eb0f80a46ec 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh @@ -18,8 +18,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, - const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, - const int *b_14, const float *cf_scale_factor, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, + const int *uint_crd_f, const int *LJtype, const float *charge, + const float *boxlength_f, const int *a_14, const int *b_14, + const float *cf_scale_factor, float *ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh index dfc31a357f0..bc5aea7af1d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh @@ -18,8 +18,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, - const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, - const int *b_14, const float *cf_scale_factor, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, + const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, + const float *boxlength_f, const int *a_14, const int *b_14, + const float *cf_scale_factor, float *ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh index 286bc9d5ac0..a1943aefe2f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh @@ -18,9 +18,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, - const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, - const int *b_14, const float *lj_scale_factor, const float *LJ_type_A, - const float *LJ_type_B, float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, + const int *uint_crd_f, const int *LJtype, const float *charge, + const float *boxlength_f, const int *a_14, const int *b_14, + const float *lj_scale_factor, const float *LJ_type_A, + const float *LJ_type_B, float *ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh index 914d6e0a010..91018f1fac4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh @@ -18,12 +18,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers, - const int *uint_crd_f, const int *LJtype, const float *charge, - float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, - const int *b_14, const float *lj_scale_factor, - const float *cf_scale_factor, const float *LJ_type_A, - const float *LJ_type_B, float *frc_f, float *atom_energy, - float *atom_virial, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJCFForceWithAtomEnergyAndVirial( + const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, + float 
*uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, const int *b_14, const float *lj_scale_factor, + const float *cf_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *frc_f, float *atom_energy, + float *atom_virial, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh index ad2957b79c0..7d5560c78e0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh @@ -18,11 +18,13 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, - const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, - const float *boxlength_f, const int *a_14, const int *b_14, - const float *lj_scale_factor, const float *cf_scale_factor, - const float *LJ_type_A, const float *LJ_type_B, float *frc_f, float *atom_energy, - cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, + const int *uint_crd_f, const int *LJtype, const float *charge, + float *uint_crd_with_LJ_f, const float *boxlength_f, + const int *a_14, const int *b_14, const float *lj_scale_factor, + const float *cf_scale_factor, const float *LJ_type_A, + const float *LJ_type_B, float *frc_f, float *atom_energy, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh index cd9c125c899..7202a596df3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh @@ -18,10 +18,12 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, - const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, - const int *b_14, const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, - float *ene, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, + const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, + const float *boxlength_f, const int *a_14, const int *b_14, + const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, + float *ene, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh index ab67d0c076c..61849b1abc8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh @@ -18,9 +18,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, - const float *charge, const float *boxlength_f, const int *a_14, const int *b_14, - const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *frc_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, + const int *LJtype, const float *charge, const float *boxlength_f, + const int *a_14, const int *b_14, const float *lj_scale_factor, + const float *LJ_type_A, const float *LJ_type_B, float *frc_f, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh index 0b97b56b685..901aa4f56cc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh @@ -18,9 +18,12 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, - const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, - const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor, - const float *LJ_type_A, const float *LJ_type_B, float *frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, + const int *uint_crd_f, const int *LJtype, const float *charge, + const float *boxlength_f, const int *a_14, const int *b_14, + const float *lj_scale_factor, const float *cf_scale_factor, + const float *LJ_type_A, const float *LJ_type_B, float *frc_f, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh index a91b582c5d3..f5204e8de84 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh @@ -22,6 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" struct VECTOR { float x; @@ -49,33 +50,33 @@ struct GRID_POINTER { int *grid_serial; }; -void 
ConstructNeighborList(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial, - NEIGHBOR_LIST *nl, cudaStream_t stream); +CUDA_LIB_EXPORT void ConstructNeighborList(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, + int *nl_atom_serial, NEIGHBOR_LIST *nl, cudaStream_t stream); -void CopyNeighborList(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, - int *nl_atom_serial, cudaStream_t stream); +CUDA_LIB_EXPORT void CopyNeighborList(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, + int *nl_atom_numbers, int *nl_atom_serial, cudaStream_t stream); -void NeighborListRefresh(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, - int not_first_time, float skin, int nxy, float cutoff_square, float cutoff_with_skin_square, - int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse, - int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd, - float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, - float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, - int *excluded_list_start, int *excluded_list, int *excluded_numbers, float half_skin_square, - int *is_need_refresh_neighbor_list, int forced_update, int forced_check, cudaStream_t stream); +CUDA_LIB_EXPORT void NeighborListRefresh( + int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, int not_first_time, float skin, + int nxy, float cutoff_square, float cutoff_with_skin_square, int *grid_N, float *box_length, + int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial, GRID_BUCKET *bucket, + float *crd, float *old_crd, float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, + float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list, + int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list, int forced_update, + int forced_check, cudaStream_t stream); -void ConstructNeighborListHalf(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial, - NEIGHBOR_LIST *nl, cudaStream_t stream); +CUDA_LIB_EXPORT void ConstructNeighborListHalf(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, + int *nl_atom_serial, NEIGHBOR_LIST *nl, cudaStream_t stream); -void CopyNeighborListHalf(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, cudaStream_t stream); +CUDA_LIB_EXPORT void CopyNeighborListHalf(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, + cudaStream_t stream); -void NeighborListUpdate(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, - int not_first_time, float skin, int nxy, float cutoff_square, float cutoff_with_skin_square, - int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse, - int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd, - float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, - float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, - int *excluded_list, int *excluded_numbers, float half_skin_square, - int *is_need_refresh_neighbor_list, cudaStream_t stream); +CUDA_LIB_EXPORT void NeighborListUpdate( + int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, int not_first_time, float skin, + int nxy, float 
cutoff_square, float cutoff_with_skin_square, int *grid_N, float *box_length, + int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial, GRID_BUCKET *bucket, + float *crd, float *old_crd, float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, + float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list, + int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh index 0455f49219a..8299fc491ca 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh @@ -19,7 +19,8 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void MDIterationGradientDescent(const int atom_numbers, float *crd, float *frc, const float learning_rate, - cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" +CUDA_LIB_EXPORT void MDIterationGradientDescent(const int atom_numbers, float *crd, float *frc, + const float learning_rate, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh index af5756a7fa6..61d149060b7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh @@ -23,8 +23,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc, - const float *inverse_mass, const float dt, cudaStream_t stream); +CUDA_LIB_EXPORT void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc, + const float *inverse_mass, const float dt, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh index 58bd87184c4..77e42a3f2d6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh @@ -19,10 +19,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt, - const float exp_gamma, int float4_numbers, float *inverse_mass, - float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc, - curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output, - cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" 
+CUDA_LIB_EXPORT void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt, + const float exp_gamma, int float4_numbers, float *inverse_mass, + float *sqrt_mass_inverse, float *vel, float *crd, float *frc, + float *acc, curandStatePhilox4_32_10_t *rand_state, + float *rand_frc, float *output, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh index cf832952d7b..7872f70cfd8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh @@ -19,11 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Vel(const int atom_numbers, const float half_dt, const float dt, - const float exp_gamma, int float4_numbers, float *inverse_mass, - float *sqrt_mass_inverse, float *vel, float *crd, float *frc, - float *acc, curandStatePhilox4_32_10_t *rand_state, - float *rand_frc, float *output, const float max_vel, - cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" +CUDA_LIB_EXPORT void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Vel( + const int atom_numbers, const float half_dt, const float dt, const float exp_gamma, int float4_numbers, + float *inverse_mass, float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc, + curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output, const float max_vel, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_WITH_MAX_VEL_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh index 4c6fa476483..3bdb3dd60dc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void MDIterationLeapFrogWithMaxVelocity(const int atom_numbers, float *vel, float *crd, float *frc, float *acc, - const float *inverse_mass, const float dt, const float max_velocity, - cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" +CUDA_LIB_EXPORT void MDIterationLeapFrogWithMaxVelocity(const int atom_numbers, float *vel, float *crd, float *frc, + float *acc, const float *inverse_mass, const float dt, + const float max_velocity, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_WITH_MAX_VEL_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh index b3d268c3013..8fda94fc3a7 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh @@ -18,6 +18,8 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed, - cudaStream_t stream); +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" + +CUDA_LIB_EXPORT void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, + int seed, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh index bb2f6c87bab..6df893b71ea 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" template -void FFT3D(int Nfft, T *input_tensor, Complex *output_tensor, const cufftHandle &FFT_plan_r2c, cudaStream_t stream); +CUDA_LIB_EXPORT void FFT3D(int Nfft, T *input_tensor, Complex *output_tensor, const cufftHandle &FFT_plan_r2c, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh index 68d12d08cae..0d740e97a6b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh @@ -19,8 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" template -void IFFT3D(int Nfft, Complex *input_tensor, T *output_tensor, const cufftHandle &FFT_plan_c2r, cudaStream_t stream); +CUDA_LIB_EXPORT void IFFT3D(int Nfft, Complex *input_tensor, T *output_tensor, const cufftHandle &FFT_plan_c2r, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_batched_fft_2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_batched_fft_2d_impl.cuh index 99fca45954f..71f1aa919d5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_batched_fft_2d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_batched_fft_2d_impl.cuh @@ -19,9 +19,10 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" template -void PMEBatchedFFT2D(Complex *input_tensor, Complex *output_tensor, - const cufftHandle &FFT_plan_c2c, int direction, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEBatchedFFT2D(Complex *input_tensor, Complex *output_tensor, + const cufftHandle &FFT_plan_c2c, int direction, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh index f5697b779ee..24b0db6d046 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh @@ -18,13 +18,15 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz, - float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f, - const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f, - const int *excluded_list_start, const int *excluded_list, const int *excluded_atom_numbers, - float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, float *d_correction_ene, - dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c, - const cufftHandle &PME_plan_c2r, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz, + float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, + const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, + int *nl, const float *scaler_f, const int *excluded_list_start, const int *excluded_list, + const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, + float *d_direct_ene, float *d_correction_ene, dim3 thread_PME, int PME_Nin, int PME_Nfft, + int PME_Nall, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh index 49de269ca66..3a26a7eccc4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh @@ -18,14 +18,17 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz, - float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, - const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, - const float *scaler_f, const int *excluded_list_start, const int *excluded_list, - const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, - float *d_correction_ene, dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, - const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r, float *neutralizing_factor, - float *charge_sum, int max_neighbor_numbers, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, + int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, + int *pme_kxyz, const int *uint_crd_f, const float *charge, int *nl_atom_numbers, + int *nl_atom_serial, int *nl, const float *scaler_f, + const int *excluded_list_start, const int *excluded_list, + const int *excluded_atom_numbers, float 
*d_reciprocal_ene, float *d_self_ene, + float *d_direct_ene, float *d_correction_ene, dim3 thread_PME, int PME_Nin, + int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c, + const cufftHandle &PME_plan_c2r, float *neutralizing_factor, float *charge_sum, + int max_neighbor_numbers, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh index 8cbbf282054..bd08b11c1d8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh @@ -18,9 +18,11 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f, - const float *charge, const int *excluded_list_start, const int *excluded_list, - const int *excluded_atom_numbers, float *frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, + const float *sacler_f, const float *charge, const int *excluded_list_start, + const int *excluded_list, const int *excluded_atom_numbers, float *frc_f, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_1d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_1d_impl.cuh index f3bcccfc3d3..0ecde226b81 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_1d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_1d_impl.cuh @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void PMEFFT1D(int Nfft, Complex *input_tensor, Complex *output_tensor, - const cufftHandle &FFT_plan_c2c, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEFFT1D(int Nfft, Complex *input_tensor, Complex *output_tensor, + const cufftHandle &FFT_plan_c2c, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_2d_impl.cuh index 87b10052f4a..3218a1ac46e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_2d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_fft_2d_impl.cuh @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void PMEFFT2D(int Nfft, Complex *input_tensor, Complex *output_tensor, - const cufftHandle &FFT_plan_c2c, cudaStream_t stream); +CUDA_LIB_EXPORT void PMEFFT2D(int Nfft, Complex *input_tensor, Complex *output_tensor, + const cufftHandle &FFT_plan_c2c, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_1d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_1d_impl.cuh index f486472bb00..524195b0d7c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_1d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_1d_impl.cuh @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void PMEIFFT1D(int Nfft, Complex 
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_2d_impl.cuh
index c598ddbbdd6..4d3e7f44a7e 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_2d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_ifft_2d_impl.cuh
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void PMEIFFT2D(int Nfft, Complex<T> *input_tensor, Complex<T> *output_tensor,
-               const cufftHandle &FFT_plan_c2c, cudaStream_t stream);
+CUDA_LIB_EXPORT void PMEIFFT2D(int Nfft, Complex<T> *input_tensor, Complex<T> *output_tensor,
+                               const cufftHandle &FFT_plan_c2c, cudaStream_t stream);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_irfft_2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_irfft_2d_impl.cuh
index 40b76c890ee..3610537d68d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_irfft_2d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_irfft_2d_impl.cuh
@@ -21,7 +21,7 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
-void PMEIRFFT2D(int Nfft, Complex<T> *input_tensor, T *output_tensor,
-                const cufftHandle &FFT_plan_c2r, cudaStream_t stream);
+CUDA_LIB_EXPORT void PMEIRFFT2D(int Nfft, Complex<T> *input_tensor, T *output_tensor, const cufftHandle &FFT_plan_c2r,
+                                cudaStream_t stream);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
index e21cd655f6c..5faa797bdef 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
@@ -24,10 +24,11 @@ struct _VECTOR {
   float y;
   float z;
 };
-void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
-                        float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
-                        const int *uint_crd_f, const float *charge, float *force, int PME_Nin, int PME_Nall,
-                        int PME_Nfft, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
-                        const _VECTOR &PME_inverse_box_vector, cudaStream_t stream);
+CUDA_LIB_EXPORT void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC,
+                                        int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq,
+                                        int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f, const float *charge,
+                                        float *force, int PME_Nin, int PME_Nall, int PME_Nfft,
+                                        const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
+                                        const _VECTOR &PME_inverse_box_vector, cudaStream_t stream);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_rfft_2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_rfft_2d_impl.cuh
index 27569bedcf4..f6648bb33e2 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_rfft_2d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_rfft_2d_impl.cuh
@@ -19,9 +19,10 @@
 #include <cufft.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 template <typename T>
-void PMERFFT2D(int Nfft, T *input_tensor, Complex<T> *output_tensor, const cufftHandle &FFT_plan_r2c,
-               cudaStream_t stream);
+CUDA_LIB_EXPORT void PMERFFT2D(int Nfft, T *input_tensor, Complex<T> *output_tensor, const cufftHandle &FFT_plan_r2c,
+                               cudaStream_t stream);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
index a58fca6cbfe..6402e158c1d 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh
@@ -19,7 +19,9 @@
 #include <cuda_runtime_api.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 
-void restrainenergy(int restrain_numbers, int atom_numbers, float weight, const int *restrain_list, const float *crd_f,
-                    const float *crd_ref, const float *boxlength_f, float *ene, cudaStream_t stream);
+CUDA_LIB_EXPORT void restrainenergy(int restrain_numbers, int atom_numbers, float weight, const int *restrain_list,
+                                    const float *crd_f, const float *crd_ref, const float *boxlength_f, float *ene,
+                                    cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_ENERGY_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
index 94f3398e8d8..099bd7435a3 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh
@@ -19,9 +19,11 @@
 #include <cuda_runtime_api.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 
-void restrainforcewithatomenergyandvirial(int restrain_numbers, int atom_numbers, const int *restrain_list,
-                                          const float *crd_f, const float *crd_ref_f, const float weight,
-                                          const float *boxlength_f, float *atom_ene, float *atom_virial, float *frc_f,
-                                          cudaStream_t stream);
+CUDA_LIB_EXPORT void restrainforcewithatomenergyandvirial(int restrain_numbers, int atom_numbers,
+                                                          const int *restrain_list, const float *crd_f,
+                                                          const float *crd_ref_f, const float weight,
+                                                          const float *boxlength_f, float *atom_ene, float *atom_virial,
+                                                          float *frc_f, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_FORCE_ATOM_ENERGY_VIRIAL_IMPL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
index 1788b9c55c3..f1cd09b0298 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh
@@ -19,8 +19,9 @@
 #include <cuda_runtime_api.h>
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
 
-void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list, const int *uint_crd_f,
-                   const int *uint_crd_ref, const float factor, const float *scaler_f, float *frc_f,
-                   cudaStream_t stream);
+CUDA_LIB_EXPORT void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list,
+                                   const int *uint_crd_f, const int *uint_crd_ref, const float factor,
+                                   const float *scaler_f, float *frc_f, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTAIN_FORCE_IMPL_H_
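For callers, nothing changes apart from linking against the plugin library: the launchers keep their signatures and are still invoked from host code with device pointers plus the op's stream. A minimal, hypothetical host-side wrapper using the restrainenergy signature above (the wrapper name and buffer ownership are illustrative, and the header above is assumed to be included):

// Hypothetical caller; all d_* pointers are device memory owned elsewhere.
void LaunchRestrainEnergy(int restrain_numbers, int atom_numbers, float weight,
                          const int *d_restrain_list, const float *d_crd,
                          const float *d_crd_ref, const float *d_boxlength,
                          float *d_ene, cudaStream_t stream) {
  // Enqueues the kernel on `stream` and returns without synchronizing.
  restrainenergy(restrain_numbers, atom_numbers, weight, d_restrain_list, d_crd,
                 d_crd_ref, d_boxlength, d_ene, stream);
}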
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list, const int *uint_crd_f, - const int *uint_crd_ref, const float factor, const float *scaler_f, float *frc_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list, + const int *uint_crd_f, const int *uint_crd_ref, const float factor, + const float *scaler_f, float *frc_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTAIN_FORCE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh index 6af297ba7a0..502e6e65823 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh @@ -23,10 +23,12 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, - const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, - const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs, - const float *constrain_ks, float *test_frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, + const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, + const int *atom_i_serials, const int *atom_j_serials, + const float *constant_rs, const float *constrain_ks, float *test_frc_f, + cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh index 3c77e8d0355..bbc9064d1e9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh @@ -23,10 +23,12 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, - const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, - const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs, - const float *constrain_ks, float *test_frc_f, float *d_atom_virial, - cudaStream_t stream); +CUDA_LIB_EXPORT void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numbers, + const unsigned int *uint_crd_f, const float *scaler_f, + float *constrain_pair_f, const float *pair_dr_f, + const int *atom_i_serials, const int *atom_j_serials, + const float *constant_rs, const float *constrain_ks, + float *test_frc_f, float *d_atom_virial, cudaStream_t stream); #endif diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh index 05b351c053b..289455b9327 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh @@ -23,26 +23,27 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, - const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, - const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs, - const float *constrain_ks, float *test_frc_f, cudaStream_t stream); +CUDA_LIB_EXPORT void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, + const unsigned int *uint_crd_f, const float *scaler_f, + float *constrain_pair_f, const float *pair_dr_f, + const int *atom_i_serials, const int *atom_j_serials, + const float *constant_rs, const float *constrain_ks, + float *test_frc_f, cudaStream_t stream); -void constrain_force_cycle_with_virial_update(int atom_numbers, int constrain_pair_numbers, - const unsigned int *uint_crd_f, const float *scaler_f, - float *constrain_pair_f, const float *pair_dr_f, - const int *atom_i_serials, const int *atom_j_serials, - const float *constant_rs, const float *constrain_ks, float *test_frc_f, - float *d_atom_virial, cudaStream_t stream); +CUDA_LIB_EXPORT void constrain_force_cycle_with_virial_update( + int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, const float *scaler_f, + float *constrain_pair_f, const float *pair_dr_f, const int *atom_i_serials, const int *atom_j_serials, + const float *constant_rs, const float *constrain_ks, float *test_frc_f, float *d_atom_virial, cudaStream_t stream); -void refresh_uint_crd_update(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f, - const float *quarter_crd_to_uint_crd_cof_f, float *test_frc_f, const float *mass_inverse, - unsigned int *uint_crd_f, cudaStream_t stream); +CUDA_LIB_EXPORT void refresh_uint_crd_update(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f, + const float *quarter_crd_to_uint_crd_cof_f, float *test_frc_f, + const float *mass_inverse, unsigned int *uint_crd_f, cudaStream_t stream); -void set_zero_force_with_virial(int atom_numbers, int constrain_pair_numbers, float *test_frc_f, float *d_atom_virial, - cudaStream_t stream); +CUDA_LIB_EXPORT void set_zero_force_with_virial(int atom_numbers, int constrain_pair_numbers, float *test_frc_f, + float *d_atom_virial, cudaStream_t stream); -void set_zero(int numbers, float *x, cudaStream_t stream); +CUDA_LIB_EXPORT void set_zero(int numbers, float *x, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh index 2d71397680a..becb141bfe0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh @@ -19,9 +19,11 @@ 
#include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void lastcrdtodr(int constrain_pair_numbers, const float *atom_crd_f, const float *quarter_crd_to_uint_crd_cof_f, - const float *uint_dr_to_dr_f, float *constrain_pair_f, const int *atom_i_serials, - const int *atom_j_serials, const float *constant_rs, const float *constrain_ks, float *pair_dr_f, - cudaStream_t stream); +CUDA_LIB_EXPORT void lastcrdtodr(int constrain_pair_numbers, const float *atom_crd_f, + const float *quarter_crd_to_uint_crd_cof_f, const float *uint_dr_to_dr_f, + float *constrain_pair_f, const int *atom_i_serials, const int *atom_j_serials, + const float *constant_rs, const float *constrain_ks, float *pair_dr_f, + cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_LAST_CRD_TO_DR_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh index a3c3d40f452..f3c27147830 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh @@ -19,7 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void refreshcrdvel(int atom_numbers, float dt_inverse, float dt, float exp_gamma, float half_exp_gamma_plus_half, - float *test_frc_f, float *mass_inverse, float *crd_f, float *vel_f, cudaStream_t stream); +CUDA_LIB_EXPORT void refreshcrdvel(int atom_numbers, float dt_inverse, float dt, float exp_gamma, + float half_exp_gamma_plus_half, float *test_frc_f, float *mass_inverse, float *crd_f, + float *vel_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_CRD_VEL_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh index ecdb5c7ec72..1a6e3df7d9c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh @@ -19,8 +19,9 @@ #include #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h" -void refreshuintcrd(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f, - const float *quarter_crd_to_uint_crd_cof_f, const float *test_frc_f, const float *mass_inverse, - unsigned int *uint_crd_f, cudaStream_t stream); +CUDA_LIB_EXPORT void refreshuintcrd(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f, + const float *quarter_crd_to_uint_crd_cof_f, const float *test_frc_f, + const float *mass_inverse, unsigned int *uint_crd_f, cudaStream_t stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_UINT_CRD_IMPL_H_ diff --git a/mindspore/ccsrc/profiler/device/profiling.h b/mindspore/ccsrc/profiler/device/profiling.h index 209b21ef12d..3a4ddfc2c23 100644 --- a/mindspore/ccsrc/profiler/device/profiling.h +++ b/mindspore/ccsrc/profiler/device/profiling.h @@ -27,6 
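All of these launchers take an explicit cudaStream_t and enqueue work asynchronously, so a caller that reads a result on the host must synchronize the same stream itself. A small sketch using set_zero from constrain_force_virial_impl.cuh above; the buffer handling around it is illustrative only.

#include <cuda_runtime_api.h>
// Assumes constrain_force_virial_impl.cuh (declaring set_zero) is included.

void ZeroAndFetch(float *d_frc, float *h_frc, int numbers, cudaStream_t stream) {
  set_zero(numbers, d_frc, stream);  // asynchronous: only enqueued here
  // Copy back on the same stream, then block until both operations finish.
  cudaMemcpyAsync(h_frc, d_frc, numbers * sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}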
diff --git a/mindspore/ccsrc/profiler/device/profiling.h b/mindspore/ccsrc/profiler/device/profiling.h
index 209b21ef12d..3a4ddfc2c23 100644
--- a/mindspore/ccsrc/profiler/device/profiling.h
+++ b/mindspore/ccsrc/profiler/device/profiling.h
@@ -27,6 +27,7 @@
 #include <memory>
 #include <string>
 #include "utils/hash_map.h"
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace profiler {
@@ -122,7 +123,7 @@ class Profiler {
   std::string profiling_options_;
 
  private:
-  inline static HashMap<std::string, std::shared_ptr<Profiler>> instance_map_ = {};
+  BACKEND_EXPORT inline static std::map<std::string, std::shared_ptr<Profiler>> instance_map_ = {};
 };
 }  // namespace profiler
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/ps/util.cc b/mindspore/ccsrc/ps/util.cc
index b231efd4088..905af140306 100644
--- a/mindspore/ccsrc/ps/util.cc
+++ b/mindspore/ccsrc/ps/util.cc
@@ -24,26 +24,28 @@
 namespace mindspore {
 namespace ps {
-mindspore::HashMap<std::string, int> Util::optimizer_to_ids{
+namespace {
+static mindspore::HashMap<std::string, int> optimizer_to_ids = {
   {kApplyMomentum, 0},
   {kSparseAdam, 1},
   {kSparseLazyAdam, 2},
   {kSparseFtrl, 3},
 };
 
-mindspore::HashMap<int, std::string> Util::id_to_optimizers{
+static mindspore::HashMap<int, std::string> id_to_optimizers = {
   {0, kApplyMomentum},
   {1, kSparseAdam},
   {2, kSparseLazyAdam},
   {3, kSparseFtrl},
 };
 
-mindspore::HashMap<int, std::string> Util::id_to_optimizer_nodes{
+static mindspore::HashMap<int, std::string> id_to_optimizer_nodes = {
   {0, kApplyMomentumOp},
   {1, kSparseAdamOp},
   {2, kSparseLazyAdamOp},
   {3, kSparseFtrlOp},
 };
+}  // namespace
 
 bool Util::IsRoleOfPServer() { return PSContext::instance()->is_server(); }
diff --git a/mindspore/ccsrc/ps/util.h b/mindspore/ccsrc/ps/util.h
index ad39c6db670..3508b3487ad 100644
--- a/mindspore/ccsrc/ps/util.h
+++ b/mindspore/ccsrc/ps/util.h
@@ -67,11 +67,6 @@ class BACKEND_EXPORT Util {
   static void DoFusion(const FuncGraphPtr &func_graph, const std::string &cnode_name,
                        const std::string &fused_cnode_name);
   static kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const std::vector<AnfNodePtr> &node_list);
-
-  static mindspore::HashMap<std::string, int> optimizer_to_ids;
-  static mindspore::HashMap<int, std::string> id_to_optimizers;
-  static mindspore::HashMap<int, std::string> id_to_optimizer_nodes;
-
-  static int64_t rank_id_;
 };
 }  // namespace ps
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_fake_init.cc b/mindspore/ccsrc/runtime/collective/collective_fake_init.cc
similarity index 95%
rename from mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_fake_init.cc
rename to mindspore/ccsrc/runtime/collective/collective_fake_init.cc
index 6bc3cd5e513..b639bf3e505 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_fake_init.cc
+++ b/mindspore/ccsrc/runtime/collective/collective_fake_init.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include "plugin/device/gpu/hal/device/distribution/collective_fake_init.h"
+#include "runtime/collective/collective_fake_init.h"
 #include "utils/log_adapter.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_fake_init.h b/mindspore/ccsrc/runtime/collective/collective_fake_init.h
similarity index 100%
rename from mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_fake_init.h
rename to mindspore/ccsrc/runtime/collective/collective_fake_init.h
diff --git a/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.cc b/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.cc
index 3ca7182f95e..d78342c6626 100644
--- a/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.cc
+++ b/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.cc
@@ -252,8 +252,7 @@ bool IsNeedPadding(const std::string &format, size_t shape_size) {
   if (shape_size == 0) {
     return false;
   }
-  if (format == kOpFormat_DEFAULT || format == kOpFormat_NCHW ||
-      kNoPaddingFormatSet.find(format) != kNoPaddingFormatSet.end()) {
+  if (format == kOpFormat_DEFAULT || format == kOpFormat_NCHW || IsOneOfNoPaddingFormat(format)) {
     return false;
   } else if (shape_size < kDim4) {
     return true;
@@ -494,12 +493,12 @@ ShapeVector DeviceShapeTransfer::TransCore(const ShapeVector &shape, const std::
     return NDRNNBiasDeviceShape(shape, type, input_hidden_size[1]);
   }
   auto temp_shape = shape;
-  if (kNoPaddingFormatSet.find(format) == kNoPaddingFormatSet.end() && format != kOpFormat_FRACTAL_ZN_LSTM &&
-      shape.size() < kDim4 && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (!IsOneOfNoPaddingFormat(format) && format != kOpFormat_FRACTAL_ZN_LSTM && shape.size() < kDim4 &&
+      !IsOneOf3DFormat(format)) {
     MS_LOG(WARNING) << "Origin shape size is less than 4, should be Padding shape by Default firstly";
     temp_shape = PaddingShapeTo4dDefault(shape);
   }
-  if (shape.size() != kDim5 && k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (shape.size() != kDim5 && IsOneOf3DFormat(format)) {
     temp_shape = PaddingShapeTo5dDefault(shape);
   }
   auto iter = device_shape_map.find(format);
@@ -1799,11 +1798,11 @@ RangePair ShapeRangeTransfer::GetRealRange(const RangePair &ori_range, const std::
     return FRAC_NZRange(ori_range, type);
   }
   auto temp_range = ori_range;
-  if (ori_range.size() < kDim4 && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (ori_range.size() < kDim4 && !IsOneOf3DFormat(format)) {
     MS_LOG(DEBUG) << "A special format:" << format << " with a range size less than 4, so padding the range firstly";
     temp_range = PaddingRangeTo4D(ori_range, padding_str);
   }
-  if (ori_range.size() < kDim5 && k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (ori_range.size() < kDim5 && IsOneOf3DFormat(format)) {
     MS_LOG(DEBUG) << "A special format:" << format << " with a range size less than 5, so padding the range firstly";
     temp_range = PaddingRangeTo5D(ori_range, padding_str);
   }
diff --git a/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.h b/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.h
index 2a2f443ebe1..c912441f671 100644
--- a/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.h
+++ b/mindspore/ccsrc/runtime/device/ms_device_shape_transfer.h
@@ -443,7 +443,7 @@ std::vector<T> PaddingShape(const std::vector<T> &shape, const std::string &form
                             << ", detail info: " << node->DebugString();
   }
   std::vector<T> host_shape;
-  if (k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (IsOneOf3DFormat(format)) {
     if (shape.size() >= kDim5) {
       return shape;
     }
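The profiling.h hunk above does two things at once: it tags the Profiler registry with BACKEND_EXPORT so that every shared object resolves to the same instance_map_, and it swaps the internal HashMap alias for std::map, presumably so the exported member's type does not leak a third-party hash-map implementation into the library ABI. Reduced to a sketch, with the macro spelling an assumption rather than a copy of include/backend/visible.h:

#include <map>
#include <memory>
#include <string>

// Assumed spelling; the real macro comes from include/backend/visible.h.
#define BACKEND_EXPORT __attribute__((visibility("default")))

class Profiler {
 public:
  virtual ~Profiler() = default;

 private:
  // C++17 inline static member: one definition program-wide, exported so
  // profiler registration done in one backend plugin is seen by the others.
  BACKEND_EXPORT inline static std::map<std::string, std::shared_ptr<Profiler>> instance_map_ = {};
};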
@@ -466,7 +466,7 @@ std::vector<size_t> StringToAxisVector(const std::vector<T> &shape, const std::stri
 }
   std::vector<size_t> padding_axis;
-  if (k3DFormatSet.find(format) != k3DFormatSet.end()) {
+  if (IsOneOf3DFormat(format)) {
     if (shape.size() >= kDim5) {
       return padding_axis;
     }
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/graph_compiler.cc b/mindspore/ccsrc/runtime/graph_scheduler/graph_compiler.cc
index dd3ffa78ad6..eb26de0b234 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/graph_compiler.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/graph_compiler.cc
@@ -481,8 +481,7 @@ std::set<AnfNodePtr> FetchNopNodeNotSupportEliminate(const KernelGraph *const grap
     // kernel not support multi-thread execute will be inited in launch kernel, so its input cannot be eliminated.
-    if (kOpNotSupportMultiThreadExecList.find(common::AnfAlgo::GetCNodeName(cnode)) !=
-          kOpNotSupportMultiThreadExecList.end() ||
+    if (IsOneOfNotSupportMultiThreadExec(common::AnfAlgo::GetCNodeName(cnode)) ||
         (kCPUOpNoEliminateList.find(common::AnfAlgo::GetCNodeName(cnode)) != kCPUOpNoEliminateList.end())) {
       const auto &inputs = cnode->inputs();
       for (const auto &input : inputs) {
diff --git a/mindspore/ccsrc/runtime/pynative/run_op_helper.cc b/mindspore/ccsrc/runtime/pynative/run_op_helper.cc
index 81e4b1e9aa8..a777339a340 100644
--- a/mindspore/ccsrc/runtime/pynative/run_op_helper.cc
+++ b/mindspore/ccsrc/runtime/pynative/run_op_helper.cc
@@ -503,7 +503,7 @@ void ReleaseKernelResource(const KernelGraphPtr &graph) {
   const auto &kernels = graph->execution_order();
   for (const auto &kernel : kernels) {
     MS_EXCEPTION_IF_NULL(kernel);
-    if (kOpCacheBlackList.find(common::AnfAlgo::GetCNodeName(kernel)) != kOpCacheBlackList.end()) {
+    if (IsOneOfCacheBlackList(common::AnfAlgo::GetCNodeName(kernel))) {
       auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
       if (kernel_mod) {
         kernel_mod->ReleaseResource();
diff --git a/mindspore/ccsrc/transform/graph_ir/df_graph_manager.h b/mindspore/ccsrc/transform/graph_ir/df_graph_manager.h
index 9f3b0134fc8..9165fc4266d 100644
--- a/mindspore/ccsrc/transform/graph_ir/df_graph_manager.h
+++ b/mindspore/ccsrc/transform/graph_ir/df_graph_manager.h
@@ -24,12 +24,13 @@
 #include
 #include "include/transform/graph_ir/types.h"
 #include "ir/anf.h"
+#include "include/backend/visible.h"
 
 namespace mindspore {
 namespace transform {
 class GraphRunner;
 
-class DfGraphManager {
+class BACKEND_EXPORT DfGraphManager {
  public:
   ~DfGraphManager();
   void ClearGraph() noexcept;
diff --git a/mindspore/ccsrc/utils/anfalgo.cc b/mindspore/ccsrc/utils/anfalgo.cc
index b84a55024a2..13321809bf5 100644
--- a/mindspore/ccsrc/utils/anfalgo.cc
+++ b/mindspore/ccsrc/utils/anfalgo.cc
@@ -815,7 +815,7 @@ bool AnfAlgo::IsUpdateParameterKernel(const CNodePtr &node) {
   if (HasNodeAttr(kAttrAsync, node) && GetNodeAttr<bool>(node, kAttrAsync)) {
     return false;
   }
-  if (kOptOperatorSet.find(node_name) == kOptOperatorSet.end() && node_name.find("Assign") == string::npos) {
+  if (!IsOneOfOperator(node_name) && node_name.find("Assign") == string::npos) {
     return false;
   }
   return true;
@@ -1031,7 +1031,7 @@ void FindDelayExecPosition(const std::vector<CNodePtr> &nodes, size_t current_in
   auto &child = nodes[j];
   auto child_name = AnfAlgo::GetCNodeName(child);
   if (child_name == kAssignAddOpName || child_name == kAssignSubOpName || child_name == kAssignOpName ||
-      kOptOperatorSet.find(child_name) != kOptOperatorSet.end()) {
+      IsOneOfOperator(child_name)) {
     return;
   }
@@ -1112,7 +1112,7 @@ void AnfAlgo::ReorderPosteriorExecList(NotNull<std::vector<CNodePtr> *> node_lis
   for (const auto &node : *node_list) {
     MS_EXCEPTION_IF_NULL(node);
-    if (kPosteriorOperatorSet.find(AnfAlgo::GetCNodeName(node)) != kPosteriorOperatorSet.end()) {
+    if (IsOneOfPosteriorOperator(AnfAlgo::GetCNodeName(node))) {
       posterior_node_list.emplace_back(node);
     } else {
       ordinary_node_list.emplace_back(node);
diff --git a/mindspore/ccsrc/utils/utils.cc b/mindspore/ccsrc/utils/utils.cc
new file mode 100644
index 00000000000..2b368430a76
--- /dev/null
+++ b/mindspore/ccsrc/utils/utils.cc
@@ -0,0 +1,214 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "include/common/utils/utils.h"
+#include <set>
+#include <string>
+namespace mindspore {
+bool IsOneOfPosteriorOperator(const std::string &name) {
+  const std::set<std::string> kPosteriorOperatorSet = {kPullOpName};
+
+  auto iter = kPosteriorOperatorSet.find(name);
+  return iter != kPosteriorOperatorSet.end();
+}
+
+bool IsOneOfCacheBlackList(const std::string &name) {
+  const std::set<std::string> kOpCacheBlackList = {kUniformCandidateSamplerOpName, kInitDatasetQueueOpName,
+                                                   kGetNextOpName};
+
+  auto iter = kOpCacheBlackList.find(name);
+  return iter != kOpCacheBlackList.end();
+}
+
+bool IsOneOfNotSupportMultiThreadExec(const std::string &name) {
+  const std::set<std::string> kOpNotSupportMultiThreadExecList = {kAvgPoolOpName, kAvgPoolGradOpName, kMaxPoolOpName,
+                                                                  kBatchNorm, kBatchNormGradOpName};
+
+  auto iter = kOpNotSupportMultiThreadExecList.find(name);
+  return iter != kOpNotSupportMultiThreadExecList.end();
+}
+
+bool IsOneOf3DFormat(const std::string &format) {
+  const std::set<std::string> k3DFormatSet = {kOpFormat_NCDHW, kOpFormat_NDC1HWC0, kOpFormat_FRACTAL_Z_3D,
+                                              kOpFormat_NDHWC, kOpFormat_DHWCN, kOpFormat_DHWNC};
+
+  auto iter = k3DFormatSet.find(format);
+  return iter != k3DFormatSet.end();
+}
+
+bool IsOneOfNoPaddingFormat(const std::string &format) {
+  const std::set<std::string> kNoPaddingFormatSet = {kOpFormat_ChannelLast, kOpFormat_FRAC_NZ, kOpFormat_FRACTAL_ZN_RNN,
+                                                     kOpFormat_ND_RNN_BIAS};
+
+  auto iter = kNoPaddingFormatSet.find(format);
+  return iter != kNoPaddingFormatSet.end();
+}
+
+bool IsOneOfDynamicShapeConstInputToAttr(const std::string &name) {
+  const std::set<std::string> DynamicShapeConstInputToAttr = {
+    kCastOpName, kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName,
+    kReduceMaxOpName, kReduceAllOpName, kReduceAnyOpName, kConcatOpName, kTransposeOpName};
+
+  auto iter = DynamicShapeConstInputToAttr.find(name);
+  return iter != DynamicShapeConstInputToAttr.end();
+}
+
+bool IsOneOfDynamicShapeConstInputToAttrCPU(const std::string &name) {
+  const std::set<std::string> DynamicShapeConstInputToAttrCPU = {
+    kCastOpName, kExpandDimsOpName, kEmbeddingLookupOpName, kReduceMinOpName, kReduceMeanOpName, kReduceMaxOpName,
+    kReduceAllOpName, kReduceAnyOpName, kConcatOpName, kReduceSumOpName, kTransposeOpName};
+
+  auto iter = DynamicShapeConstInputToAttrCPU.find(name);
+  return iter != DynamicShapeConstInputToAttrCPU.end();
+}
+
+bool IsOneOfDynamicShapeConstInputToAttrGPU(const std::string &name) {
+  const std::set<std::string> DynamicShapeConstInputToAttrGPU = {
+    kCastOpName, kExpandDimsOpName, kReshapeOpName, kEmbeddingLookupOpName, kTransposeOpName,
+    kReduceSumOpName, kReduceMinOpName, kReduceMeanOpName, kReduceMaxOpName, kReduceAllOpName,
+    kReduceAnyOpName, kConcatOpName, kScatterNdOpName, kGatherV2OpName, kAvgPool3DGradOpName};
+
+  auto iter = DynamicShapeConstInputToAttrGPU.find(name);
+  return iter != DynamicShapeConstInputToAttrGPU.end();
+}
+
+bool IsOneOfCustomAkgType(const std::string &name) {
+  const std::set<std::string> kCustomTypeAkg = {"ir_builder", "tvm_compute", "hybrid"};
+
+  auto iter = kCustomTypeAkg.find(name);
+  return iter != kCustomTypeAkg.end();
+}
+
+bool IsOneOfOperator(const std::string &name) {
+  const std::set<std::string> kOptOperatorSet = {kMomentumOpName,
+                                                 kApplyMomentumOpName,
+                                                 kApplyAdadeltaOpName,
+                                                 kApplyAdagradOpName,
+                                                 kApplyAdagradDAName,
+                                                 kApplyAdamOpName,
+                                                 kApplyAdaMaxOpName,
+                                                 kApplyAddSignOpName,
+                                                 kApplyCenteredRMSPOpName,
+                                                 kApplyFtrlOpName,
+                                                 kApplyFtrlV2OpName,
+                                                 kApplyGradientDescentOpName,
+                                                 kApplyPowerSignOpName,
+                                                 kApplyProximalAdagradOpName,
+                                                 kApplyProximalGradientDescentOpName,
+                                                 kApplyRMSPropOpName,
+                                                 kAdamApplyOneWithDecayOpName,
+                                                 kAdamApplyOneWithDecayAssignOpName,
+                                                 kFusedAdamWeightDecayName,
+                                                 kAdamWeightDecayName,
+                                                 kFusedCastAdamWeightDecayName,
+                                                 kFusedAdamName,
+                                                 kFusedAdaFactorName,
+                                                 kFusedAdaFactorWithGlobalNormName,
+                                                 kFusedSparseAdamName,
+                                                 kFusedMulApplyMomentumOpName,
+                                                 kFusedWeightScaleApplyMomentum,
+                                                 kFusedScaleApplyMomentum,
+                                                 kApplyCenteredRMSPropOpName,
+                                                 kFusedSparseFtrlName,
+                                                 kFusedSparseProximalAdagradName,
+                                                 kFusedSparseLazyAdamName,
+                                                 kSparseApplyFtrlName,
+                                                 kSparseApplyFtrlV2Name,
+                                                 kSGDName,
+                                                 kLARSUpdateName,
+                                                 kCombineMomentumWeightOpName,
+                                                 kCombineMomentumOpName,
+                                                 kScatterAddOpName,
+                                                 kScatterUpdateOpName,
+                                                 kSparseApplyProximalAdagradOpName};
+
+  auto iter = kOptOperatorSet.find(name);
+  return iter != kOptOperatorSet.end();
+}
+
+bool IsOneOfComputeDepend(const std::string &name) {
+  const std::set<std::string> kComputeDepend = {kUniqueOpName,
+                                                kUniqueConsecutiveOpName,
+                                                kComputeAccidentalHitsOpName,
+                                                kSubAndFilterOpName,
+                                                kPadAndShiftOpName,
+                                                kCTCGreedyDecoderOpName,
+                                                kMaskedSelectOpName,
+                                                kDynamicStitchOpName,
+                                                kGetNextOpName,
+                                                kListDiffOpName,
+                                                kNonMaxSuppressionV3OpName,
+                                                kNonMaxSuppressionWithOverlapsOpName,
+                                                kCoalesceOpName,
+                                                kTruncatedNormal,
+                                                kNonDeterministicInts,
+                                                kFractionalAvgPoolGradOpName,
+                                                kDenseToDenseSetOperation,
+                                                kSegmentMaxOpName,
+                                                kCSRSparseMatrixToSparseTensorOpName,
+                                                kSegmentMinOpName,
+                                                kLuUnpackOpName,
+                                                kSegmentSumOpName,
+                                                kResizeBicubicOpName,
+                                                kResizeAreaOpName,
+                                                kSegmentMeanOpName,
+                                                kSegmentProdOpName,
+                                                kNonZeroOpName,
+                                                kSparseSparseMinimumOpName,
+                                                kRpcRecvOpName,
+                                                kAdaptiveMaxPool3DGradOpName};
+
+  auto iter = kComputeDepend.find(name);
+  return iter != kComputeDepend.end();
+}
+
+bool IsOneOfHWSpecialFormat(const std::string &format) {
+  const std::set<std::string> kHWSpecialFormatSet = {
+    kOpFormat_FRACTAL_Z_3D, kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0, kOpFormat_FRAC_NZ,
+    kOpFormat_C1HWNCoC0, kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04, kOpFormat_FRACTAL_ZN_LSTM,
+    kOpFormat_FRACTAL_ZN_RNN, kOpFormat_NDC1HWC0, kOpFormat_FRAC_Z};
+
+  auto iter = kHWSpecialFormatSet.find(format);
+  return iter != kHWSpecialFormatSet.end();
+}
+
+bool IsOneOfFormat(const std::string &format) {
+  const std::set<std::string> kOpFormatList = {kOpFormat_DEFAULT,
+                                               kOpFormat_NC1KHKWHWC0,
+                                               kOpFormat_ND,
+                                               kOpFormat_NCHW,
+                                               kOpFormat_NHWC,
+                                               kOpFormat_HWCN,
+                                               kOpFormat_NC1HWC0,
+                                               kOpFormat_FRAC_Z,
+                                               kOpFormat_C1HWNCoC0,
+                                               kOpFormat_FRAC_NZ,
+                                               kOpFormat_NC1HWC0_C04,
+                                               kOpFormat_FRACTAL_Z_C04,
+                                               kOpFormat_NDHWC,
+                                               kOpFormat_FRACTAL_ZN_LSTM,
+                                               kOpFormat_FRACTAL_ZN_RNN,
+                                               kOpFormat_ND_RNN_BIAS,
+                                               kOpFormat_NDC1HWC0,
+                                               kOpFormat_NCDHW,
+                                               kOpFormat_FRACTAL_Z_3D,
+                                               kOpFormat_DHWNC,
+                                               kOpFormat_DHWCN};
+
+  auto iter = kOpFormatList.find(format);
+  return iter != kOpFormatList.end();
+}
+}  // namespace mindspore
diff --git a/mindspore/lite/src/extendrt/CMakeLists.txt b/mindspore/lite/src/extendrt/CMakeLists.txt
index d6c0cbb06e4..53b6f145000 100644
--- a/mindspore/lite/src/extendrt/CMakeLists.txt
+++ b/mindspore/lite/src/extendrt/CMakeLists.txt
@@ -88,6 +88,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
     set(ANF_ALG_SRC ${ANF_ALG_SRC}
         ${CCSRC_DIR}/utils/anfalgo.cc
+        ${CCSRC_DIR}/utils/utils.cc
        ${CCSRC_DIR}/utils/parallel_context.cc
        ${CCSRC_DIR}/utils/convert_utils.cc)
     add_library(mindspore-infer-anfalgo OBJECT ${ANF_ALG_SRC})
diff --git a/mindspore/lite/src/extendrt/utils/kernel_build_utils.cc b/mindspore/lite/src/extendrt/utils/kernel_build_utils.cc
index c5e97446b3b..7537a7405ae 100644
--- a/mindspore/lite/src/extendrt/utils/kernel_build_utils.cc
+++ b/mindspore/lite/src/extendrt/utils/kernel_build_utils.cc
@@ -405,7 +405,7 @@ std::pair SetKernelInfoWithMsg(const CNodePtr &kerne
   const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel_node);
   if (IsPrimitiveCNode(kernel_node, prim::kPrimCustom)) {
     auto tp = common::AnfAlgo::GetNodeAttr<std::string>(kernel_node, kAttrFuncType);
-    if (kCustomTypeAkg.find(tp) != kCustomTypeAkg.end()) {
+    if (IsOneOfCustomAkgType(tp)) {
       UpdateCustomKernelBuildInfo(kernel_node, true);
       return {};
     }
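A note on the new utils.cc helpers: each predicate builds its std::set on every call. That keeps include/common/utils/utils.h dependency-free and the lookup trivially safe to use from any thread, but call sites on hot paths pay a construction cost that the old global kOptOperatorSet lookups did not. If that ever shows up in profiles, a function-local static is the usual variant, sketched below as an editor's suggestion rather than code from this change; the "Pull" literal stands in for kPullOpName.

#include <set>
#include <string>

bool IsOneOfPosteriorOperator(const std::string &name) {
  // Constructed once, on first call, under the thread-safe C++11 static guard.
  static const std::set<std::string> kPosteriorOperatorSet = {"Pull"};
  return kPosteriorOperatorSet.count(name) > 0;
}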