backend

2022-06-07 09:55:48 +08:00 · 2022-06-07 09:55:48 +08:00 · 6e86fa29ff
parent 2d2d9d8122
commit 6e86fa29ff
28 changed files with 173 additions and 88 deletions
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@ -123,7 +123,7 @@ if(ENABLE_TESTCASES OR (NOT ENABLE_D))
 endif()

 if(NOT (ENABLE_TESTCASES OR ENABLE_TEST) AND NOT (CMAKE_SYSTEM_NAME MATCHES "Windows" OR
-        CMAKE_SYSTEM_NAME MATCHES "Darwin"))
+        CMAKE_SYSTEM_NAME MATCHES "Darwin") AND (ENABLE_D OR ENABLE_GPU OR ENABLE_CPU))
    add_compile_definitions(WITH_BACKEND)
 endif()

--- a/mindspore/ccsrc/backend/common/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/common/session/ascend_session.cc
@ -69,7 +69,7 @@
 #include "include/common/debug/rdr/recorder_manager.h"
 #include "debug/rdr/graph_recorder.h"
 #endif
-#if ENABLE_CPU && ENABLE_D
+#ifdef WITH_BACKEND
 #include "ps/util.h"
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
@ -247,7 +247,7 @@ bool TensorNeedSync(const std::shared_ptr<KernelGraph> &kernel_graph, const AnfN
      }
      MS_EXCEPTION_IF_NULL(memcpy_nums);
      (*memcpy_nums)++;
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
      const std::string &param_name = parameter->fullname_with_scope();
      if (ps::ps_cache_instance.IsHashTable(param_name)) {
        return false;
@ -346,7 +346,7 @@ void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_gra
    }
    if (AnfAlgo::OutputAddrExist(input_node, 0) &&
        TensorNeedSync(kernel_graph, input_node, tensor, &device_memcpy_nums)) {
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
      const std::string &param_name = input_node->fullname_with_scope();
      if (ps::ps_cache_instance.IsHashTable(param_name)) {
        continue;
@ -459,7 +459,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {

  // adjust kernel
  AdjustKernel(root_graph);
-#if ENABLE_CPU && ENABLE_D
+#ifdef WITH_BACKEND
  InitPsWorker(root_graph);
 #endif
  // assign stream
@ -538,7 +538,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
  single_graph->UpdateExecuteKernelStreamLabel();
  // adjust execution order because  merge child graph and other special operations
  AdjustKernel(graph);
-#if ENABLE_CPU && ENABLE_D
+#ifdef WITH_BACKEND
  InitPsWorker(graph);
 #endif
  // Assign streams for control sink and hccl and so on
@ -616,7 +616,7 @@ void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_g
    debugger_->PreExecute(kernel_graph);
  }
 #endif
-#if ENABLE_CPU && ENABLE_D
+#ifdef WITH_BACKEND
  // Initialize parameter server
  InitPSParamAndOptim(kernel_graph, inputs);
  std::string channel_name;
@ -1000,17 +1000,19 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
 }

 #ifndef ENABLE_SECURITY
-void DumpInit(uint32_t device_id) {
+void DumpInit(const std::string &device_type, uint32_t device_id) {
  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();
  json_parser.CopyDumpJsonToDir(device_id);
  json_parser.CopyHcclJsonToDir(device_id);
  json_parser.CopyMSCfgJsonToDir(device_id);
  if (json_parser.async_dump_enabled()) {
-#ifdef ENABLE_D
-    // register callback to adx
-    if (json_parser.FileFormatIsNpy()) {
-      AdxRegDumpProcessCallBack(DumpDataCallBack);
+#if !(defined(ENABLE_TEST) || defined(ENABLE_TESTCASES))
+    if (device_type == kAscendDevice) {
+      // register callback to adx
+      if (json_parser.FileFormatIsNpy()) {
+        AdxRegDumpProcessCallBack(DumpDataCallBack);
+      }
    }
 #endif
    if (AdxDataDumpServerInit() != 0) {
@ -1035,7 +1037,8 @@ void AscendSession::InitRuntimeResource() {
    rank_id_ = GetRankId();
  }
 #ifndef ENABLE_SECURITY
-  DumpInit(rank_id_);
+  auto device_type = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  DumpInit(device_type, rank_id_);
 #endif
  MS_LOG(INFO) << "Status record: end init runtime resource.";
 }
--- a/mindspore/ccsrc/backend/common/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/common/session/cpu_session.cc
@ -27,7 +27,9 @@
 #include "plugin/factory/ms_factory.h"
 #include "runtime/device/kernel_runtime.h"
 #include "plugin/device/cpu/kernel/cpu_kernel.h"
+#ifdef ENABLE_AKG
 #include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
+#endif
 #include "plugin/device/cpu/hal/device/kernel_select_cpu.h"
 #include "backend/common/optimizer/optimizer.h"
 #include "backend/common/optimizer/pass_manager.h"
@ -42,7 +44,7 @@
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
 #include "ps/util.h"
 #include "ps/ps_context.h"
 #endif
@ -87,7 +89,7 @@ void CPUSession::Reorder(std::vector<CNodePtr> *node_list) {
 void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode && ps::PSContext::instance()->is_ps_mode()) {
@ -195,7 +197,7 @@ void CPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_grap
  MS_LOG(INFO) << "Bind input output address";
  runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs);

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  InitPSParamAndOptim(kernel_graph, inputs);
 #endif
 }
--- a/mindspore/ccsrc/backend/common/session/executor.cc
+++ b/mindspore/ccsrc/backend/common/session/executor.cc
@ -22,9 +22,6 @@
 #include "include/common/utils/comm_manager.h"
 #include "include/common/utils/scoped_long_running.h"
 #include "pybind_api/ir/tensor_py.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
-#include "ps/ps_cache/ps_cache_manager.h"
-#endif

 using mindspore::tensor::TensorPy;
 namespace mindspore {
--- a/mindspore/ccsrc/backend/common/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/common/session/gpu_session.cc
@ -85,7 +85,7 @@
 #include "common/graph_kernel/graph_kernel_flags.h"
 #include "include/common/utils/utils.h"
 #include "abstract/utils.h"
-#if ENABLE_CPU && ENABLE_GPU
+#ifdef WITH_BACKEND
 #include "ps/util.h"
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
@ -358,7 +358,7 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
    auto input_node = input_nodes[i];
    MS_EXCEPTION_IF_NULL(input_node);
    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
-#if ENABLE_CPU && ENABLE_GPU
+#ifdef WITH_BACKEND
      const std::string &param_name = input_node->fullname_with_scope();
      if (ps::ps_cache_instance.IsHashTable(param_name)) {
        continue;
@ -438,7 +438,7 @@ GraphId GPUSession::CompileGraphImpl(const KernelGraphPtr &graph) {
  GraphKernelOptimize(graph);
  // Start gpu kernel runtime
  StartKernelRT();
-#if ENABLE_CPU && ENABLE_GPU
+#ifdef WITH_BACKEND
  InitPsWorker(graph);
 #endif
  // Assign CUDA streams
@ -518,7 +518,7 @@ void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_grap
  E2eDump::UpdateIterOldRTDump(kernel_graph.get());
 #endif

-#if ENABLE_CPU && ENABLE_GPU
+#ifdef WITH_BACKEND
  // Initialize parameter server
  InitPSParamAndOptim(kernel_graph, inputs);
 #endif
@ -555,7 +555,7 @@ void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph)
  int kernel_num = kernel_graph->execution_order().size();
  int64_t loopsize = (kernel_num > 1) ? ConfigManager::GetInstance().gpu_loopsink_size() : 1;
  for (int64_t i = 0; i < loopsize; i++) {
-#if ENABLE_CPU && ENABLE_GPU
+#ifdef WITH_BACKEND
    std::string channel_name;
    if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
      ps::ps_cache_instance.IncreaseGraphStep(channel_name);
@ -605,7 +605,7 @@ void GPUSession::UpdateOutputTensors(const VectorRef *outputs,
        // But one time memory application scenarios need to be skipped, because the memory is not allocated next step:
        // 1. Non cnode 2. Communication kernel.
        bool ps_mode = false;
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
        ps_mode = ps::PSContext::instance()->is_ps_mode();
 #endif
        if (node->isa<CNode>() && !common::AnfAlgo::IsCommunicationOp(node) && !ps_mode) {
--- a/mindspore/ccsrc/backend/common/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/common/session/session_basic.cc
@ -46,7 +46,7 @@
 #include "utils/file_utils.h"
 #include "utils/trace_base.h"
 #include "include/common/utils/parallel_context.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 #include "ps/ps_cache/ps_cache_manager.h"
 #include "ps/constants.h"
 #include "ps/util.h"
@ -520,7 +520,7 @@ void SetReturnNode(const AnfNodePtr &node, KernelGraph *graph) {
  }
 }

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 // Get all users of this node
 void GetNodeUsedList(const FuncGraphPtr &kernel_graph, const AnfNodePtr &node,
                     std::vector<AnfNodePtr> *node_users_list) {
@ -2973,7 +2973,7 @@ void SessionBasic::DumpGraphs(const std::vector<KernelGraphPtr> &graphs) {

 void SessionBasic::UnifyMindIR(const KernelGraphPtr &graph) { opt::CommonUnifyMindIR(graph); }

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 void SessionBasic::InitPsWorker(const KernelGraphPtr &kernel_graph) {
  if (!ps::PSContext::instance()->is_worker()) {
    return;
--- a/mindspore/ccsrc/backend/common/session/session_basic.h
+++ b/mindspore/ccsrc/backend/common/session/session_basic.h
@ -344,7 +344,7 @@ class BACKEND_EXPORT SessionBasic : public std::enable_shared_from_this<SessionB
  std::vector<uint32_t> GetAllReduceSplitIndex();
  virtual std::string GetCommWorldGroup() { return std::string(); }
  void DumpGraphs(const std::vector<KernelGraphPtr> &graphs);
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  void CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) const;
  void GetBatchElements(const AnfNodePtr &kernel_node) const;
  void InitPsWorker(const KernelGraphPtr &kernel_graph);
--- a/mindspore/ccsrc/backend/graph_compiler/backend.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/backend.cc
@ -40,16 +40,13 @@
 #include "runtime/pynative/graph_adapter.h"
 #include "distributed/recovery/recovery_context.h"
 #include "include/common/utils/scoped_long_running.h"
-#ifdef ENABLE_D
-#include "include/common/utils/callbacks_ge.h"
-#endif
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 #include "ps/ps_context.h"
 #endif

@ -352,7 +349,7 @@ VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const s
  const session::SessionPtr &exe_session = ((target != target_device_ && !target.empty()) ? other_sess_ : target_sess_);
  MS_EXCEPTION_IF_NULL(exe_session);

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  // If in PS mode, must use sync mode to run graph in case that the weights on server are not updated in the last step.
  if (ps::PSContext::instance()->is_ps_mode()) {
    exe_session->RunGraph(g, inputs, &outputs);
--- a/mindspore/ccsrc/backend/graph_compiler/transform.cc
+++ b/mindspore/ccsrc/backend/graph_compiler/transform.cc
@ -28,7 +28,7 @@
 #include "ir/graph_utils.h"
 #include "utils/ms_context.h"
 #include "utils/trace_base.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
 #include "ps/ps_context.h"
 #endif

@ -598,7 +598,7 @@ void SetMindRTEnable() {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && (!defined __APPLE__))
+#ifdef WITH_BACKEND
  if (ps::PSContext::instance()->is_ps_mode() && !ps::PSContext::instance()->enable_distributed_mindrt()) {
    context_ptr->set_param<bool>(MS_CTX_ENABLE_MINDRT, false);
    return;
--- a/mindspore/ccsrc/common/graph_kernel/graph_kernel_build.cc
+++ b/mindspore/ccsrc/common/graph_kernel/graph_kernel_build.cc
@ -26,12 +26,7 @@
 #include "kernel/akg/akg_kernel_json_generator.h"
 #include "common/graph_kernel/graph_kernel_helper.h"
 #include "common/graph_kernel/core/graph_kernel_utils.h"
-#if ENABLE_D
-#include "plugin/device/ascend/kernel/akg/akg_ascend_kernel_build.h"
-#elif ENABLE_GPU
-#include "plugin/device/gpu/kernel/akg/akg_gpu_kernel_build.h"
-#endif
-#include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
+#include "kernel/akg/akg_kernel_build_manager.h"

 namespace mindspore::graphkernel {
 namespace {
@ -151,21 +146,13 @@ void GraphKernelBuild::Init() {
  }

  // Init AkgKernelBuilder.
-#if ENABLE_D
-  if (Callback::Instance()->GetTargetFromContext() == kCPUDevice) {
-    kernel_builder_ = std::make_shared<kernel::AkgCpuKernelBuilder>();
+  if (Callback::Instance()->GetTargetFromContext() == kGPUDevice) {
+    kernel_builder_ = kernel::AkgKernelBuildManager::Instance().GetAkgKernelBuilder(kGPUDevice);
+  } else if (Callback::Instance()->GetTargetFromContext() == kAscendDevice) {
+    kernel_builder_ = kernel::AkgKernelBuildManager::Instance().GetAkgKernelBuilder(kAscendDevice);
  } else {
-    kernel_builder_ = std::make_shared<kernel::AkgAscendKernelBuilder>();
+    kernel_builder_ = kernel::AkgKernelBuildManager::Instance().GetAkgKernelBuilder(kCPUDevice);
  }
-#elif ENABLE_GPU
-  if (Callback::Instance()->GetTargetFromContext() == kCPUDevice) {
-    kernel_builder_ = std::make_shared<kernel::AkgCpuKernelBuilder>();
-  } else {
-    kernel_builder_ = std::make_shared<kernel::AkgGpuKernelBuilder>();
-  }
-#elif ENABLE_CPU
-  kernel_builder_ = std::make_shared<kernel::AkgCpuKernelBuilder>();
-#endif
 }

 bool GraphKernelBuild::Process(const FuncGraphPtr &func_graph, int iter) {
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -1767,7 +1767,6 @@ bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
  return false;
 }

-#ifdef ENABLE_D
 /*
 * Feature group: Dump.
 * Target device group: Ascend.
@ -1805,6 +1804,5 @@ void Debugger::WaitForWriteFileFinished() {
    recheck_cnt++;
  }
 }
-#endif

 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@ -27,9 +27,7 @@
 #include "debug/debugger/grpc_client.h"
 #include "debug/debug_services.h"
 #include "runtime/device/ms_device_shape_transfer.h"
-#ifdef ENABLE_D
 #include "debug/dump_data_builder.h"
-#endif
 #include "runtime/device/device_address.h"
 #include "include/backend/visible.h"

@ -197,13 +195,11 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {

  std::vector<AnfNodePtr> GetParametersMindRT() const { return parameters_mindRT_; }

-#ifdef ENABLE_D
  std::shared_ptr<DumpDataBuilder> LoadDumpDataBuilder(const std::string &node_name);

  void ClearDumpDataBuilder(const std::string &node_name);

  void WaitForWriteFileFinished();
-#endif

 private:
  // private constructor for singleton
@ -331,10 +327,8 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
  // map to store iter num in each epoch when dataset_sink_mode is true
  std::map<uint32_t, int32_t> graph_iter_num_map_;

-#ifdef ENABLE_D
  // to construct kernel data for async dump, key is the dump path to the node
  std::map<std::string, std::shared_ptr<DumpDataBuilder>> dump_data_construct_map_;
-#endif

  // singleton
  inline static std::mutex instance_lock_ = {};
--- a/mindspore/ccsrc/distributed/init.h
+++ b/mindspore/ccsrc/distributed/init.h
@ -20,7 +20,7 @@
 #include <vector>
 #include <string>
 #include "distributed/collective/collective_manager.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 #include "distributed/cluster/cluster_context.h"
 #else
 #include "distributed/cluster/dummy_cluster_context.h"
--- a/mindspore/ccsrc/kernel/CMakeLists.txt
+++ b/mindspore/ccsrc/kernel/CMakeLists.txt
@ -14,6 +14,7 @@ endif()
 if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
    file(GLOB_RECURSE AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "akg/akg_kernel_build.cc"
+        "akg/akg_kernel_build_manager.cc"
        "akg/akg_kernel_json_generator.cc"
        "akg/akg_kernel_json_decoder.cc"
    )
--- a/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.cc
+++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.cc
@ -0,0 +1,41 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/akg/akg_kernel_build_manager.h"
+#include <memory>
+namespace mindspore {
+namespace kernel {
+// Returns the shared manager object. It is a function-local static, so it is constructed the
+// first time this function is called and the same object is handed out on every call.
+AkgKernelBuildManager &AkgKernelBuildManager::Instance() {
+  static AkgKernelBuildManager manager;
+  return manager;
+}
+
+/**
+ * Registers the creator used to build the AKG kernel builder of a device type.
+ *
+ * The first registration for a given device type wins: when the key is already present the
+ * stored creator is kept and the new one is ignored.
+ *
+ * @param device_type Key under which the creator is stored.
+ * @param creator Factory callable; it is moved into the registry only when the key is new.
+ */
+void AkgKernelBuildManager::Register(const std::string &device_type, AkgKernelBuildCreator &&creator) {
+  // A named rvalue reference is an lvalue, so it must be moved explicitly or the std::function is
+  // copied. try_emplace does a single lookup and leaves `creator` untouched if the key exists.
+  (void)base_map_.try_emplace(device_type, std::move(creator));
+}
+
+/**
+ * Builds a kernel builder through the creator registered for `device_type`.
+ *
+ * @param device_type Key to look up in the registry.
+ * @return Whatever the registered creator returns, or nullptr when no creator is registered
+ *         under `device_type`.
+ */
+std::shared_ptr<AkgKernelBuilder> AkgKernelBuildManager::GetAkgKernelBuilder(const std::string &device_type) {
+  auto iter = base_map_.find(device_type);
+  if (iter == base_map_.end()) {
+    // Nothing registered for this device type.
+    return nullptr;
+  }
+  // Guard against an empty creator before invoking it (macro is defined outside this file).
+  MS_EXCEPTION_IF_NULL(iter->second);
+  return iter->second();
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
+++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build_manager.h
@ -0,0 +1,53 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_MANAGER_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_MANAGER_H_
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include "kernel/akg/akg_kernel_build.h"
+
+namespace mindspore {
+namespace kernel {
+// Callable that returns a shared_ptr to an AkgKernelBuilder.
+using AkgKernelBuildCreator = std::function<std::shared_ptr<AkgKernelBuilder>()>;
+
+// Registry that maps a device type string to the creator of its AKG kernel builder.
+class AkgKernelBuildManager {
+ public:
+  // Accessor for the shared manager object.
+  static AkgKernelBuildManager &Instance();
+
+  // Invokes the creator stored for `device_type` and returns its result;
+  // returns nullptr when nothing is stored under that key.
+  std::shared_ptr<AkgKernelBuilder> GetAkgKernelBuilder(const std::string &device_type);
+
+  // Stores `creator` under `device_type` unless that key is already present.
+  void Register(const std::string &device_type, AkgKernelBuildCreator &&creator);
+
+ private:
+  // device type -> builder creator
+  std::map<std::string, AkgKernelBuildCreator> base_map_;
+};
+
+// Helper whose constructor registers a creator with AkgKernelBuildManager. It is meant to be
+// instantiated as a static object through REG_AKG_KERNEL_BUILDER.
+class AkgKernelBuildRegister {
+ public:
+  AkgKernelBuildRegister(const std::string &device_type, AkgKernelBuildCreator &&creator) {
+    auto &manager = AkgKernelBuildManager::Instance();
+    manager.Register(device_type, std::move(creator));
+  }
+  ~AkgKernelBuildRegister() = default;
+};
+
+// Defines a file-local static AkgKernelBuildRegister whose construction registers a creator that
+// returns std::make_shared<BUILDER_CLASS>() under the key DEVICE_TYPE.
+// DEVICE_TYPE must be a plain identifier because it is also pasted into the variable name.
+// The generated name ends in a single-underscore `_reg`: identifiers that contain `__` are
+// reserved for the implementation, so the two parts must not be joined with an extra `_`.
+// The trailing semicolon is kept so existing uses, with or without their own `;`, still compile.
+#define REG_AKG_KERNEL_BUILDER(DEVICE_TYPE, BUILDER_CLASS)                      \
+  static const AkgKernelBuildRegister g_akg_kernel_builder_##DEVICE_TYPE##_reg( \
+    DEVICE_TYPE, []() { return std::make_shared<BUILDER_CLASS>(); });
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_MANAGER_H_
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_kernel_runtime.cc
@ -245,12 +245,15 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
 #ifndef ENABLE_SECURITY
 void AsyncDataDumpUninit() {
  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
-#if ENABLE_D
-    // When it is A+M dump mode, wait until file save is finished.
-    if (DumpJsonParser::GetInstance().FileFormatIsNpy()) {
-      Debugger::GetInstance()->WaitForWriteFileFinished();
+    auto ms_context = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(ms_context);
+    auto device_type = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+    if (device_type == kAscendDevice) {
+      // When it is A+M dump mode, wait until file save is finished.
+      if (DumpJsonParser::GetInstance().FileFormatIsNpy()) {
+        Debugger::GetInstance()->WaitForWriteFileFinished();
+      }
    }
-#endif
    if (AdxDataDumpServerUnInit() != 0) {
      MS_LOG(ERROR) << "Adx data dump server uninit failed";
    }
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc
@ -239,7 +239,7 @@ void DumpInit(uint32_t device_id) {
  json_parser.CopyHcclJsonToDir(device_id);
  json_parser.CopyMSCfgJsonToDir(device_id);
  if (json_parser.async_dump_enabled()) {
-#ifdef ENABLE_D
+#if !(defined(ENABLE_TEST) || defined(ENABLE_TESTCASES))
    // register callback to adx
    if (json_parser.FileFormatIsNpy()) {
      AdxRegDumpProcessCallBack(DumpDataCallBack);
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/akg/akg_ascend_kernel_build.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/akg/akg_ascend_kernel_build.h
@ -23,6 +23,7 @@
 #include <map>
 #include "ir/anf.h"
 #include "kernel/akg/akg_kernel_build.h"
+#include "kernel/akg/akg_kernel_build_manager.h"

 namespace mindspore {
 namespace kernel {
@ -39,6 +40,8 @@ class AkgAscendKernelBuilder : public AkgKernelBuilder {
                       const AnfNodePtr &anf_node) override;
  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
 };
+
+REG_AKG_KERNEL_BUILDER(kAscendDevice, AkgAscendKernelBuilder);
 }  // namespace kernel
 }  // namespace mindspore

--- a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
@ -18,7 +18,9 @@
 #include <string>
 #include "plugin/device/cpu/hal/device/cpu_device_address.h"
 #include "plugin/device/cpu/hal/device/cpu_memory_manager.h"
+#ifdef ENABLE_AKG
 #include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
+#endif
 #include "plugin/factory/ms_factory.h"
 #include "plugin/device/cpu/kernel/cpu_kernel.h"
 #include "kernel/kernel_build_info.h"
@ -40,7 +42,7 @@
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
 #include "profiler/device/cpu/cpu_profiling.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 #include "plugin/device/cpu/hal/hardware/ms_collective_comm_lib.h"
 #endif
 #ifndef ENABLE_SECURITY
@ -368,7 +370,7 @@ bool CPUDeviceContext::LoadCollectiveCommLib() {
    collective_comm_lib_ = instance_func();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
  } else {
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
    collective_comm_lib_ = &MsCollectiveCommLib::GetInstance();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
 #endif
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h
@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
 #include <string>
 #include "kernel/akg/akg_kernel_build.h"
+#include "kernel/akg/akg_kernel_build_manager.h"
 #include "base/base.h"

 namespace mindspore {
@ -32,6 +33,7 @@ class AkgCpuKernelBuilder : public AkgKernelBuilder {
                       const AnfNodePtr &anf_node) override;
  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
 };
+REG_AKG_KERNEL_BUILDER(kCPUDevice, AkgCpuKernelBuilder);
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/allreduce_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/allreduce_cpu_kernel.cc
@ -20,13 +20,13 @@
 #include <functional>
 #include <memory>

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 #include "plugin/device/cpu/hal/hardware/ms_collective_comm_lib.h"
 #endif

 namespace mindspore {
 namespace kernel {
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 using device::CollectiveOpReduceType::Reduce_Sum;
 using device::cpu::kMCCLGlobalGroupName;
 using device::cpu::MsCollectiveCommLib;
@ -37,7 +37,7 @@ constexpr char kSupportedReduceOp[] = "sum";
 }  // namespace

 void AllReduceCPUKernelMod::InitKernel(const CNodePtr &kernel_node) {
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
  auto kernel_attr = GetKernelAttrFromNode(kernel_node);
@ -67,7 +67,7 @@ std::vector<KernelAttr> AllReduceCPUKernelMod::GetOpSupport() {
 bool AllReduceCPUKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  if (inputs.empty() || outputs.empty()) {
    MS_LOG(EXCEPTION) << kernel_name_ << " has at least one input and one output, but got 0.";
  }
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/akg/akg_gpu_kernel_build.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/akg/akg_gpu_kernel_build.h
@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_
 #include <string>
 #include "kernel/akg/akg_kernel_build.h"
+#include "kernel/akg/akg_kernel_build_manager.h"
 #include "base/base.h"

 namespace mindspore {
@ -33,6 +34,7 @@ class AkgGpuKernelBuilder : public AkgKernelBuilder {
  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
 };

+REG_AKG_KERNEL_BUILDER(kGPUDevice, AkgGpuKernelBuilder);
 }  // namespace kernel
 }  // namespace mindspore

--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@ -37,7 +37,7 @@
 #include "include/common/utils/parallel_context.h"
 #include "include/common/debug/env_config_parser.h"
 #include "plugin/device/ascend/hal/device/ascend_device_address.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
 #include "kernel/common_utils.h"
@ -599,7 +599,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph &graph) {
    }
    add_need_alloc_nodes(input_node);
  }
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
  bool ps_cache_check = false;
 #endif
  std::map<AnfNodePtr, AnfNodePtr> shadow_backend_node_map;
@ -615,7 +615,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph &graph) {
        continue;
      }
      DeviceAddressPtr device_address = GetInternalDeviceAddress(graph, item);
-#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
      const std::string &param_name = item->fullname_with_scope();
      if (ps::ps_cache_instance.IsHashTable(param_name)) {
        MS_LOG(INFO) << "Parameter(" << param_name << ")"
@ -1773,7 +1773,7 @@ void KernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " runtime resource";
 }

-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
 namespace {
 // Finalize ps cache module before throw an exception.
 void FinalizePsCache(const std::string &exception) {
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@ -193,7 +193,7 @@ class KernelRuntime {
  void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, const session::KernelGraph &graph);
  void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
  DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index) const;
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
  void GetFirstPSEmbeddingCache(const session::KernelGraph &graph, AnfNodePtr *const first_cache_input_index,
                                size_t *const first_cache_size);
  void CheckIfSupportPSEmbeddingCache(const session::KernelGraph &graph);
--- a/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc
@ -16,7 +16,7 @@

 #include "runtime/device/kernel_runtime_manager.h"
 #include "utils/log_adapter.h"
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
 #include "backend/common/session/pynative_task_manager.h"
@ -26,7 +26,7 @@ namespace device {
 void KernelRuntimeManager::ClearRuntimeResource() {
  // Just remove PyNative tasks before runtime resource release.
  session::PynativeTaskManager::GetInstance().Reset();
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
  if (ps::PSContext::instance()->is_worker() && ps::PsDataPrefetch::GetInstance().cache_enable()) {
    ps::ps_cache_instance.SyncEmbeddingTable();
  }
@ -132,7 +132,7 @@ void KernelRuntimeManager::ReleaseKernelRuntime(const std::string &device_name,
  if (runtime == nullptr) {
    return;
  }
-#if ((defined ENABLE_CPU) && (!defined _WIN32))
+#ifdef WITH_BACKEND
  if (ps::PSContext::instance()->is_worker() && ps::PsDataPrefetch::GetInstance().cache_enable()) {
    ps::ps_cache_instance.SyncEmbeddingTable();
  }
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_set.h
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_set.h
@ -17,7 +17,7 @@
 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_SET_H_
 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_SET_H_

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && (!defined __APPLE__))
+#ifdef WITH_BACKEND
 #define ENABLE_RPC_ACTOR
 #endif

--- a/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
@ -150,7 +150,7 @@ void IntHandler(int, siginfo_t *, void *) {
 }
 #endif

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && (!defined _WIN64) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
 bool SendFinishTransform(const std::string &actor_set_name) {
  auto node = ClusterContext::instance()->node();
  MS_EXCEPTION_IF_NULL(node);
@ -479,7 +479,7 @@ ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info
  Optimize(actor_set);
  MS_LOG(INFO) << "Graph(" << graph_compiler_info.name_ << ") transforms actor end.";

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && (!defined _WIN64) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  if (ClusterContext::instance()->initialized() && RecoveryContext::GetInstance()->enable_recovery()) {
    while (!SendFinishTransform(graph_compiler_info.name_)) {
      MS_LOG(WARNING) << "Send finish transform graph failed.";
@ -570,7 +570,7 @@ void GraphScheduler::Run(ActorSet *const actor_set, const std::vector<DeviceCont
  const size_t kSecondsToMilliseconds = 1000;
  SetActorExecutionStrategy(actor_set, strategy, (end_time - start_time) * kSecondsToMilliseconds);

-#if ((defined ENABLE_CPU) && (!defined _WIN32) && (!defined _WIN64) && !defined(__APPLE__))
+#ifdef WITH_BACKEND
  DoDisasterRecovery(actor_set->name_);
 #endif
 }