!18448 Add debugger to new GPU runtime
Merge pull request !18448 from parastooashtari/new_unified_gpu
This commit is contained in:
commit d4aca69981
@@ -34,6 +34,15 @@
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "debug/env_config_parser.h"
#include "utils/comm_manager.h"
#include "runtime/framework/actor/actor_common.h"
#include "runtime/hardware/device_context_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif

using debugger::Chunk;
using debugger::EventReply;

@@ -228,6 +237,9 @@ bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  if (device_target_ == kGPUDevice) {
    return device::KernelRuntime::DumpDataEnabled();
  } else if (IsMindRTUsed()) {
    auto &dump_json_parser = DumpJsonParser::GetInstance();
    return dump_json_parser.e2e_dump_enabled();
  }
  return false;
}

@@ -289,8 +301,23 @@ void Debugger::Reset() {
  graph_ptr_list_.clear();
}

void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  uint32_t graph_sum = graphs.size();
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    if (debugger_) {
      debugger_->PreExecute(graph, graph_sum);
    }
    DumpSetup(graph);
  }
}

void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  CheckDatasetSinkMode();
  auto graph_id = graph_ptr->graph_id();

@@ -313,7 +340,6 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  if (!debugger_enabled_) {
    EnableDebugger();
  }

  if (debugger_enabled_) {
    if (graph_proto_list_.size()) {
      // only send compiled graphs once.

@@ -323,7 +349,9 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
      LoadParametersAndConst();
      // revert graph ptr to original value
      graph_ptr_ = dbg_graph_ptr;

      SendMultiGraphsAndSuspend(graph_proto_list_);

      graph_proto_list_.clear();
    } else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
      // stop only when receive the first sub run graph for each step

@@ -351,6 +379,89 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // resets for the new graph
  suspended_at_last_kernel_ = 0;
}

bool Debugger::DumpDataEnabledIteration() const {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (!dump_json_parser.e2e_dump_enabled()) {
    return false;
  }

  auto cur_iter = dump_json_parser.cur_dump_iter();
  if (dump_json_parser.IsDumpIter(cur_iter)) {
    return true;
  }
  return false;
}

void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  if (debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get());
  } else {
    DumpJsonParser::GetInstance().UpdateDumpIter();
  }
}

void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  E2eDump::DumpSetup(kernel_graph.get(), rank_id);
  MS_LOG(INFO) << "Finish!";
}

void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  // This function will be called for new GPU runtime using MindRTBackend
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    auto ms_context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(ms_context);
    std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    const auto &device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
    uint32_t rank_id = device_context->GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}

void Debugger::PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    bool dump_enabled = debugger_->DumpDataEnabledIteration();
    // debug used for dump
    if (debugger_ && dump_enabled) {
      debugger_->Dump(graph);
    } else {
      DumpJsonParser::GetInstance().UpdateDumpIter();
    }
    if (debugger_) {
      debugger_->PostExecute();
    }
  }
}

void Debugger::PostExecute() {
  // access lock for public method

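For orientation, the two per-step hooks added above are meant to be driven from the MindRT backend. The sketch below simply mirrors the MindRTBackend::RunGraph changes that appear later in this diff (names are taken from that hunk; error handling and the ENABLE_DEBUGGER guard are omitted), so it is a reading aid rather than additional code from the patch.

  // Sketch only: per-step debugger wiring on the new GPU runtime (see the backend.cc hunk below).
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->PreExecuteGraphDebugger(graph_compiler_info.graphs_);   // per graph: PreExecute + DumpSetup
  }
  runtime::GraphScheduler::GetInstance().Run(actor_set);              // kernels run; DebugActor::Debug fires per node
  if (debugger) {
    debugger->PostExecuteGraphDebugger(graph_compiler_info.graphs_);  // per graph: Dump or UpdateDumpIter, then PostExecute
  }
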
@@ -365,6 +476,7 @@ void Debugger::PostExecute() {
    num_step_++;
  }
  SendWatchpoints(CheckWatchpoints());

  // no need to suspend at each graph for GPU, suspension happens in preExecute
  if (device_target_ != kGPUDevice) {
    CommandLoop();

@@ -388,7 +500,6 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  }
  return false;
}

void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);

@@ -405,6 +516,7 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
    if (!hits.empty()) {
      SendWatchpoints(hits);
      CommandLoop();

      hit_empty_flag = false;
    }
  }

@@ -507,7 +619,6 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}

void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (SendMetadata(true)) {
    // send graph to Mindinsight server

@@ -533,7 +644,9 @@ bool Debugger::SendMetadata(bool version_check) {
  MS_LOG(INFO) << "Is training done?" << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);

  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);

  bool ret = false;
  if (reply_metadata.status() == reply_metadata.OK) {
    if (version_check) {

@@ -575,6 +688,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);

      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        chunked_graph_proto_list.push_back(chunk);

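As a rough worked example of the chunking above (the concrete g_chunk_size constant is defined elsewhere in debugger.cc and is not shown in this diff, so the 3 MB figure is only illustrative): if g_chunk_size were 3 MB, a 10 MB serialized graph proto would be cut by grpc_client_->ChunkString into four pieces, and each piece would be sent to MindInsight as its own Chunk message, with the last chunk only partially filled.
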
@@ -834,7 +948,6 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
  }
  return tensor_list;
}

void Debugger::Exit() {
  // clear resource before exit
  // debugger will notify main thread to exit because main thread can only exit at step boundary

@@ -1171,6 +1284,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  if (IsMindRTUsed() && (device_target_ == kGPUDevice)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // for parameters and value nodes, set its execution order to be 0;
  int exec_order = 0;
  std::string node_name = anf_node->fullname_with_scope();

@@ -1268,6 +1388,14 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
    ++num_step_;
  }
}

void Debugger::UpdateStepNumGPU() {
  // UpdateStepNum with DebugActor::DebugOnStepEnd
  if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}

void Debugger::ClearCurrentData() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))

@@ -73,6 +73,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // reset debugger
  void Reset();

  void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  // enable debugger
  // send graph and wait for command
  // do nothing if graph is set already

@@ -82,6 +83,16 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // don't need a graph_ptr because it is saved during pre_execute
  void PostExecute();

  bool DumpDataEnabledIteration() const;

  void Dump(const KernelGraphPtr &kernel_graph) const;

  void DumpSetup(const KernelGraphPtr &kernel_graph) const;

  void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);

  void PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);

  bool ReadNodeDataRequired(const CNodePtr &kernel) const;

  void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);

@@ -132,6 +143,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void UpdateStepNum(const session::KernelGraph *graph);

  void UpdateStepNumGPU();

  void ClearCurrentData();

  void LoadGraphOutputs();

@@ -194,7 +207,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void ProcessKSetCMD(const EventReply &reply);
  // Process the KViewCMD
  void ProcessKViewCMD(const EventReply &reply);

  // set what nodes and conditions to watch
  void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                     const ProtoVector<WatchCondition_Parameter> &parameters);

@@ -228,6 +240,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index);

  // class members

  std::unique_ptr<GrpcClient> grpc_client_;
  std::unique_ptr<DebugServices> debug_services_;
  KernelGraphPtr graph_ptr_;

@@ -249,6 +262,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::map<uint32_t, std::string> overflow_bin_path_;
  // flag to keep track of the very first suspension of debugger
  bool initial_suspend_;

  std::list<GraphProto> graph_proto_list_;
  std::list<KernelGraphPtr> graph_ptr_list_;

@@ -261,9 +275,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
};

using DebuggerPtr = std::shared_ptr<Debugger>;

// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);

ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);

// for getting proto DataType from Type of Tensor

@@ -282,7 +296,6 @@ int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);

// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);

@@ -168,7 +168,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
}

void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
}

void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

@@ -15,20 +15,134 @@
 */

#include "runtime/framework/actor/debug_actor.h"
#include <vector>
#include <memory>
#include <string>
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_GPU
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"

using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
#endif

namespace mindspore {
namespace runtime {
void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
                       const AID *from_aid) {

#ifdef ENABLE_GPU
static const size_t PARAMETER_OUTPUT_INDEX = 0;

std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  // define a vector containing real output number
  std::vector<int> real_outputs;
  // P.BatchNorm is used for training and inference
  // can add the filter list for more operators here....
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "loading node named " << node_name;
    real_outputs.insert(real_outputs.end(), {0, 3, 4});
  } else {
    // by default, TensorLoader will load all outputs
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}

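A quick illustration of what CheckRealOutput returns; the values follow directly from the branch above, and the operator names are just examples:

  std::vector<int> bn_outputs = CheckRealOutput("BatchNorm", 5);   // {0, 3, 4}: skip the training-only outputs
  std::vector<int> all_outputs = CheckRealOutput("Conv2D", 2);     // {0, 1}: the default path loads every output
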
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get inputs
  auto kernel_inputs = launch_info_->inputs_;
  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get outputs
  auto kernel_outputs = launch_info_->outputs_;
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = cnode->fullname_with_scope();
  std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);

  for (int j : real_outputs) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}
#endif

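Note the tensor naming convention produced by LoadInputs/LoadOutputs above: an input is stored under the full name of its producing node with the suffix ":0", and each output is stored under the kernel's full name plus the output index, which is the same name later used to look the tensor up (see GetTensorFullName in debugger.h). For a hypothetical kernel Default/Conv2D-op1 with two outputs this yields, illustratively:

  // names loaded for one kernel launch (examples, not from this patch)
  //   outputs: "Default/Conv2D-op1:0", "Default/Conv2D-op1:1"
  //   inputs:  "<input node fullname_with_scope>:0" for each real input
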
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
                       const DeviceContext *device_context, OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  if (node->isa<CNode>()) {
    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);
      bool read_data = false;
      auto &dump_json_parser = DumpJsonParser::GetInstance();
      bool dump_enabled = debugger->DumpDataEnabledIteration();
      if (dump_enabled) {
        auto dump_mode = dump_json_parser.dump_mode();
        // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
        if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
          read_data = true;
        }
      } else if (debugger->debugger_enabled()) {
        read_data = debugger->ReadNodeDataRequired(cnode);
      }
      if (read_data) {
        if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
          LoadInputs(cnode, launch_info_, exec_order_);
        }
        if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
          LoadOutputs(cnode, launch_info_, exec_order_);
        }
        // check if the node is last kernel
        bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
        debugger->PostExecuteNode(cnode, last_kernel);
      }
    }
    exec_order_ += 1;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

@@ -36,8 +150,16 @@ void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_conte
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::UpdateStepNumGPU();
    debugger->Debugger::LoadParametersAndConst();
    // Reset exec_order for the next step
    exec_order_ = 0;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

@@ -24,6 +24,7 @@
namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::kernel::KernelLaunchInfo;

// The debug actor is used to debug and dump kernel info, it gets the kernel real time execution info in the device, so
// it is synchronous and blocked.

@@ -33,12 +34,17 @@ class DebugActor : public ActorBase {
  ~DebugActor() override = default;

  // The debug of each node.
  void Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
             const AID *from_aid);
  void Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
             OpContext<DeviceTensor> *op_context, const AID *from_aid);

  // The debug on step end.
  void DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid);

 private:
  // class members
  uint32_t exec_order_ = 0;
};

}  // namespace runtime
}  // namespace mindspore

@@ -170,7 +170,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {
}

void KernelActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
}

void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

@@ -24,6 +24,10 @@
#include "ir/tensor.h"
#include "backend/optimizer/common/helper.h"
#include "base/base_ref_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"

namespace mindspore {
namespace runtime {

@@ -278,6 +282,9 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);

  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();

  // Execute optimization pass.
  auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
  device_context->OptimizeGraph(graph);

@@ -297,13 +304,20 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  }

  graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));

#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  debugger->DumpInGraphCompiler(graph);
#endif
  MS_EXCEPTION_IF_NULL(session_);
  session_->InitAllBucket(graph, device_context);

  session_->SetSummaryNodes(graph.get());
  SetSummaryNodesRefCount(graph.get());

#ifdef ENABLE_DEBUGGER
  if (debugger && debugger->DebuggerBackendEnabled()) {
    debugger->LoadGraphs(graph);
  }
#endif
  return graph->graph_id();
}

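For reference, the DumpInGraphCompiler call added above writes the graph-level artifacts under the dump path configured in the dump JSON. With a hypothetical dump path of /tmp/dump, rank 0 and graph id 0, the path construction shown earlier in this diff produces:

  /tmp/dump/rank_0/graphs/ms_output_trace_code_graph_0.ir      (readable IR with source info, from DumpIR)
  /tmp/dump/rank_0/ms_execution_order_graph_0.csv              (kernel execution order, from DumpGraphExeOrder)

plus a proto export of the same graph written by DumpIRProtoWithSrcInfo into the graphs/ directory (its exact file name is determined by that helper and is not visible in this diff).
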
@@ -31,7 +31,9 @@
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif

#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {

@@ -371,6 +373,18 @@ void GraphScheduler::Initialize() {
    (void)actorMgr->Spawn(base_recorder_actor, true);
  }
#endif
  // Create and schedule debug actor.
#ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  if (debugger->DebuggerBackendEnabled()) {
    auto debug_actor = std::make_shared<DebugActor>();
    MS_EXCEPTION_IF_NULL(debug_actor);
    debug_aid_ = &(debug_actor->GetAID());
    auto base_debug_actor = static_cast<ActorReference>(debug_actor);
    base_debug_actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(base_debug_actor, true);
  }
#endif
}

ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info, GraphExecutionStrategy strategy) {

@@ -37,6 +37,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/running_data_recorder.h"
#include "utils/comm_manager.h"
#include "debug/debugger/debugger.h"

namespace mindspore {
namespace device {

@@ -91,6 +92,12 @@ bool GPUDeviceContext::Initialize() {
    (*init_nccl_comm_funcptr)();
  }

  auto rank_id = GetRankID();
  auto &json_parser = DumpJsonParser::GetInstance();
  // Dump json config file if dump is enabled
  json_parser.CopyJsonToDir(rank_id);
  json_parser.CopyMSCfgJsonToDir(rank_id);

  initialized_ = true;
  return ret;
}

@@ -125,6 +132,12 @@ bool GPUDeviceContext::InitDevice() {

void GPUDeviceContext::Destroy() {
  // Release GPU buffer manager resource
  auto debugger = Debugger::GetInstance();
  if (debugger && debugger->debugger_enabled()) {
    debugger->SetTrainingDone(true);
    debugger->SendMetadata(false);
  }

  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
      MS_LOG(EXCEPTION) << "Could not close gpu data queue.";

@@ -36,7 +36,9 @@
#ifdef ENABLE_GE
#include "utils/callbacks_ge.h"
#endif

#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }

@@ -577,10 +579,24 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
  const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
  MS_EXCEPTION_IF_NULL(actor_set);
  runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);

  // PreExecuteGraph
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::PreExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
    MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
  }

  // PostExecuteGraph
#ifdef ENABLE_DEBUGGER
  if (debugger) {
    debugger->Debugger::PostExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  // Sync device stream.
  const auto &first_device_context = graph_compiler_info.device_contexts_[0];
  MS_EXCEPTION_IF_NULL(first_device_context);

@@ -658,6 +674,15 @@ void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node,
  }
}

#ifdef ENABLE_DEBUGGER
void MindRTBackend::SetDebugger() {
  auto debugger_ = Debugger::GetInstance();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  debugger_->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif

std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
  MS_EXCEPTION_IF_NULL(root_graph);
  MS_EXCEPTION_IF_NULL(graph_compiler_);

@@ -118,6 +118,9 @@ class MindRTBackend : public Backend {
  // Run Graph in the pyNative mode.
  void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
                const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
  void SetDebugger() override;
#endif

 private:
  // The parameter func_graph is a graph, it can be either a root graph or a sub graph,