!6907 Load input tensors in debugger before suspending execution

Merge pull request !6907 from Harshvardhan Gupta/load-input-dbg
mindspore-ci-bot 2020-10-08 21:50:45 +08:00 committed by Gitee
commit 9c79b9d712
13 changed files with 107 additions and 140 deletions
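In short, the change moves the debugger's suspension point: `PreExecute` is now invoked in the session `RunGraph` paths (Ascend and GPU) after `LoadInputData`, so user input tensors are already on the device when execution suspends, and parameter loading is centralized in a new `Debugger::LoadParameters()` that runs before the graph is sent to the client. Below is a minimal standalone sketch of that ordering; all types here are mocks for illustration only, not MindSpore code.

```cpp
// Mock sketch of the call order this change establishes: copy user input to
// the device first, then let the debugger load tensors and suspend, so input
// values are visible to the client at the first breakpoint.
#include <iostream>
#include <vector>

struct MockGraph { std::vector<int> inputs; };

struct MockDebugger {
  // Stand-in for Debugger::PreExecute: load tensors, then suspend (sketch only).
  void PreExecute(const MockGraph &g) {
    std::cout << "debugger: loaded " << g.inputs.size()
              << " input tensor(s), suspending for client\n";
  }
};

void LoadInputData(MockGraph *g, const std::vector<int> &user_inputs) {
  g->inputs = user_inputs;  // stand-in for the host-to-device copy
}

void RunGraph(MockGraph *g, MockDebugger *debugger, const std::vector<int> &inputs) {
  LoadInputData(g, inputs);                 // inputs reach the device first
  if (debugger) debugger->PreExecute(*g);   // only then does execution suspend
}

int main() {
  MockGraph graph;
  MockDebugger dbg;
  RunGraph(&graph, &dbg, {1, 2, 3});
}
```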

View File

@ -171,7 +171,7 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
// build kernel
BuildKernel(root_graph);
if (debugger_) {
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(root_graph);
}
SetSummaryNodes(root_graph.get());
@ -248,7 +248,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
BuildKernel(graph);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (debugger_) {
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(graph);
}
if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
@ -312,6 +312,9 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
}
// load input data from user input
LoadInputData(kernel_graph, inputs);
if (debugger_) {
debugger_->PreExecute(kernel_graph);
}
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);

View File

@ -278,9 +278,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
PreIterationDbg(kernel_graph);
// Load input data from user input
LoadInputData(kernel_graph, inputs);
PreIterationDbg(kernel_graph);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);

View File

@ -22,7 +22,6 @@
#include <utility>
#include <memory>
#include <map>
#include "backend/session/session_context.h"
#include "backend/session/kernel_graph.h"
#include "backend/session/anf_runtime_algorithm.h"

View File

@ -30,6 +30,7 @@
#include "pipeline/jit/pipeline.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/kernel_runtime.h"
using debugger::EventReply;
using debugger::GraphProto;
@ -47,6 +48,7 @@ namespace mindspore {
DebuggerPtr Debugger::debugger_ = nullptr;
std::mutex Debugger::instance_lock_;
static const size_t PRAMATER_OUTPUT_INDEX = 0;
Debugger::Debugger()
: grpc_client_(nullptr),
@ -62,7 +64,26 @@ Debugger::Debugger()
is_dataset_graph_(false),
partial_memory_(false),
last_overflow_bin_(0),
overflow_bin_path_("") {}
overflow_bin_path_("") {
if (CheckDebuggerEnabled()) {
// configure partial memory reuse
partial_memory_ = CheckDebuggerPartialMemoryEnabled();
// switch memory reuse on or off
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
// print some message about memory reuse to user
if (partial_memory_) {
MS_LOG(WARNING)
<< "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
}
}
void Debugger::Init(const uint32_t device_id, const std::string device_target) {
// access lock for public method
@ -133,27 +154,6 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
port = "50051";
}
// configure partial memory reuse
const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
if (env_partial_mem_str != nullptr) {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
if (std::strcmp(env_partial_mem_str, "1") == 0) {
partial_memory_ = true;
}
}
// switch memory reuse on or off
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
// print some message about memory reuse to user
if (partial_memory_) {
MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
#ifdef ENABLE_D
// set operation overflow info
overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
@ -195,9 +195,7 @@ void Debugger::EnableDebugger() {
bool Debugger::CheckDebuggerDumpEnabled() {
// see if dump is enabled
if (device_target_ == kGPUDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
return runtime_instance->DumpDataEnabled();
return device::KernelRuntime::DumpDataEnabled();
}
return false;
}
@ -213,6 +211,17 @@ bool Debugger::CheckDebuggerEnabled() {
return false;
}
bool Debugger::CheckDebuggerPartialMemoryEnabled() {
const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
if (env_partial_mem_str != nullptr) {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
if (std::strcmp(env_partial_mem_str, "1") == 0) {
return true;
}
}
return false;
}
bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
void Debugger::Reset() {
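The hunks above factor the `MS_DEBUGGER_PARTIAL_MEM` environment-variable check out of `EnableDebugger()` into the constructor via the new `CheckDebuggerPartialMemoryEnabled()` helper, so memory reuse is decided before graph compilation. A standalone sketch of that check (simplified; the real helper also logs through `MS_LOG`):

```cpp
// Minimal sketch of the MS_DEBUGGER_PARTIAL_MEM check added above.
#include <cstdlib>
#include <cstring>
#include <iostream>

bool PartialMemoryRequested() {
  const char *env = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
  return env != nullptr && std::strcmp(env, "1") == 0;
}

int main() {
  std::cout << "partial memory reuse: "
            << (PartialMemoryRequested() ? "enabled" : "disabled") << '\n';
}
```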
@ -324,6 +333,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
// only try to enable debugger if it is not a dataset graph
EnableDebugger();
if (debugger_enabled_) {
LoadParameters();
// get graph proto and send to mindinsight
SendGraphAndSuspend(GetGraphProto());
}
@ -839,4 +849,34 @@ bool Debugger::CheckPort(const char *port) {
return true;
}
void Debugger::LoadParameters() {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
(device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
return;
MS_EXCEPTION_IF_NULL(graph_ptr_);
const auto &parameters = graph_ptr_->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
ShapeVector int_shapes;
auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
}
} // namespace mindspore
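The new `Debugger::LoadParameters()` above walks the graph inputs, publishes each parameter under a `"<fullname>:0"` tensor name, converts the device shape to signed integers, and hands the tensor to `LoadMemToHost`. A standalone sketch of the naming and shape-conversion pattern (hypothetical parameter name and shape, not MindSpore code):

```cpp
// Sketch of the "<name>:0" naming and size_t-to-int shape conversion used in
// Debugger::LoadParameters. The parameter name and shape are made up.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  std::string parameter_name = "conv1.weight";              // hypothetical parameter
  std::vector<std::size_t> device_shape = {64, 3, 7, 7};    // hypothetical device shape

  // Parameters are published under "<fullname>:0" (output slot 0).
  std::string tensor_name = parameter_name + ":0";

  // Device shapes are size_t; the debugger side expects signed ints.
  std::vector<int> int_shapes;
  std::transform(device_shape.begin(), device_shape.end(), std::back_inserter(int_shapes),
                 [](std::size_t dim) { return static_cast<int>(dim); });

  std::cout << tensor_name << " rank=" << int_shapes.size() << '\n';
}
```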

View File

@ -103,6 +103,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void SendMetadata();
void LoadParameters();
private:
// private constructor for singleton
Debugger();
@ -118,6 +120,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// check if debugger enabled
bool CheckDebuggerEnabled();
bool CheckDebuggerPartialMemoryEnabled();
// check and save graph pointer
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);

View File

@ -663,39 +663,25 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
}
#ifdef ENABLE_DEBUGGER
bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
size_t slot, Debugger *debugger, bool keep_prev) const {
size_t slot, bool keep_prev) const {
bool ret = false;
DebugServices *debug_services = debugger->debug_services();
MS_EXCEPTION_IF_NULL(debug_services);
TensorLoader *tensor_loader = debug_services->tensor_loader();
TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
MS_EXCEPTION_IF_NULL(tensor_loader);
// TensorData is freed up in AscendSession class
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetSlot(slot);
if (trans_flag) {
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
size_t host_size = out_tensor->data().nbytes();
ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
if (!ret) {
MS_LOG(ERROR) << "Copy device mem to host failed";
return ret;
}
tensor_data->SetTensor(out_tensor);
} else {
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
tensor_data->SetTensor(out_tensor);
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
tensor_data->SetTensor(out_tensor);
ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
return ret;
}
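The hunk above drops both the `trans_flag` branch and the `Debugger *debugger` parameter from `AscendDeviceAddress::LoadMemToHost`; the tensor loader is now reached through the `Debugger` singleton instead of a pointer threaded through every call site. A standalone sketch of that access pattern (mock classes for illustration, not the MindSpore implementation):

```cpp
// Mock sketch: callers reach the shared TensorLoader through a Debugger
// singleton rather than receiving a Debugger* argument.
#include <iostream>
#include <memory>
#include <mutex>
#include <string>

class TensorLoader {
 public:
  bool LoadNewTensor(const std::string &name) {
    std::cout << "loaded " << name << '\n';
    return true;
  }
};

class Debugger {
 public:
  static std::shared_ptr<Debugger> GetInstance() {
    std::lock_guard<std::mutex> lock(instance_lock_);
    if (!instance_) instance_ = std::shared_ptr<Debugger>(new Debugger());
    return instance_;
  }
  TensorLoader *tensor_loader() { return &loader_; }

 private:
  Debugger() = default;
  TensorLoader loader_;
  static std::shared_ptr<Debugger> instance_;
  static std::mutex instance_lock_;
};

std::shared_ptr<Debugger> Debugger::instance_ = nullptr;
std::mutex Debugger::instance_lock_;

bool LoadMemToHost(const std::string &tensor_name /* no Debugger* parameter */) {
  TensorLoader *loader = Debugger::GetInstance()->tensor_loader();
  return loader->LoadNewTensor(tensor_name);
}

int main() { LoadMemToHost("conv1.weight:0"); }
```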

View File

@ -45,9 +45,8 @@ class AscendDeviceAddress : public DeviceAddress {
bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type) const override;
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
bool keep_prev) const;
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
#endif
private:

View File

@ -254,15 +254,10 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
MS_EXCEPTION_IF_NULL(ascend_addr);
ShapeVector int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(node, j);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret =
ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
<< ", host_format:" << format << ".!";
@ -272,40 +267,6 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
}
}
void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
// trans_flag: "true" means tensor values will be transfered to host format, otherwise not.
bool trans_flag = false;
const auto &parameters = graph->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
MS_EXCEPTION_IF_NULL(ascend_addr);
ShapeVector int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret =
ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
<< ", host_format:" << format << ".!";
}
}
}
} // namespace
#endif
@ -319,7 +280,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debug
// load output
LoadOutput(graph, debugger);
// load parameters
LoadParameters(graph, debugger);
if (debugger) debugger->LoadParameters();
#endif
return true;
}

View File

@ -70,6 +70,12 @@ class DeviceAddress : public mindspore::DeviceSync {
const ShapeVector &host_shape, TypeId host_type) const {
return true;
}
#ifdef ENABLE_DEBUGGER
virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const {
return true;
}
#endif
protected:
const void *ptr() const { return ptr_; }
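The hunk above gives the `DeviceAddress` base class a virtual `LoadMemToHost` with a harmless default (returns true), so the Ascend and GPU address types override it and callers such as `Debugger::LoadParameters` no longer need backend-specific casts. A standalone sketch of that pattern (mock types, not MindSpore code):

```cpp
// Mock sketch: base class with a default-success virtual hook, overridden per backend.
#include <iostream>
#include <memory>
#include <string>

class DeviceAddress {
 public:
  virtual ~DeviceAddress() = default;
  // Default: nothing to load, report success.
  virtual bool LoadMemToHost(const std::string &tensor_name) const { return true; }
};

class GpuDeviceAddress : public DeviceAddress {
 public:
  bool LoadMemToHost(const std::string &tensor_name) const override {
    std::cout << "GPU copy to host for " << tensor_name << '\n';
    return true;
  }
};

int main() {
  std::unique_ptr<DeviceAddress> addr = std::make_unique<GpuDeviceAddress>();
  addr->LoadMemToHost("conv1.weight:0");  // dispatches to the GPU override
}
```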

View File

@ -80,14 +80,14 @@ GPUDeviceAddress::~GPUDeviceAddress() {
}
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
const ShapeVector &host_shape, TypeId host_type, size_t slot,
bool keep_prev) const {
bool ret = false;
if (size_ == 0) {
return true;
}
DebugServices *debug_services = debugger->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();

View File

@ -44,8 +44,7 @@ class GPUDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
bool keep_prev) const;
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
#endif
private:
DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};

View File

@ -111,7 +111,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@ -130,7 +130,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -148,36 +148,6 @@ void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
}
}
void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
MS_EXCEPTION_IF_NULL(graph);
if (!(debugger && dump_enabled)) {
return;
}
const auto &parameters = graph->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
ShapeVector int_shapes;
auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
}
void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
DebugServices *debug_services = debugger->debug_services();
@ -601,7 +571,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
}
if (!mock) {
// collect weights and bias for dump mode
LoadParameters(graph, debugger, dump_enabled);
if (debugger) debugger->LoadParameters();
CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
}
ClearSwapInfo(mock);

View File

@ -53,8 +53,8 @@ class KernelRuntime {
void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
session::KernelGraph *graph);
void RunOpClearMemory(const session::KernelGraph *graph);
bool DumpDataEnabled();
bool DumpDataEnabledIteration();
static bool DumpDataEnabled();
static bool DumpDataEnabledIteration();
virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
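The final hunk makes `DumpDataEnabled` and `DumpDataEnabledIteration` static on `KernelRuntime`, which is what lets `Debugger::CheckDebuggerDumpEnabled` (earlier in this change) drop the `KernelRuntimeManager` lookup. A standalone sketch of the refactor; the classes and the environment-variable stand-in below are mocks, not the actual MindSpore dump check:

```cpp
// Mock sketch: a query that depends only on process-wide configuration can be
// static, so call sites no longer need to fetch a runtime instance first.
#include <cstdlib>
#include <iostream>

class KernelRuntime {
 public:
  // Static stand-in; the real check reads the dump configuration, not this env var.
  static bool DumpDataEnabled() { return std::getenv("MOCK_DUMP_ENABLED") != nullptr; }
};

bool CheckDebuggerDumpEnabled() {
  // Before: KernelRuntimeManager lookup followed by an instance call.
  return KernelRuntime::DumpDataEnabled();
}

int main() { std::cout << std::boolalpha << CheckDebuggerDumpEnabled() << '\n'; }
```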