diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index b5069c4acb7..2ef959fa10d 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -171,7 +171,7 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
   // build kernel
   BuildKernel(root_graph);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(root_graph);
   }
   SetSummaryNodes(root_graph.get());
@@ -248,7 +248,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
   BuildKernel(graph);
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(graph);
   }
   if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
@@ -312,6 +312,9 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
   }
   // load input data from user input
   LoadInputData(kernel_graph, inputs);
+  if (debugger_) {
+    debugger_->PreExecute(kernel_graph);
+  }
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index d8d04908787..1ce0048b8db 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -278,9 +278,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
 
 void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   auto &kernel_graph = graphs_[graph_id];
-  PreIterationDbg(kernel_graph);
   // Load input data from user input
   LoadInputData(kernel_graph, inputs);
+  PreIterationDbg(kernel_graph);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index a8fb3e95a98..af49da4f626 100644
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -22,7 +22,6 @@
 #include <utility>
 #include <memory>
 #include <map>
-
 #include "backend/session/session_context.h"
 #include "backend/session/kernel_graph.h"
 #include "backend/session/anf_runtime_algorithm.h"
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index f347c0351fc..4d09df8f84a 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -30,6 +30,7 @@
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/device/kernel_runtime_manager.h"
+#include "runtime/device/kernel_runtime.h"
 
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -47,6 +48,7 @@ namespace mindspore {
 
 DebuggerPtr Debugger::debugger_ = nullptr;
 std::mutex Debugger::instance_lock_;
+static const size_t PRAMATER_OUTPUT_INDEX = 0;
 
 Debugger::Debugger()
     : grpc_client_(nullptr),
@@ -62,7 +64,26 @@ Debugger::Debugger()
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
-      overflow_bin_path_("") {}
+      overflow_bin_path_("") {
+  if (CheckDebuggerEnabled()) {
+    // configure partial memory reuse
+    partial_memory_ = CheckDebuggerPartialMemoryEnabled();
+
+    // switch memory reuse on or off
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
+    // print some message about memory reuse to user
+    if (partial_memory_) {
+      MS_LOG(WARNING)
+        << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
+           "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
+    } else {
+      MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                      "usage for large models.";
+    }
+  }
+}
 
 void Debugger::Init(const uint32_t device_id, const std::string device_target) {
   // access lock for public method
@@ -133,27 +154,6 @@ void Debugger::EnableDebugger() {
     MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
     port = "50051";
   }
-
-  // configure partial memory reuse
-  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
-  if (env_partial_mem_str != nullptr) {
-    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
-    if (std::strcmp(env_partial_mem_str, "1") == 0) {
-      partial_memory_ = true;
-    }
-  }
-  // switch memory reuse on or off
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
-  // print some message about memory reuse to user
-  if (partial_memory_) {
-    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
-                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
-  } else {
-    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
-                    "usage for large models.";
-  }
 #ifdef ENABLE_D
   // set operation overflow info
   overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
@@ -195,9 +195,7 @@ void Debugger::EnableDebugger() {
 bool Debugger::CheckDebuggerDumpEnabled() {
   // see if dump is enabled
   if (device_target_ == kGPUDevice) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    return runtime_instance->DumpDataEnabled();
+    return device::KernelRuntime::DumpDataEnabled();  // static query, no per-device runtime instance is looked up
   }
   return false;
 }
@@ -213,6 +211,17 @@ bool Debugger::CheckDebuggerEnabled() {
   return false;
 }
 
+// Reports whether MS_DEBUGGER_PARTIAL_MEM is set to exactly "1" (unset or any other value gives false).
+bool Debugger::CheckDebuggerPartialMemoryEnabled() {
+  const char *partial_mem_env = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
+  if (partial_mem_env == nullptr) {
+    return false;
+  }
+  // echo the raw value so the effective setting can be checked in the log
+  MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << partial_mem_env;
+  return std::strcmp(partial_mem_env, "1") == 0;
+}
+
 bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
 
 void Debugger::Reset() {
@@ -324,6 +333,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
       // only try to enable debugger if it is not a dataset graph
       EnableDebugger();
       if (debugger_enabled_) {
+        LoadParameters();
         // get graph proto and send to mindinsight
         SendGraphAndSuspend(GetGraphProto());
       }
@@ -839,4 +849,34 @@ bool Debugger::CheckPort(const char *port) {
   return true;
 }
 
+// Loads every graph-input Parameter to host through its device address (slot 0, exec order 0, keep_prev=true).
+// A failed load is logged and skipped; it does not stop the remaining parameters from being loaded.
+void Debugger::LoadParameters() {
+  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
+  // once num_step_ is non-zero, proceed only on Ascend or on GPU when DumpDataEnabledIteration() is true
+  if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
+        (device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
+    return;
+  MS_EXCEPTION_IF_NULL(graph_ptr_);
+  // for parameters, set its execution order to be 0;
+  const int exec_order = 0;
+  for (const auto &item : graph_ptr_->inputs()) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    const std::string tensor_name = item->fullname_with_scope() + ":0";
+    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
+    MS_EXCEPTION_IF_NULL(addr);  // addr is dereferenced below
+    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    ShapeVector int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    if (!addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, true)) {
+      MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
+
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index 1661fa04027..6e4407b0c58 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -103,6 +103,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   void SendMetadata();
 
+  void LoadParameters();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -118,6 +120,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if debugger enabled
   bool CheckDebuggerEnabled();
 
+  bool CheckDebuggerPartialMemoryEnabled();
+
   // check and save graph pointer
   void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index 5f475f0c2ac..d761dfd7033 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -663,39 +663,35 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
 }
 
 #ifdef ENABLE_DEBUGGER
-bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
+// Copies this address's device memory into a host tensor (type_id_, host_shape) and hands it to the debugger's
+// tensor loader under tensor_name/slot. Returns false when the copy fails, otherwise LoadNewTensor's result.
+// host_fmt and host_type are not used by this implementation.
+bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
                                         const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
-                                        size_t slot, Debugger *debugger, bool keep_prev) const {
+                                        size_t slot, bool keep_prev) const {
   bool ret = false;
+  // the debugger is now fetched from the singleton, so check every link before dereferencing it
+  auto debugger = Debugger::GetInstance();
+  MS_EXCEPTION_IF_NULL(debugger);
   DebugServices *debug_services = debugger->debug_services();
   MS_EXCEPTION_IF_NULL(debug_services);
   TensorLoader *tensor_loader = debug_services->tensor_loader();
   MS_EXCEPTION_IF_NULL(tensor_loader);
   // TensorData is freed up in AscendSession class
   auto tensor_data = std::make_shared<mindspore::TensorData>();
   tensor_data->SetName(tensor_name);
   tensor_data->SetExecutionOrder(execution_order);
   tensor_data->SetSlot(slot);
-  if (trans_flag) {
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
-    if (!ret) {
-      MS_LOG(ERROR) << "Copy device mem to host failed";
-      return ret;
-    }
-    tensor_data->SetTensor(out_tensor);
-  } else {
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
-    if (ret_rt_memcpy != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
-    }
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    tensor_data->SetTensor(out_tensor);
+  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
+  size_t host_size = out_tensor->data().nbytes();
+  auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
+  if (ret_rt_memcpy != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
+    // do not register a tensor whose contents were never filled in
+    return false;
   }
+  MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+  tensor_data->SetTensor(out_tensor);
   ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   return ret;
 }
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
index 10389a0796b..393525c2e3c 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
@@ -45,9 +45,8 @@ class AscendDeviceAddress : public DeviceAddress {
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
 #ifdef ENABLE_DEBUGGER
-  bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+  bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
 
  private:
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index b1213f79676..1946c630aa1 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -254,15 +254,10 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
       auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
       MS_EXCEPTION_IF_NULL(ascend_addr);
       ShapeVector int_shapes;
-      if (trans_flag) {
-        int_shapes = trans::GetRuntimePaddingShape(node, j);
-      } else {
-        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
-        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                             [](size_t inner_item) { return SizeToInt(inner_item); });
-      }
-      auto ret =
-        ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+      auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+      auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
                       << ", host_format:" << format << ".!";
@@ -272,40 +267,6 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   }
 }
 
-void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // trans_flag: "true" means tensor values will be transfered to host format, otherwise not.
-  bool trans_flag = false;
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
-    MS_EXCEPTION_IF_NULL(ascend_addr);
-    ShapeVector int_shapes;
-    if (trans_flag) {
-      int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
-    } else {
-      auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
-      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                           [](size_t inner_item) { return SizeToInt(inner_item); });
-    }
-    auto ret =
-      ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
-                    << ", host_format:" << format << ".!";
-    }
-  }
-}
 }  // namespace
 #endif
 
@@ -319,7 +280,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debug
   // load output
   LoadOutput(graph, debugger);
   // load parameters
-  LoadParameters(graph, debugger);
+  if (debugger) debugger->LoadParameters();
 #endif
   return true;
 }
diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h
index ddae68d3dbd..7d32d11af5a 100644
--- a/mindspore/ccsrc/runtime/device/device_address.h
+++ b/mindspore/ccsrc/runtime/device/device_address.h
@@ -70,6 +70,12 @@ class DeviceAddress : public mindspore::DeviceSync {
                              const ShapeVector &host_shape, TypeId host_type) const {
     return true;
   }
+#ifdef ENABLE_DEBUGGER
+  virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                             const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const {
+    return true;  // base class loads nothing and reports success; device-specific subclasses override this
+  }
+#endif
 
  protected:
   const void *ptr() const { return ptr_; }
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 9dba9a8a245..fc3fab30c34 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -80,14 +80,14 @@ GPUDeviceAddress::~GPUDeviceAddress() {
 }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                                     const ShapeVector &host_shape, TypeId host_type, size_t slot,
                                      bool keep_prev) const {
   bool ret = false;
   if (size_ == 0) {
     return true;
   }
-  DebugServices *debug_services = debugger->debug_services();
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
+
+  TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
 
   mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
   size_t host_size = out_tensor->data().nbytes();
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
index c68108b9de2..a98f67786b8 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
@@ -44,8 +44,7 @@ class GPUDeviceAddress : public DeviceAddress {
 
 #ifdef ENABLE_DEBUGGER
   bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
  private:
   DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index c6c766e322a..74a16310971 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -111,7 +111,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
     auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
     (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                          [](size_t inner_item) { return SizeToInt(inner_item); });
-    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true);
     if (!ret) {
       MS_LOG(ERROR) << "LoadMemToHost:"
                     << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -130,7 +130,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
     auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
     (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                          [](size_t inner_item) { return SizeToInt(inner_item); });
-    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
     if (!ret) {
       MS_LOG(ERROR) << "LoadMemToHost:"
                     << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@@ -148,36 +148,6 @@ void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
   }
 }
 
-void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
-  MS_EXCEPTION_IF_NULL(graph);
-  if (!(debugger && dump_enabled)) {
-    return;
-  }
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
-    ShapeVector int_shapes;
-    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
-    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                         [](size_t inner_item) { return SizeToInt(inner_item); });
-    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost:"
-                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
-    }
-  }
-}
-
 void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
   if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
     DebugServices *debug_services = debugger->debug_services();
@@ -601,7 +571,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   }
   if (!mock) {
     // collect weights and bias for dump mode
-    LoadParameters(graph, debugger, dump_enabled);
+    if (debugger) debugger->LoadParameters();
     CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
   }
   ClearSwapInfo(mock);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index 78e8f80a670..69d7764e6f3 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -53,8 +53,8 @@ class KernelRuntime {
   void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
                          session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
-  bool DumpDataEnabled();
-  bool DumpDataEnabledIteration();
+  static bool DumpDataEnabled();
+  static bool DumpDataEnabledIteration();
   virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
   virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;