diff --git a/build.sh b/build.sh
index adeb099fb7b..146b0de1c51 100755
--- a/build.sh
+++ b/build.sh
@@ -279,6 +279,9 @@ checkopts()
   done
 }
 checkopts "$@"
+if [[ "X$ENABLE_GPU" = "Xon" ]] && [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
+    ENABLE_DEBUGGER="on"  # GPU e2e dump reads its tensors from the debugger's tensor loader
+fi
 echo "---------------- MindSpore: build start ----------------"
 mkdir -pv "${BUILD_PATH}/package/mindspore/lib"
 git submodule update --init graphengine
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 3a462d9cb9a..4398d9a0375 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -37,6 +37,7 @@
 #include "common/trans.h"
 #include "utils/context/ms_context.h"
 #include "utils/base_ref_extends.h"
+#include "debug/tensor_load.h"
 
 namespace mindspore {
 namespace session {
@@ -164,7 +165,11 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
 void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
+#ifdef ENABLE_DEBUGGER
+  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+#else
   if (!runtime_instance->Run(kernel_graph.get())) {
+#endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
 }
@@ -229,6 +234,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
 
 void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   auto &kernel_graph = graphs_[graph_id];
+#ifdef ENABLE_DEBUGGER
+  PreIterationDbg(kernel_graph);
+#endif
   // Load input data from user input
   LoadInputData(kernel_graph, inputs);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
@@ -245,6 +253,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     // Run graph on GPU
     Execute(kernel_graph);
   }
+#ifdef ENABLE_DEBUGGER
+  PostLoadTensor(kernel_graph);
+#endif
   // Get result from GPU
   UpdateOutputs(kernel_graph, outputs, inputs);
   // Summary
@@ -253,6 +264,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
   if (context_ptr->enable_gpu_summary()) {
     Summary(kernel_graph.get());
   }
+#ifdef ENABLE_DEBUGGER
+  PostIterationDbg(kernel_graph);
+#endif
 }
 
 void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
@@ -296,6 +310,70 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
   RunOpClearMemory(kernel_graph.get());
   return tuple_tensors;
 }
+
+#ifdef ENABLE_DEBUGGER
+void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {  // no-op unless ENABLE_DUMP_E2E
+#ifdef ENABLE_DUMP_E2E
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());  // return value deliberately discarded
+#endif
+}
+
+bool GPUSession::DumpDataEnabledIteration() const {  // forwards to the GPU kernel runtime of device_id_
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  return runtime_instance->DumpDataEnabledIteration();
+}
+
+void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  if (debugger_) {  // the session may run without a debugger instance
+    debugger_->PreExecute(kernel_graph);
+  }
+  PreLoadTensor(kernel_graph);  // returns early unless the debugger or dump is active
+}
+
+void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  const bool dump_this_iteration = DumpDataEnabledIteration();
+  if (!debugger_) {
+    return;  // both the dump and the post-execute hook need a debugger
+  }
+  if (dump_this_iteration) {
+    Dump(kernel_graph);  // dump first, then let the debugger finish the step
+  }
+  debugger_->PostExecute();
+}
+
+// Starts a new debug/dump iteration on the tensor loader; does nothing when neither debugger nor dump is active.
+void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  bool dump_enabled = DumpDataEnabledIteration();
+  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  DebugServices *debug_services = debugger_->debug_services();
+  MS_EXCEPTION_IF_NULL(debug_services);
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  MS_EXCEPTION_IF_NULL(tensor_loader);
+  tensor_loader->EmptyTensor();
+  tensor_loader->set_iter_num(tensor_loader->GetIterNum() + 1);
+}
+
+void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  bool dump_enabled = DumpDataEnabledIteration();
+  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
+    return;  // the tensor loader is only used by the debugger and by e2e dump
+  }
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  DebugServices *debug_services = debugger_->debug_services();
+  MS_EXCEPTION_IF_NULL(debug_services);
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  MS_EXCEPTION_IF_NULL(tensor_loader);
+  tensor_loader->EmptyPrevTensor();  // called after the graph has run: drop the previous-step tensor map
+}
+#endif
+
 }  // namespace gpu
 }  // namespace session
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h
index 04e5021c02a..3e4e84a29bb 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.h
+++ b/mindspore/ccsrc/backend/session/gpu_session.h
@@ -67,6 +67,20 @@ class GPUSession : public SessionBasic {
                      const std::vector<tensor::TensorPtr> &inputs_const) const override;
 
   void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+#ifdef ENABLE_DEBUGGER
+  void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+  bool DumpDataEnabledIteration() const;
+
+  void PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+  void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+  void PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+  void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+#endif
 };
 using GPUSessionPtr = std::shared_ptr<GPUSession>;
 MS_REG_SESSION(kGPUDevice, GPUSession);
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index b551040bbda..79efb26088b 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -24,7 +24,6 @@
 #include "backend/kernel_compiler/common_utils.h"
 #include "frontend/operator/ops.h"
 #include "common/trans.h"
-#include "utils/context/ms_context.h"
 #include "utils/config_manager.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/oplib/oplib.h"
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index 367b1fe80a9..838a8807aed 100755
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -32,6 +32,7 @@
 #include "utils/contract.h"
 #include "pipeline/pynative/pynative_execute.h"
 #include "runtime/device/kernel_info.h"
+#include "utils/context/ms_context.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
@@ -112,7 +113,9 @@ class SessionBasic {
   // set debugger
   void SetDebugger() {
     debugger_ = Debugger::GetInstance();
-    debugger_->Init(device_id_);
+    auto ms_context = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(ms_context);
+    debugger_->Init(device_id_, ms_context->device_target());
   }
 #endif
 
diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt
index 8be5a0a834b..9ed24ec25b5 100644
--- a/mindspore/ccsrc/debug/CMakeLists.txt
+++ b/mindspore/ccsrc/debug/CMakeLists.txt
@@ -16,6 +16,7 @@ if (ENABLE_DEBUGGER)
         "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
+        "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
         )
 endif (ENABLE_DEBUGGER)
 
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index dd89e17e2db..b9e9238034e 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -21,6 +21,7 @@
 #include "debug/debugger/debugger.h"
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/kernel_runtime_manager.h"
 
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -41,17 +42,20 @@ Debugger::Debugger()
     : grpc_client_(nullptr),
       debug_services_(nullptr),
       device_id_(0),
+      device_target_(""),
       num_step_(0),
       debugger_enabled_(false),
       is_dataset_graph_(false),
       partial_memory_(false) {}
 
-void Debugger::Init(const uint32_t device_id) {
+void Debugger::Init(const uint32_t device_id, const std::string device_target) {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
   // save device_id
   MS_LOG(INFO) << "Debugger got device_id: " << device_id;
   device_id_ = device_id;
+  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
+  device_target_ = device_target;
 }
 
 void Debugger::EnableDebugger() {
@@ -62,6 +66,14 @@ void Debugger::EnableDebugger() {
   grpc_client_ = nullptr;
   debug_services_ = nullptr;
 
+  // see if dump is enabled
+  bool dump_enabled = false;
+  if (device_target_ == kGPUDevice) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    dump_enabled = runtime_instance->DumpDataEnabled();
+  }
+
   // get env variables to configure debugger
   const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
   if (env_enable_str != nullptr) {
@@ -70,7 +82,8 @@ void Debugger::EnableDebugger() {
       debugger_enabled_ = true;
     }
   }
-  if (!debugger_enabled_) {
+
+  if (!debugger_enabled_ && !dump_enabled) {
     MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
     return;
   }
@@ -118,7 +131,10 @@ void Debugger::EnableDebugger() {
   }
 
   // initialize grpc client
-  grpc_client_ = std::make_unique<GrpcClient>(host, port);
+  if (debugger_enabled_) {
+    grpc_client_ = std::make_unique<GrpcClient>(host, port);
+  }
+
   debug_services_ = std::make_unique<DebugServices>();
 }
 
@@ -127,6 +143,7 @@ void Debugger::Reset() {
   std::lock_guard<std::mutex> a_lock(access_lock_);
   // reset components
   device_id_ = 0;
+  device_target_ = "";
   num_step_ = 0;
   debugger_enabled_ = false;
   is_dataset_graph_ = false;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index 5a3965d7cc9..f72a3e038c7 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -55,7 +55,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   // init
   // only save device_id
-  void Init(const uint32_t device_id);
+  void Init(const uint32_t device_id, const std::string device_target);
 
   // reset debugger
   void Reset();
@@ -128,6 +128,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::unique_ptr<DebugServices> debug_services_;
   KernelGraphPtr graph_ptr_;
   uint32_t device_id_;
+  std::string device_target_;
   int32_t num_step_;
   bool debugger_enabled_;
   bool is_dataset_graph_;
diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h
index ae0e89aae27..7215b9a6244 100644
--- a/mindspore/ccsrc/debug/tensor_load.h
+++ b/mindspore/ccsrc/debug/tensor_load.h
@@ -24,6 +24,10 @@
 #include <string>
 #include <utility>
 #include "debug/tensor_data.h"
+#include "ir/dtype.h"
+#ifdef ENABLE_DUMP_E2E
+#include "debug/e2e_dump.h"
+#endif
 namespace mindspore {
 class TensorLoader {
  public:
@@ -72,8 +76,54 @@ class TensorLoader {
 
   void EmptyPrevTensor() { prev_tensor_list_map.clear(); }
 
+  void EmptyCurrentTensor() {  // clears current tensors only; prev_tensor_list_map is left untouched
+    tensor_list_map.clear();
+    tensor_list.clear();
+  }
+
   void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
 
+#ifdef ENABLE_DUMP_E2E
+  // Writes the tensor stored as "<tensor_name>:<slot>" to "<filepath>_<shape>_<type>_<format>.bin".
+  bool DumpTensorToFile(std::string tensor_name, bool trans_flag, const std::string &filepath,
+                        const std::string &host_fmt, const std::vector<int> &host_shape, TypeId host_type,
+                        TypeId addr_type_id, std::string addr_format, size_t slot) const {
+    if (filepath.empty()) {
+      MS_LOG(ERROR) << "Dump file path is null!";
+      return false;
+    }
+    std::string shape = "shape";
+    if (host_shape.empty()) {
+      shape += "_0";
+    }
+    for (const auto &value : host_shape) {
+      shape += '_' + std::to_string(value);
+    }
+    // trans_flag selects the host type/format for the file name; otherwise the device type/format is used.
+    std::string path;
+    if (trans_flag) {
+      path = filepath + '_' + shape + '_' + TypeIdLabel(host_type) + '_' + host_fmt + ".bin";
+    } else {
+      path = filepath + '_' + shape + '_' + TypeIdToType(addr_type_id)->ToString() + '_' + addr_format + ".bin";
+    }
+    MS_LOG(INFO) << "Dump path is " << path;
+
+    const std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
+    auto iter = tensor_list_map.find(tensor_loader_name);
+    if (iter == tensor_list_map.end()) {
+      MS_LOG(WARNING) << "Tensor " << tensor_loader_name << " is not loaded, nothing is written to " << path;
+      return false;
+    }
+    mindspore::tensor::TensorPtr out_tensor = (iter->second == nullptr) ? nullptr : iter->second->GetTensor();
+    if (out_tensor == nullptr) {
+      MS_LOG(ERROR) << "Tensor " << tensor_loader_name << " holds no data, nothing is written to " << path;
+      return false;
+    }
+    size_t host_size = out_tensor->data().nbytes();
+    return mindspore::Dump::DumpToFile(path, out_tensor->data_c(), host_size);
+  }
+#endif
+
  private:
   std::vector<std::shared_ptr<TensorData>> tensor_list;
   std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index aafbf757654..da290dd1c0b 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -275,7 +275,7 @@ void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_p
 }  // namespace
 #endif
 
-bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
+bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DUMP_E2E
   MS_LOG(INFO) << "Start dump step";
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index c4d82b05672..33500dc27f3 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -38,7 +38,7 @@ class AscendKernelRuntime : public KernelRuntime {
   AscendKernelRuntime() = default;
   ~AscendKernelRuntime() override;
   bool Init() override;
-  bool DumpData(session::KernelGraph *graph) override;
+  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
   bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
   bool GenTask(const session::KernelGraph *graph) override;
   bool RunTask(const session::KernelGraph *graph) override;
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
index 8b144b04400..0c3cf9684a9 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
@@ -270,7 +270,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
 
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
 
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
index dc952f526ec..a486ab1a8b8 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
 
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph) override;
+  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs, std::vector<tensor::TensorPtr> *need_sync_outputs);
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index a20a6a9a3c8..35fc90b7e45 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -16,9 +16,16 @@
 
 #include "runtime/device/gpu/gpu_device_address.h"
 #include <vector>
+#include <memory>
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "utils/log_adapter.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
+#include "ir/tensor.h"
+#ifdef ENABLE_DEBUGGER
+#include "debug/debug_services.h"
+#include "debug/tensor_load.h"
+#include "debug/debugger/debugger.h"
+#endif
 
 namespace mindspore {
 namespace device {
@@ -59,6 +66,36 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
+#ifdef ENABLE_DEBUGGER
+// Copies this device buffer into a host tensor and hands it to the debugger's tensor loader as tensor_name.
+bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                                     const std::vector<int> &host_shape, TypeId host_type, size_t slot,
+                                     Debugger *debugger, bool keep_prev) const {
+  if (size_ == 0) {
+    return true;  // an empty buffer is reported as success without loading anything
+  }
+  MS_EXCEPTION_IF_NULL(debugger);
+  DebugServices *debug_services = debugger->debug_services();
+  MS_EXCEPTION_IF_NULL(debug_services);
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  MS_EXCEPTION_IF_NULL(tensor_loader);
+
+  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
+  size_t host_size = out_tensor->data().nbytes();
+  if (!SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c())) {
+    MS_LOG(ERROR) << "Copy device mem to host failed";
+    return false;
+  }
+  auto tensor_data = std::make_shared<mindspore::TensorData>();
+  tensor_data->SetName(tensor_name);
+  tensor_data->SetExecutionOrder(execution_order);
+  tensor_data->SetTensor(out_tensor);
+  tensor_data->SetSlot(slot);
+  bool ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
+  MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+  return ret;
+}
+#endif
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
index 8b846bf341a..8a3baccb611 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
@@ -22,6 +22,9 @@
 #include "runtime/device/device_address.h"
 
 namespace mindspore {
+#ifdef ENABLE_DEBUGGER
+class Debugger;
+#endif
 namespace device {
 namespace gpu {
 class GPUDeviceAddress : public DeviceAddress {
@@ -37,6 +40,11 @@ class GPUDeviceAddress : public DeviceAddress {
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
 
+#ifdef ENABLE_DEBUGGER
+  bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                     const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                     bool keep_prev) const;
+#endif
  private:
   DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
 };
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 3a5d9ca34a8..dbfc80b9ff4 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "runtime/device/gpu/gpu_kernel_runtime.h"
+#include <algorithm>
 #include "runtime/device/gpu/gpu_device_address.h"
 #include "runtime/device/gpu/cuda_driver.h"
 #include "runtime/device/gpu/gpu_buffer_mgr.h"
@@ -29,6 +29,8 @@
 #include "runtime/device/gpu/gpu_memory_manager.h"
 #include "backend/kernel_compiler/common_utils.h"
 #include "runtime/device/gpu/gpu_memory_copy_manager.h"
+#include "common/trans.h"
+#include "ir/dtype.h"
 
 namespace mindspore {
 namespace device {
@@ -36,6 +38,7 @@ namespace gpu {
 using mindspore::device::memswap::MemSwapInfoSet;
 using mindspore::device::memswap::MemSwapManager;
 using mindspore::device::memswap::SwapKind;
+static const size_t PARAMETER_OUTPUT_INDEX = 0;
 bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
 
 bool GPUKernelRuntime::Init() {
@@ -43,7 +46,15 @@ bool GPUKernelRuntime::Init() {
     GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
     return true;
   }
-  auto ret = InitDevice();
+  bool ret = false;
+#ifdef ENABLE_DUMP_E2E
+  ret = SetDumpConf();
+  if (!ret) {
+    MS_LOG(INFO) << "No dump conf to set!";
+  }
+#endif
+
+  ret = InitDevice();
   if (!ret) {
     MS_LOG(ERROR) << "InitDevice error.";
     return ret;
@@ -63,6 +74,216 @@ bool GPUKernelRuntime::Init() {
   return ret;
 }
 
+#ifdef ENABLE_DUMP_E2E
+namespace {
+// Writes the loaded output tensors of every kernel selected by dump_conf to files under dump_path.
+void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
+                Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  MS_EXCEPTION_IF_NULL(debugger);
+  DebugServices *debug_services = debugger->debug_services();
+  MS_EXCEPTION_IF_NULL(debug_services);
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  MS_EXCEPTION_IF_NULL(tensor_loader);
+  bool trans_flag = dump_conf->trans_flag();
+  const auto &apply_kernels = graph->execution_order();
+  for (const auto &node : apply_kernels) {
+    MS_EXCEPTION_IF_NULL(node);
+    const std::string original_kernel_name = node->fullname_with_scope();
+    if (!dump_conf->IsKernelNeedDump(original_kernel_name)) {
+      continue;
+    }
+    // The scoped name contains '/', which would act as a path separator; use "--" in the file name instead.
+    std::string kernel_name = original_kernel_name;
+    const std::string strsrc = "/";
+    const std::string strdst = "--";
+    std::string::size_type pos = 0;
+    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
+      kernel_name.replace(pos, strsrc.size(), strdst);
+      pos += strdst.size();
+    }
+    auto output_size = AnfAlgo::GetOutputTensorNum(node);
+    for (size_t j = 0; j < output_size; ++j) {
+      auto addr = AnfAlgo::GetOutputAddr(node, j);
+      TypeId addr_type_id = addr->type_id();
+      std::string addr_format = addr->format();
+      std::vector<int> int_shapes;
+      if (trans_flag) {
+        int_shapes = trans::GetRuntimePaddingShape(node, j);
+      } else {
+        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                             [](size_t inner_item) { return SizeToInt(inner_item); });
+      }
+
+      auto type = AnfAlgo::GetOutputInferDataType(node, j);
+      auto format = kOpFormat_DEFAULT;
+      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
+
+      // Look the tensor up by the unmodified scoped name; kernel_name is only used for the file name.
+      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
+                                                 addr_type_id, addr_format, j);
+      if (!ret) {
+        MS_LOG(ERROR) << "DumpTensorToFile Failed: flag:" << std::to_string(trans_flag) << ", path:" << filepath
+                      << ", host_format:" << format << ".!";
+      }
+    }
+  }
+}
+
+// Writes the loaded value of every graph Parameter selected by dump_conf to a file under dump_path.
+void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
+                    Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  MS_EXCEPTION_IF_NULL(debugger);
+  DebugServices *debug_services = debugger->debug_services();
+  MS_EXCEPTION_IF_NULL(debug_services);
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  MS_EXCEPTION_IF_NULL(tensor_loader);
+  bool trans_flag = dump_conf->trans_flag();
+  const auto &parameters = graph->inputs();
+  for (auto &item : parameters) {
+    MS_EXCEPTION_IF_NULL(item);
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
+      continue;
+    }
+    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
+    TypeId addr_type_id = addr->type_id();
+    std::string addr_format = addr->format();
+    std::vector<int> int_shapes;
+    if (trans_flag) {
+      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
+    } else {
+      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+    }
+
+    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
+    auto ret = tensor_loader->DumpTensorToFile(parameter_name, trans_flag, filepath, format, int_shapes, type,
+                                               addr_type_id, addr_format, PARAMETER_OUTPUT_INDEX);
+    if (!ret) {
+      MS_LOG(ERROR) << "DumpTensorToFile Failed: flag:" << std::to_string(trans_flag) << ", path:" << filepath
+                    << ", host_format:" << format << ".!";
+    }
+  }
+}
+}  // namespace
+
+// Runs one E2E dump step: advances the dump iteration counter and, if this iteration is selected, dumps tensors.
+bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_LOG(INFO) << "Start dump step";
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  dump_conf->UpdataCurIter();
+  bool dump_flag = dump_conf->dump_enable();
+  if (!dump_flag) {
+    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
+    return true;
+  }
+  uint32_t cur_iter = dump_conf->cur_iter();
+  // A dump_iter of 0 applies no iteration filter; otherwise only the matching iteration is dumped.
+  if (dump_conf->dump_iter() != 0 && cur_iter != dump_conf->dump_iter()) {
+    return true;
+  }
+  MS_LOG(INFO) << "Cur iter is " << cur_iter;
+  std::string net_name = dump_conf->dump_net_name();
+  std::string iterator = std::to_string(cur_iter);
+  std::string dump_path = dump_conf->dump_path();
+  // Check empty() first: calling back() on an empty string is undefined behaviour.
+  if (dump_path.empty() || dump_path.back() != '/') {
+    dump_path += '/';
+  }
+  dump_path += net_name + '/' + iterator;
+
+  // dump output
+  DumpOutput(graph, dump_path, dump_conf, debugger);
+  // dump parameters
+  DumpParameters(graph, dump_path, dump_conf, debugger);
+
+  return true;
+}
+#endif
+
+#ifdef ENABLE_DEBUGGER
+namespace {
+void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,  // not read in this body
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,  // not read in this body
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
+                    bool dump_enabled) {  // loads every output of `kernel` into the debugger's tensor loader
+  if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+    return;  // neither the debugger nor dump needs tensor data
+  }
+  std::string kernel_name = kernel->fullname_with_scope();
+  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
+  for (size_t j = 0; j < output_size; ++j) {
+    auto addr = kernel_outputs[j];  // NOTE(review): assumes kernel_outputs.size() >= output_size - confirm
+    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
+    auto format = kOpFormat_DEFAULT;
+    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
+    string tensor_name = kernel_name + ':' + std::to_string(j);  // loader key: "<scoped name>:<output index>"
+    std::vector<int> int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+    if (!ret) {  // a failed load is only logged; the remaining outputs are still processed
+      MS_LOG(ERROR) << "LoadMemToHost:"
+                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
+
+// Copies every graph Parameter's device buffer to the host tensor loader when the debugger or dump is active.
+void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
+  MS_EXCEPTION_IF_NULL(graph);
+  if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  const auto &parameters = graph->inputs();
+  // for parameters, set its execution order to be 0;
+  int exec_order = 0;
+  for (auto &item : parameters) {
+    MS_EXCEPTION_IF_NULL(item);
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    string tensor_name = item->fullname_with_scope() + ':' + "0";
+    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    auto gpu_addr = dynamic_cast<const GPUDeviceAddress *>(AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX));
+    MS_EXCEPTION_IF_NULL(gpu_addr);  // the cast yields nullptr if the address is not a GPUDeviceAddress
+    std::vector<int> int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
+
+void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
+  const bool loader_in_use = debugger && (debugger->debugger_enabled() || dump_enabled);
+  if (!loader_in_use) {
+    return;
+  }
+  debugger->debug_services()->tensor_loader()->EmptyCurrentTensor();  // clears the loader's current list and map
+}
+}  // namespace
+#endif
+
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                        TypeId type_id) {
   return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
@@ -147,7 +368,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
   }
 }
 
-bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
+bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
   bool ret = true;
@@ -170,7 +391,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
     mem_reuse_util_ = mem_reuse_iter->second;
     MS_EXCEPTION_IF_NULL(mem_reuse_util_);
 
-    ret = RunOneStep(graph);
+    ret = RunOneStep(graph, debugger);
   } else {
     ret = LaunchKernel(graph);
   }
@@ -182,28 +403,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
   return ret;
 }
 
-bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
   bool ret = true;
   auto graph_id = graph->graph_id();
   if (!is_first_step_map_[graph_id]) {
     // Normally run graph
-    ret = LaunchKernelDynamic(graph);
+    ret = LaunchKernelDynamic(graph, debugger);  // mock and profiling keep their default (false)
   } else {
     // Mock run first step
-    ret = LaunchKernelDynamic(graph, true, false);
+    ret = LaunchKernelDynamic(graph, debugger, true, false);  // mock = true, profiling = false
     if (ret) {
       // Normally run graph
-      ret = LaunchKernelDynamic(graph);
+      ret = LaunchKernelDynamic(graph, debugger);  // real launch after the mock run succeeded
     } else {
       // Trigger memory swap
-      ret = SearchMemSwapScheme(graph);
+      ret = SearchMemSwapScheme(graph, debugger);  // debugger is forwarded to the launches done by the search
     }
     is_first_step_map_[graph_id] = false;
   }
   return ret;
 }
 
-bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
   MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
   bool ret = false;
   ClearKernelOldOutputAndWorkspace(graph);
@@ -217,7 +438,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
     if (!mem_swap_manager_->RetreatSwapInfo()) {
       return false;
     }
-    ret = LaunchKernelDynamic(graph, true, false);
+    ret = LaunchKernelDynamic(graph, debugger, true, false);
     if (!ret) {
       ClearKernelOldOutputAndWorkspace(graph);
     }
@@ -225,14 +446,14 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
   mem_swap_manager_->AssignHostMemory();
 
   // Time profiling
-  ret = LaunchKernelDynamic(graph, false, true);
+  ret = LaunchKernelDynamic(graph, debugger, false, true);
   if (!ret) {
     return ret;
   }
-  return RefineMemSwapScheme(graph);
+  return RefineMemSwapScheme(graph, debugger);
 }
 
-bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
   MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
   auto &kernels = graph->execution_order();
   for (const auto &kernel : kernels) {
@@ -245,7 +466,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
       bool ret = false;
       while (!ret) {
         mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
-        ret = LaunchKernelDynamic(graph, true, false);
+        ret = LaunchKernelDynamic(graph, debugger, true, false);
         if (!ret) {
           ClearKernelOldOutputAndWorkspace(graph);
           ClearSwapInfo(true);
@@ -384,14 +605,24 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g
   }
 }
 
-bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) {
+bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
+                                           bool profiling) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(mem_reuse_util_);
   // Reset the reference count.
   mem_reuse_util_->ResetDynamicUsedRefCount();
   // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
   AllocCommunicationOpDynamicRes(graph);
+
+#ifdef ENABLE_DEBUGGER
+  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
+  if (!mock) {
+    // collect weights and bias
+    LoadParameters(graph, debugger, dump_enabled);
+  }
+#endif
   auto &kernels = graph->execution_order();
+  int exec_order = 1;
   for (const auto &kernel : kernels) {
     auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
     MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -400,6 +631,12 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
     AddressPtrList kernel_outputs;
     auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
     if (!ret) {
+#ifdef ENABLE_DEBUGGER
+      if (!mock) {
+        // invalidate current data collected by the debugger
+        ClearCurrentData(debugger, dump_enabled);
+      }
+#endif
       return false;
     }
     if (!mock) {
@@ -409,9 +646,21 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
       } else {
         LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
       }
+#ifdef ENABLE_DEBUGGER
+      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
+      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
+                     dump_enabled);
+#endif
     }
+    exec_order = exec_order + 1;
     FreeKernelDynamicRes(kernel);
     if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
+#ifdef ENABLE_DEBUGGER
+      if (!mock) {
+        // invalidate current data collected by the debugger
+        ClearCurrentData(debugger, dump_enabled);
+      }
+#endif
       return false;
     }
   }
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
index 9a210c8e772..8f3cb9cb252 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@@ -38,7 +38,10 @@ class GPUKernelRuntime : public KernelRuntime {
   bool Init() override;
   void ReleaseDeviceRes() override;
   void AssignMemory(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph) override;
+  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+#ifdef ENABLE_DUMP_E2E
+  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+#endif
 
  protected:
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -61,10 +64,11 @@ class GPUKernelRuntime : public KernelRuntime {
   void ClearKernelOutputAddress(const session::KernelGraph *graph);
   void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
   void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
-  bool RunOneStep(const session::KernelGraph *graph);
-  bool SearchMemSwapScheme(const session::KernelGraph *graph);
-  bool RefineMemSwapScheme(const session::KernelGraph *graph);
-  bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
+  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
+                           bool profiling = false);
   void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                      const AddressPtrList &workspace, const AddressPtrList &outputs);
   bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index b51e9127c50..c6f3c6a7351 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -41,7 +41,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }
 
-bool KernelRuntime::Run(session::KernelGraph *graph) {
+bool KernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
   bool ret = false;
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
@@ -72,7 +72,7 @@ bool KernelRuntime::Run(session::KernelGraph *graph) {
 }
 
 // for D to impl
-bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
+bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   if (graph != nullptr) {
     return true;
   }
@@ -190,6 +190,39 @@ void KernelRuntime::RunOpClearMemory(const session::KernelGraph *graph) {
   }
 }
 
+// Reports whether e2e data dump is switched on in the dump configuration.
+// Builds without ENABLE_DUMP_E2E never dump, so they always answer false.
+bool KernelRuntime::DumpDataEnabled() {
+#ifdef ENABLE_DUMP_E2E
+  DumpConfPtr dump_conf = GetDumpConf();
+  // Guard against a null configuration before dereferencing it.
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  const bool dump_flag = dump_conf->dump_enable();
+  return dump_flag;
+#else
+  return false;
+#endif
+}
+
+// Tells whether e2e dump applies to the iteration that follows the stored counter.
+// Without ENABLE_DUMP_E2E, or when dump is disabled, the answer is always false.
+bool KernelRuntime::DumpDataEnabledIteration() {
+#ifdef ENABLE_DUMP_E2E
+  if (!DumpDataEnabled()) {
+    return false;
+  }
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  // The comparison uses the stored iteration counter plus one.
+  const uint32_t next_iter = dump_conf->cur_iter() + 1;
+  // A dump_iter of 0 matches every iteration; otherwise only the equal one does.
+  const bool every_iter = dump_conf->dump_iter() == 0;
+  return every_iter || next_iter == dump_conf->dump_iter();
+#else
+  return false;
+#endif
+}
+
 void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
   AssignStaticMemoryInput(graph);
   AssignStaticMemoryValueNode(graph);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index 3b771b0090d..e56c80bca08 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -55,8 +55,10 @@ class KernelRuntime {
   virtual void AssignMemory(session::KernelGraph *graph);
   void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph);
-  virtual bool DumpData(session::KernelGraph *graph);
+  bool DumpDataEnabled();
+  bool DumpDataEnabledIteration();
+  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr);
+  virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
   virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool RunTask(const session::KernelGraph *graph);
   virtual bool GenTask(const session::KernelGraph *graph);