diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 325c3e0f66a..5712d9e9d9c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -344,7 +344,7 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) { return false; } -void Debugger::PostExecuteNode(const CNodePtr &kernel) { +void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) { // access lock for public method std::lock_guard<std::mutex> a_lock(access_lock_); if (pipeline::ExecutorPy::GetDebugTerminate()) { @@ -363,8 +363,9 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel) { hit_empty_flag = false; } } - if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { + if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_) && !last_kernel) { // if kernel is not watchpoint and is next_to or continue_to node, suspend + // No need to suspend if this is the last node in graph since PostExecute suspends at the end of graph CommandLoop(); } return; diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 19feb29f18c..efddb4cd08d 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -83,7 +83,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> { bool ReadNodeDataRequired(const CNodePtr &kernel); - void PostExecuteNode(const CNodePtr &kernel); + void PostExecuteNode(const CNodePtr &kernel, bool last_kernel); // suspend the execution after a debug_op void PostDebugOp(); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index e57107e1969..37746837836 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -104,7 +104,7 @@ void LoadKernelData(Debugger *debugger, 
const CNodePtr &kernel, const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs, const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces, const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr, - bool dump_enabled) { + bool dump_enabled, bool last_kernel) { // check if we should read the kernel data bool read_data = false; auto &dump_json_parser = DumpJsonParser::GetInstance(); @@ -171,7 +171,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, } } } - debugger->PostExecuteNode(kernel); + debugger->PostExecuteNode(kernel, last_kernel); } } // namespace @@ -578,6 +578,19 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g } } +CNodePtr GetLastKernel(const session::KernelGraph *graph) { + const auto &kernels = graph->execution_order(); + CNodePtr last_kernel; + for (const auto &kernel : kernels) { + if (AnfAlgo::IsInplaceNode(kernel, "skip")) { + continue; + } else { + last_kernel = kernel; + } + } + return last_kernel; +} + bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(mem_reuse_util_); @@ -602,7 +615,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); profiler_inst->SetStepTraceOpName(profiling_trace); } - + CNodePtr last_kernel = GetLastKernel(graph); for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -658,7 +671,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, - dump_enabled, kernel == last_kernel); } exec_order = exec_order + 1; FreeKernelDynamicRes(kernel);