diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 325c3e0f66a..5712d9e9d9c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -344,7 +344,7 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) { return false; } -void Debugger::PostExecuteNode(const CNodePtr &kernel) { +void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) { // access lock for public method std::lock_guard<std::mutex> a_lock(access_lock_); if (pipeline::ExecutorPy::GetDebugTerminate()) { @@ -363,8 +363,9 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel) { hit_empty_flag = false; } } - if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { + if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_) && !last_kernel) { // if kernel is not watchpoint and is next_to or continue_to node, suspend + // No need to suspend if this is the last node in graph since PostExecute suspends at the end of graph CommandLoop(); } return; diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 19feb29f18c..efddb4cd08d 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -83,7 +83,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> { bool ReadNodeDataRequired(const CNodePtr &kernel); - void PostExecuteNode(const CNodePtr &kernel); + void PostExecuteNode(const CNodePtr &kernel, bool last_kernel); // suspend the execution after a debug_op void PostDebugOp(); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index e57107e1969..37746837836 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -104,7 +104,7 @@ void LoadKernelData(Debugger *debugger, 
const CNodePtr &kernel, const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs, const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces, const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr, - bool dump_enabled) { + bool dump_enabled, bool last_kernel) { // check if we should read the kernel data bool read_data = false; auto &dump_json_parser = DumpJsonParser::GetInstance(); @@ -171,7 +171,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, } } } - debugger->PostExecuteNode(kernel); + debugger->PostExecuteNode(kernel, last_kernel); } } // namespace @@ -578,6 +578,19 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g } } +CNodePtr GetLastKernel(const session::KernelGraph *graph) { + const auto &kernels = graph->execution_order(); + CNodePtr last_kernel; + for (const auto &kernel : kernels) { + if (AnfAlgo::IsInplaceNode(kernel, "skip")) { + continue; + } else { + last_kernel = kernel; + } + } + return last_kernel; +} + bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(mem_reuse_util_); @@ -602,7 +615,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); profiler_inst->SetStepTraceOpName(profiling_trace); } - + CNodePtr last_kernel = GetLastKernel(graph); for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -658,7 +671,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, - dump_enabled, kernel == last_kernel); } exec_order = exec_order + 1; FreeKernelDynamicRes(kernel);