diff --git a/build.sh b/build.sh index efed9fec892..3a6e08a7f87 100755 --- a/build.sh +++ b/build.sh @@ -56,7 +56,7 @@ usage() echo " -K Compile with AKG, default on" echo " -s Enable serving module, default off" echo " -w Enable acl module, default off" - echo " -B Enable debugger, default off" + echo " -B Enable debugger, default on" echo " -E Enable IBVERBS for parameter server, default off" echo " -l Compile with python dependency, default on" } @@ -102,7 +102,7 @@ checkopts() ENABLE_AKG="on" ENABLE_SERVING="off" ENABLE_ACL="off" - ENABLE_DEBUGGER="off" + ENABLE_DEBUGGER="on" ENABLE_IBVERBS="off" ENABLE_PYTHON="on" ENABLE_GPU="off" @@ -282,8 +282,7 @@ checkopts() ;; B) check_on_off $OPTARG B - ENABLE_DEBUGGER="on" - echo "enable debugger" + ENABLE_DEBUGGER="$OPTARG" ;; E) ENABLE_IBVERBS="on" diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc index 6bbf6c8a5de..81161d56aef 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc @@ -16,9 +16,6 @@ #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h" #include "runtime/device/cpu/cpu_device_address.h" #include "utils/ms_utils.h" -#ifdef ENABLE_DEBUGGER -#include "debug/debugger/debugger.h" -#endif namespace mindspore { namespace kernel { @@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector &inputs, output[i] = val[i]; } -#ifdef ENABLE_DEBUGGER - // debugger will suspend execution is neccessary - Debugger::GetInstance()->PostDebugOp(); -#endif - return true; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc index e791d318fae..427b6b776de 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc @@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr MS_EXCEPTION_IF_NULL(kernel_prev); #ifdef ENABLE_DEBUGGER auto debugger_ = mindspore::Debugger::GetInstance(); - DebugServices *debug_services = debugger_->debug_services(); - auto watchpoint_table = debug_services->GetWatchpointTable(); - std::string current_kernel_name = kernel_curr->scope_full_name(); - if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) { - return false; + if (debugger_->DebuggerBackendEnabled()) { + DebugServices *debug_services = debugger_->debug_services(); + auto watchpoint_table = debug_services->GetWatchpointTable(); + std::string current_kernel_name = kernel_curr->scope_full_name(); + if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) { + return false; + } } #endif auto curr_stream_id = kernel_curr->stream_id(); diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index adb35257e18..b10ac2c55a0 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr &kernel_graph) MS_LOG(INFO) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); #ifdef ENABLE_DEBUGGER - auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); - MS_EXCEPTION_IF_NULL(runtime_instance); - DebugServices *debug_services = debugger_->debug_services(); - TensorLoader *tensor_loader = debug_services->tensor_loader(); - // TensorData will be freed up here - tensor_loader->EmptyTensor(); - uint32_t iter_num = tensor_loader->GetIterNum(); - tensor_loader->set_iter_num(++iter_num); - (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get()); - tensor_loader->EmptyPrevTensor(); + if (debugger_->DebuggerBackendEnabled()) { + auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + DebugServices *debug_services = debugger_->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + // TensorData will be freed up here + tensor_loader->EmptyTensor(); + uint32_t iter_num = tensor_loader->GetIterNum(); + tensor_loader->set_iter_num(++iter_num); + (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get()); + tensor_loader->EmptyPrevTensor(); + } #endif MS_LOG(INFO) << "Finish!"; } diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index ce24f8b6f44..3e4388bb338 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -26,9 +26,6 @@ #include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/pass_manager.h" #include "backend/optimizer/pass/replace_node_by_proxy.h" -#ifdef ENABLE_DEBUGGER -#include "debug/debugger/debugger.h" -#endif #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) #include "frontend/parallel/ps/util.h" #endif @@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vectorsummary_nodes(); runtime_.IncreaseSummaryRefCount(summary_outputs); } -#ifdef ENABLE_DEBUGGER - // debugger pre-execution processing - if (debugger_) { - debugger_->PreExecute(kernel_graph); - } -#endif + bool ret = runtime_.Run(kernel_graph.get(), false); if (!ret) { MS_LOG(EXCEPTION) << "Run graph failed"; @@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vectorPostExecute(); - } -#endif MS_LOG(INFO) << "Run graph end"; } diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 104a46bc719..74eb25cf33e 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info #ifdef ENABLE_DEBUGGER void GPUSession::Dump(const std::shared_ptr &kernel_graph) const { #ifdef ENABLE_DUMP_E2E - MS_EXCEPTION_IF_NULL(kernel_graph); - auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); - MS_EXCEPTION_IF_NULL(runtime_instance); - (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get()); + if (debugger_->DebuggerBackendEnabled()) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get()); + } #endif } diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 9383a4a39d8..560df790cfc 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -80,25 +80,16 @@ void Debugger::EnableDebugger() { grpc_client_ = nullptr; debug_services_ = nullptr; - // see if dump is enabled - bool dump_enabled = false; - if (device_target_ == kGPUDevice) { - auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); - MS_EXCEPTION_IF_NULL(runtime_instance); - dump_enabled = runtime_instance->DumpDataEnabled(); - } + // see if dump using debugger backend is enabled + bool dump_enabled = CheckDebuggerDumpEnabled(); + MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled; - // get env variables to configure debugger - const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER"); - if (env_enable_str != nullptr) { - MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str; - if (std::strcmp(env_enable_str, "1") == 0) { - debugger_enabled_ = true; - } - } + // check if debugger enabled + debugger_enabled_ = CheckDebuggerEnabled(); + MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_; if (!debugger_enabled_ && !dump_enabled) { - MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger."; + MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger."; return; } @@ -109,7 +100,7 @@ void Debugger::EnableDebugger() { MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str; host = std::string(env_host_str); } else { - MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost"; + MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost"; host = "localhost"; } // configure grpc port @@ -119,7 +110,7 @@ void Debugger::EnableDebugger() { MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str; port = std::string(env_port_str); } else { - MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051"; + MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051"; port = "50051"; } @@ -140,8 +131,8 @@ void Debugger::EnableDebugger() { MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first " "step. 2. Tensor values are only available for nodes that are watched by any watchpoint."; } else { - MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory " - "usage for large models."; + MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory " + "usage for large models."; } #ifdef ENABLE_D // set operation overflow info @@ -180,6 +171,29 @@ void Debugger::EnableDebugger() { debug_services_ = std::make_unique(); } +bool Debugger::CheckDebuggerDumpEnabled() { + // see if dump is enabled + if (device_target_ == kGPUDevice) { + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + return runtime_instance->DumpDataEnabled(); + } + return false; +} + +bool Debugger::CheckDebuggerEnabled() { + // get env variables to configure debugger + const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER"); + if (env_enable_str != nullptr) { + if (std::strcmp(env_enable_str, "1") == 0) { + return true; + } + } + return false; +} + +bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); } + void Debugger::Reset() { // access lock for public method std::lock_guard a_lock(access_lock_); @@ -201,25 +215,29 @@ void Debugger::Reset() { void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { // access lock for public method std::lock_guard a_lock(access_lock_); - // check and save graph_ptr, suspend if graph is new - CheckGraphPtr(graph_ptr); + if (debugger_->DebuggerBackendEnabled()) { + // check and save graph_ptr, suspend if graph is new + CheckGraphPtr(graph_ptr); + } } void Debugger::PostExecute() { // access lock for public method std::lock_guard a_lock(access_lock_); - // analyze tensor data and send the watchpoints been hit - if (run_level_ == "node") { - MS_LOG(INFO) << "Debugger is in node level mode "; - return; - } - if (debugger_enabled_ && !is_dataset_graph_) { - if (device_target_ != kGPUDevice) { - num_step_++; - MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_; - SendWatchpointsAndSuspend(CheckWatchpoints()); - } else { - CommandLoop(); + if (debugger_->DebuggerBackendEnabled()) { + // analyze tensor data and send the watchpoints been hit + if (run_level_ == "node") { + MS_LOG(INFO) << "Debugger is in node level mode "; + return; + } + if (debugger_enabled_ && !is_dataset_graph_) { + if (device_target_ != kGPUDevice) { + num_step_++; + MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_; + SendWatchpointsAndSuspend(CheckWatchpoints()); + } else { + CommandLoop(); + } } } } @@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() { auto node_name = AnfAlgo::GetCNodeName(node); MS_LOG(INFO) << "node: " << node->fullname_with_scope(); if (node_name == "GetNext" || node_name == "InitDataSetQueue") { - MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node " - << node_name; + MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node " + << node_name; is_dataset_graph_ = true; return; } diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 58f49de454a..53e55f65761 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this { std::map, std::string> &GetStreamTaskToOpnameMap(); + // check if any feature that uses the debugger backend is enabled + bool DebuggerBackendEnabled(); + private: // private constructor for singleton Debugger(); @@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this { // read env variable for grpc client void EnableDebugger(); + // check if dump using debugger backend is enabled + bool CheckDebuggerDumpEnabled(); + + // check if debugger enabled + bool CheckDebuggerEnabled(); + // check and save graph pointer void CheckGraphPtr(const KernelGraphPtr &graph_ptr); diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index 8afe6a39ca2..9dc43f22780 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime { ~AscendKernelRuntime() override; bool Init() override; bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override; - bool LoadData(session::KernelGraph *graph, Debugger *debugger); + bool LoadData(session::KernelGraph *graph, Debugger *debugger) override; bool GenTask(const session::KernelGraph *graph); bool LoadTask(const session::KernelGraph *graph); bool RunTask(const session::KernelGraph *graph); diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index c40322e3bb0..3c55d8fb2a0 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() { #ifdef ENABLE_DEBUGGER auto debugger = mindspore::Debugger::GetInstance(); MS_EXCEPTION_IF_NULL(debugger); - std::map, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap(); - // extract stream id, task id and opname from runtime_info_map for overflow detection - std::transform(runtime_info_map_.begin(), runtime_info_map_.end(), - std::inserter(stream_task_to_opname, stream_task_to_opname.end()), - [](const std::pair> &p) - -> std::pair, std::string> { - return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first}; - }); + if (debugger->DebuggerBackendEnabled()) { + std::map, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap(); + // extract stream id, task id and opname from runtime_info_map for overflow detection + std::transform(runtime_info_map_.begin(), runtime_info_map_.end(), + std::inserter(stream_task_to_opname, stream_task_to_opname.end()), + [](const std::pair> &p) + -> std::pair, std::string> { + return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first}; + }); + } #endif MS_LOG(INFO) << "[DataDump] LoadDumpInfo end"; } diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index 74809a3329f..af14c239bbd 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d return false; } +bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; } + bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) { MS_EXCEPTION_IF_NULL(kernel); if (AnfAlgo::OutputAddrExist(kernel, index)) { diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 1906e778f37..5c87e0998bf 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -59,6 +59,7 @@ class KernelRuntime { bool DumpDataEnabled(); bool DumpDataEnabledIteration(); virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr); + virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger); virtual bool Load(session::KernelGraph *graph, bool is_task_sink); virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0; bool LaunchKernel(const session::KernelGraph *graph); diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index 4e73fd82420..2b7a2455da3 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { set_param(MS_CTX_ENABLE_TASK_SINK, true); set_param(MS_CTX_IR_FUSION_FLAG, true); set_param(MS_CTX_ENABLE_HCCL, false); -#ifdef ENABLE_DEBUGGER - set_param(MS_CTX_ENABLE_MEM_REUSE, false); -#else set_param(MS_CTX_ENABLE_MEM_REUSE, true); -#endif set_param(MS_CTX_ENABLE_GPU_SUMMARY, true); set_param(MS_CTX_PRECOMPILE_ONLY, false); set_param(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);