!5970 enable debugger by default and set correct log message severity

Merge pull request !5970 from john_tzanakakis/master_ms1_grpc
2020-09-11 20:43:21 +08:00 · 2020-09-11 20:43:21 +08:00 · 939737c017
parent d8c09067ce b0a7ebdeb0
commit 939737c017
13 changed files with 106 additions and 95 deletions
--- a/build.sh
+++ b/build.sh
@ -56,7 +56,7 @@ usage()
  echo "    -K Compile with AKG, default on"
  echo "    -s Enable serving module, default off"
  echo "    -w Enable acl module, default off"
-  echo "    -B Enable debugger, default off"
+  echo "    -B Enable debugger, default on"
  echo "    -E Enable IBVERBS for parameter server, default off"
  echo "    -l Compile with python dependency, default on"
 }
@ -102,7 +102,7 @@ checkopts()
  ENABLE_AKG="on"
  ENABLE_SERVING="off"
  ENABLE_ACL="off"
-  ENABLE_DEBUGGER="off"
+  ENABLE_DEBUGGER="on"
  ENABLE_IBVERBS="off"
  ENABLE_PYTHON="on"
  ENABLE_GPU="off"
@ -282,8 +282,7 @@ checkopts()
        ;;
      B)
        check_on_off $OPTARG B
-        ENABLE_DEBUGGER="on"
-        echo "enable debugger"
+        ENABLE_DEBUGGER="$OPTARG"
        ;;
      E)
        ENABLE_IBVERBS="on"
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc
@ -16,9 +16,6 @@
 #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
 #include "runtime/device/cpu/cpu_device_address.h"
 #include "utils/ms_utils.h"
-#ifdef ENABLE_DEBUGGER
-#include "debug/debugger/debugger.h"
-#endif

 namespace mindspore {
 namespace kernel {
@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
    output[i] = val[i];
  }

-#ifdef ENABLE_DEBUGGER
-  // debugger will suspend execution is neccessary
-  Debugger::GetInstance()->PostDebugOp();
-#endif
-
  return true;
 }
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
  MS_EXCEPTION_IF_NULL(kernel_prev);
 #ifdef ENABLE_DEBUGGER
  auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
-  auto watchpoint_table = debug_services->GetWatchpointTable();
-  std::string current_kernel_name = kernel_curr->scope_full_name();
-  if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
-    return false;
+  if (debugger_->DebuggerBackendEnabled()) {
+    DebugServices *debug_services = debugger_->debug_services();
+    auto watchpoint_table = debug_services->GetWatchpointTable();
+    std::string current_kernel_name = kernel_curr->scope_full_name();
+    if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
+      return false;
+    }
  }
 #endif
  auto curr_stream_id = kernel_curr->stream_id();
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
 #ifdef ENABLE_DEBUGGER
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  DebugServices *debug_services = debugger_->debug_services();
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
-  // TensorData will be freed up here
-  tensor_loader->EmptyTensor();
-  uint32_t iter_num = tensor_loader->GetIterNum();
-  tensor_loader->set_iter_num(++iter_num);
-  (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
-  tensor_loader->EmptyPrevTensor();
+  if (debugger_->DebuggerBackendEnabled()) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    DebugServices *debug_services = debugger_->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    // TensorData will be freed up here
+    tensor_loader->EmptyTensor();
+    uint32_t iter_num = tensor_loader->GetIterNum();
+    tensor_loader->set_iter_num(++iter_num);
+    (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
+    tensor_loader->EmptyPrevTensor();
+  }
 #endif
  MS_LOG(INFO) << "Finish!";
 }
--- a/mindspore/ccsrc/backend/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/session/cpu_session.cc
@ -26,9 +26,6 @@
 #include "backend/optimizer/common/optimizer.h"
 #include "backend/optimizer/common/pass_manager.h"
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
-#ifdef ENABLE_DEBUGGER
-#include "debug/debugger/debugger.h"
-#endif
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
 #include "frontend/parallel/ps/util.h"
 #endif
@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
    summary_outputs = kernel_graph->summary_nodes();
    runtime_.IncreaseSummaryRefCount(summary_outputs);
  }
-#ifdef ENABLE_DEBUGGER
-  // debugger pre-execution processing
-  if (debugger_) {
-    debugger_->PreExecute(kernel_graph);
-  }
-#endif
+
  bool ret = runtime_.Run(kernel_graph.get(), false);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Run graph failed";
@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
    runtime_.DecreaseSummaryRefCount(summary_outputs);
  }

-#ifdef ENABLE_DEBUGGER
-  // debugger post-execution processing
-  if (debugger_) {
-    debugger_->PostExecute();
-  }
-#endif
  MS_LOG(INFO) << "Run graph end";
 }

--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info
 #ifdef ENABLE_DEBUGGER
 void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
 #ifdef ENABLE_DUMP_E2E
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
+  if (debugger_->DebuggerBackendEnabled()) {  // dump only when a debugger-backend feature is on; NOTE(review): debugger_ is dereferenced without a null check - confirm it is always set here
+    MS_EXCEPTION_IF_NULL(kernel_graph);
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());  // result of DumpData is deliberately discarded
+  }
 #endif
 }

--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -80,25 +80,16 @@ void Debugger::EnableDebugger() {
  grpc_client_ = nullptr;
  debug_services_ = nullptr;

-  // see if dump is enabled
-  bool dump_enabled = false;
-  if (device_target_ == kGPUDevice) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    dump_enabled = runtime_instance->DumpDataEnabled();
-  }
+  // see if dump using debugger backend is enabled
+  bool dump_enabled = CheckDebuggerDumpEnabled();
+  MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;

-  // get env variables to configure debugger
-  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
-  if (env_enable_str != nullptr) {
-    MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str;
-    if (std::strcmp(env_enable_str, "1") == 0) {
-      debugger_enabled_ = true;
-    }
-  }
+  // check if debugger enabled
+  debugger_enabled_ = CheckDebuggerEnabled();
+  MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;

  if (!debugger_enabled_ && !dump_enabled) {
-    MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
+    MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
    return;
  }

@ -109,7 +100,7 @@ void Debugger::EnableDebugger() {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
    host = std::string(env_host_str);
  } else {
-    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
+    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
    host = "localhost";
  }
  // configure grpc port
@ -119,7 +110,7 @@ void Debugger::EnableDebugger() {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
    port = std::string(env_port_str);
  } else {
-    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
+    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
    port = "50051";
  }

@ -140,8 +131,8 @@ void Debugger::EnableDebugger() {
    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
  } else {
-    MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
-                       "usage for large models.";
+    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                    "usage for large models.";
  }
 #ifdef ENABLE_D
  // set operation overflow info
@ -180,6 +171,29 @@ void Debugger::EnableDebugger() {
  debug_services_ = std::make_unique<DebugServices>();
 }

+bool Debugger::CheckDebuggerDumpEnabled() {
+  // dump is only queried for the GPU target (via the runtime's DumpDataEnabled()); every other target reports false
+  if (device_target_ == kGPUDevice) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    return runtime_instance->DumpDataEnabled();
+  }
+  return false;
+}
+
+bool Debugger::CheckDebuggerEnabled() {
+  // debugger is on only when environment variable ENABLE_MS_DEBUGGER is exactly "1"; unset or any other value means off
+  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
+  if (env_enable_str != nullptr) {
+    if (std::strcmp(env_enable_str, "1") == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }  // true when either dump or the debugger is on; evaluated afresh on each call, no cached flag
+
 void Debugger::Reset() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
@ -201,25 +215,29 @@ void Debugger::Reset() {
 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
-  // check and save graph_ptr, suspend if graph is new
-  CheckGraphPtr(graph_ptr);
+  if (debugger_->DebuggerBackendEnabled()) {  // NOTE(review): called through debugger_ rather than this - confirm both refer to the same instance
+    // check and save graph_ptr, suspend if graph is new (skipped when the debugger backend is off)
+    CheckGraphPtr(graph_ptr);
+  }
 }

 void Debugger::PostExecute() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
-  // analyze tensor data and send the watchpoints been hit
-  if (run_level_ == "node") {
-    MS_LOG(INFO) << "Debugger is in node level mode ";
-    return;
-  }
-  if (debugger_enabled_ && !is_dataset_graph_) {
-    if (device_target_ != kGPUDevice) {
-      num_step_++;
-      MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
-      SendWatchpointsAndSuspend(CheckWatchpoints());
-    } else {
-      CommandLoop();
+  if (debugger_->DebuggerBackendEnabled()) {  // NOTE(review): called through debugger_ rather than this - confirm both refer to the same instance
+    // analyze tensor data and send the watchpoints that have been hit; nothing is done here in "node" run level
+    if (run_level_ == "node") {
+      MS_LOG(INFO) << "Debugger is in node level mode ";
+      return;
+    }
+    if (debugger_enabled_ && !is_dataset_graph_) {
+      if (device_target_ != kGPUDevice) {
+        num_step_++;
+        MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
+        SendWatchpointsAndSuspend(CheckWatchpoints());
+      } else {
+        CommandLoop();
+      }
    }
  }
 }
@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() {
    auto node_name = AnfAlgo::GetCNodeName(node);
    MS_LOG(INFO) << "node: " << node->fullname_with_scope();
    if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
-      MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
-                      << node_name;
+      MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
+                   << node_name;
      is_dataset_graph_ = true;
      return;
    }
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();

+  // check if any feature that uses the debugger backend is enabled
+  bool DebuggerBackendEnabled();
+
 private:
  // private constructor for singleton
  Debugger();
@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // read env variable for grpc client
  void EnableDebugger();

+  // check if dump using debugger backend is enabled
+  bool CheckDebuggerDumpEnabled();
+
+  // check if debugger enabled
+  bool CheckDebuggerEnabled();
+
  // check and save graph pointer
  void CheckGraphPtr(const KernelGraphPtr &graph_ptr);

--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime {
  ~AscendKernelRuntime() override;
  bool Init() override;
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
-  bool LoadData(session::KernelGraph *graph, Debugger *debugger);
+  bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
  bool GenTask(const session::KernelGraph *graph);
  bool LoadTask(const session::KernelGraph *graph);
  bool RunTask(const session::KernelGraph *graph);
--- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() {
 #ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
-  std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
-  // extract stream id, task id and opname from runtime_info_map for overflow detection
-  std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
-                 std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
-                 [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-                   -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
-                   return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
-                 });
+  if (debugger->DebuggerBackendEnabled()) {
+    std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
+    // extract stream id, task id and opname from runtime_info_map for overflow detection
+    std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
+                   std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
+                   [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
+                     -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
+                     return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
+                   });
+  }
 #endif
  MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
 }
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d
  return false;
 }

+bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; }  // base implementation loads nothing and reports false; AscendKernelRuntime overrides it
+
 bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
  MS_EXCEPTION_IF_NULL(kernel);
  if (AnfAlgo::OutputAddrExist(kernel, index)) {
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@ -59,6 +59,7 @@ class KernelRuntime {
  bool DumpDataEnabled();
  bool DumpDataEnabledIteration();
  virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
+  virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
  bool LaunchKernel(const session::KernelGraph *graph);
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
  set_param<bool>(MS_CTX_ENABLE_TASK_SINK, true);
  set_param<bool>(MS_CTX_IR_FUSION_FLAG, true);
  set_param<bool>(MS_CTX_ENABLE_HCCL, false);
-#ifdef ENABLE_DEBUGGER
-  set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
-#else
  set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true);
-#endif
  set_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY, true);
  set_param<bool>(MS_CTX_PRECOMPILE_ONLY, false);
  set_param<bool>(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);