enable debugger by default and set correct log message severity

This commit is contained in:
John Tzanakakis 2020-09-09 18:55:10 -04:00
parent 7b3873559f
commit b0a7ebdeb0
13 changed files with 106 additions and 95 deletions

View File

@ -56,7 +56,7 @@ usage()
echo " -K Compile with AKG, default on"
echo " -s Enable serving module, default off"
echo " -w Enable acl module, default off"
echo " -B Enable debugger, default off"
echo " -B Enable debugger, default on"
echo " -E Enable IBVERBS for parameter server, default off"
echo " -l Compile with python dependency, default on"
}
@ -102,7 +102,7 @@ checkopts()
ENABLE_AKG="on"
ENABLE_SERVING="off"
ENABLE_ACL="off"
ENABLE_DEBUGGER="off"
ENABLE_DEBUGGER="on"
ENABLE_IBVERBS="off"
ENABLE_PYTHON="on"
ENABLE_GPU="off"
@ -282,8 +282,7 @@ checkopts()
;;
B)
check_on_off $OPTARG B
ENABLE_DEBUGGER="on"
echo "enable debugger"
ENABLE_DEBUGGER="$OPTARG"
;;
E)
ENABLE_IBVERBS="on"

View File

@ -16,9 +16,6 @@
#include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace kernel {
@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
output[i] = val[i];
}
#ifdef ENABLE_DEBUGGER
// debugger will suspend execution if necessary
Debugger::GetInstance()->PostDebugOp();
#endif
return true;
}
} // namespace kernel

View File

@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
MS_EXCEPTION_IF_NULL(kernel_prev);
#ifdef ENABLE_DEBUGGER
auto debugger_ = mindspore::Debugger::GetInstance();
DebugServices *debug_services = debugger_->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
std::string current_kernel_name = kernel_curr->scope_full_name();
if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
return false;
if (debugger_->DebuggerBackendEnabled()) {
DebugServices *debug_services = debugger_->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
std::string current_kernel_name = kernel_curr->scope_full_name();
if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
return false;
}
}
#endif
auto curr_stream_id = kernel_curr->stream_id();

View File

@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
#ifdef ENABLE_DEBUGGER
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
// TensorData will be freed up here
tensor_loader->EmptyTensor();
uint32_t iter_num = tensor_loader->GetIterNum();
tensor_loader->set_iter_num(++iter_num);
(void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
tensor_loader->EmptyPrevTensor();
if (debugger_->DebuggerBackendEnabled()) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
// TensorData will be freed up here
tensor_loader->EmptyTensor();
uint32_t iter_num = tensor_loader->GetIterNum();
tensor_loader->set_iter_num(++iter_num);
(void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
tensor_loader->EmptyPrevTensor();
}
#endif
MS_LOG(INFO) << "Finish!";
}

View File

@ -26,9 +26,6 @@
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "frontend/parallel/ps/util.h"
#endif
@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
summary_outputs = kernel_graph->summary_nodes();
runtime_.IncreaseSummaryRefCount(summary_outputs);
}
#ifdef ENABLE_DEBUGGER
// debugger pre-execution processing
if (debugger_) {
debugger_->PreExecute(kernel_graph);
}
#endif
bool ret = runtime_.Run(kernel_graph.get(), false);
if (!ret) {
MS_LOG(EXCEPTION) << "Run graph failed";
@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
runtime_.DecreaseSummaryRefCount(summary_outputs);
}
#ifdef ENABLE_DEBUGGER
// debugger post-execution processing
if (debugger_) {
debugger_->PostExecute();
}
#endif
MS_LOG(INFO) << "Run graph end";
}

View File

@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info
#ifdef ENABLE_DEBUGGER
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
#ifdef ENABLE_DUMP_E2E
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
}
#endif
}

View File

@ -80,25 +80,16 @@ void Debugger::EnableDebugger() {
grpc_client_ = nullptr;
debug_services_ = nullptr;
// see if dump is enabled
bool dump_enabled = false;
if (device_target_ == kGPUDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
dump_enabled = runtime_instance->DumpDataEnabled();
}
// see if dump using debugger backend is enabled
bool dump_enabled = CheckDebuggerDumpEnabled();
MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;
// get env variables to configure debugger
const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
if (env_enable_str != nullptr) {
MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str;
if (std::strcmp(env_enable_str, "1") == 0) {
debugger_enabled_ = true;
}
}
// check if debugger enabled
debugger_enabled_ = CheckDebuggerEnabled();
MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;
if (!debugger_enabled_ && !dump_enabled) {
MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
return;
}
@ -109,7 +100,7 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
host = std::string(env_host_str);
} else {
MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
host = "localhost";
}
// configure grpc port
@ -119,7 +110,7 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
port = std::string(env_port_str);
} else {
MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
port = "50051";
}
@ -140,8 +131,8 @@ void Debugger::EnableDebugger() {
MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
#ifdef ENABLE_D
// set operation overflow info
@ -180,6 +171,29 @@ void Debugger::EnableDebugger() {
debug_services_ = std::make_unique<DebugServices>();
}
bool Debugger::CheckDebuggerDumpEnabled() {
  // Dump through the debugger backend is only queried for the GPU target;
  // every other device target reports false.
  if (device_target_ != kGPUDevice) {
    return false;
  }
  auto gpu_runtime = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(gpu_runtime);
  // The answer comes from the runtime's own dump setting.
  return gpu_runtime->DumpDataEnabled();
}
bool Debugger::CheckDebuggerEnabled() {
  // The debugger counts as enabled only when the environment variable
  // ENABLE_MS_DEBUGGER exists and is exactly the string "1".
  const char *enable_flag = std::getenv("ENABLE_MS_DEBUGGER");
  return enable_flag != nullptr && std::strcmp(enable_flag, "1") == 0;
}
bool Debugger::DebuggerBackendEnabled() {
  // True when either feature that relies on the debugger backend is active.
  // The dump check runs first; the environment check is only reached when
  // dump is not enabled.
  if (CheckDebuggerDumpEnabled()) {
    return true;
  }
  return CheckDebuggerEnabled();
}
void Debugger::Reset() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@ -201,25 +215,29 @@ void Debugger::Reset() {
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// check and save graph_ptr, suspend if graph is new
CheckGraphPtr(graph_ptr);
if (debugger_->DebuggerBackendEnabled()) {
// check and save graph_ptr, suspend if graph is new
CheckGraphPtr(graph_ptr);
}
}
void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// analyze tensor data and send the watchpoints that have been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpointsAndSuspend(CheckWatchpoints());
} else {
CommandLoop();
if (debugger_->DebuggerBackendEnabled()) {
// analyze tensor data and send the watchpoints that have been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpointsAndSuspend(CheckWatchpoints());
} else {
CommandLoop();
}
}
}
}
@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() {
auto node_name = AnfAlgo::GetCNodeName(node);
MS_LOG(INFO) << "node: " << node->fullname_with_scope();
if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
<< node_name;
MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
<< node_name;
is_dataset_graph_ = true;
return;
}

View File

@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();
// check if any feature that uses the debugger backend is enabled
bool DebuggerBackendEnabled();
private:
// private constructor for singleton
Debugger();
@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// read env variable for grpc client
void EnableDebugger();
// check if dump using debugger backend is enabled
bool CheckDebuggerDumpEnabled();
// check if debugger enabled
bool CheckDebuggerEnabled();
// check and save graph pointer
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);

View File

@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime {
~AscendKernelRuntime() override;
bool Init() override;
bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
bool LoadData(session::KernelGraph *graph, Debugger *debugger);
bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
bool GenTask(const session::KernelGraph *graph);
bool LoadTask(const session::KernelGraph *graph);
bool RunTask(const session::KernelGraph *graph);

View File

@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() {
#ifdef ENABLE_DEBUGGER
auto debugger = mindspore::Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
});
if (debugger->DebuggerBackendEnabled()) {
std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
});
}
#endif
MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
}

View File

@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d
return false;
}
bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) {
  // Base-class implementation: neither argument is used and the result is
  // always false.
  return false;
}
bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
MS_EXCEPTION_IF_NULL(kernel);
if (AnfAlgo::OutputAddrExist(kernel, index)) {

View File

@ -59,6 +59,7 @@ class KernelRuntime {
bool DumpDataEnabled();
bool DumpDataEnabledIteration();
virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
bool LaunchKernel(const session::KernelGraph *graph);

View File

@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_TASK_SINK, true);
set_param<bool>(MS_CTX_IR_FUSION_FLAG, true);
set_param<bool>(MS_CTX_ENABLE_HCCL, false);
#ifdef ENABLE_DEBUGGER
set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
#else
set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true);
#endif
set_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY, true);
set_param<bool>(MS_CTX_PRECOMPILE_ONLY, false);
set_param<bool>(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);