forked from OSSInnovation/mindspore
!5970 enable debugger by default and set correct log message severity
Merge pull request !5970 from john_tzanakakis/master_ms1_grpc
commit 939737c017
build.sh | 7

@@ -56,7 +56,7 @@ usage()
     echo " -K Compile with AKG, default on"
     echo " -s Enable serving module, default off"
     echo " -w Enable acl module, default off"
-    echo " -B Enable debugger, default off"
+    echo " -B Enable debugger, default on"
     echo " -E Enable IBVERBS for parameter server, default off"
     echo " -l Compile with python dependency, default on"
 }
@@ -102,7 +102,7 @@ checkopts()
   ENABLE_AKG="on"
   ENABLE_SERVING="off"
   ENABLE_ACL="off"
-  ENABLE_DEBUGGER="off"
+  ENABLE_DEBUGGER="on"
   ENABLE_IBVERBS="off"
   ENABLE_PYTHON="on"
   ENABLE_GPU="off"
@@ -282,8 +282,7 @@ checkopts()
       ;;
     B)
       check_on_off $OPTARG B
-      ENABLE_DEBUGGER="on"
-      echo "enable debugger"
+      ENABLE_DEBUGGER="$OPTARG"
      ;;
    E)
      ENABLE_IBVERBS="on"

@@ -16,9 +16,6 @@
 #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
 #include "runtime/device/cpu/cpu_device_address.h"
 #include "utils/ms_utils.h"
-#ifdef ENABLE_DEBUGGER
-#include "debug/debugger/debugger.h"
-#endif
 
 namespace mindspore {
 namespace kernel {
@@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
     output[i] = val[i];
   }
 
-#ifdef ENABLE_DEBUGGER
-  // debugger will suspend execution is neccessary
-  Debugger::GetInstance()->PostDebugOp();
-#endif
-
   return true;
 }
 }  // namespace kernel

@@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
   MS_EXCEPTION_IF_NULL(kernel_prev);
 #ifdef ENABLE_DEBUGGER
   auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
-  auto watchpoint_table = debug_services->GetWatchpointTable();
-  std::string current_kernel_name = kernel_curr->scope_full_name();
-  if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
-    return false;
+  if (debugger_->DebuggerBackendEnabled()) {
+    DebugServices *debug_services = debugger_->debug_services();
+    auto watchpoint_table = debug_services->GetWatchpointTable();
+    std::string current_kernel_name = kernel_curr->scope_full_name();
+    if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
+      return false;
+    }
   }
 #endif
   auto curr_stream_id = kernel_curr->stream_id();

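Note: the pattern above (fetch the Debugger singleton, then do debugger work only when DebuggerBackendEnabled() reports true) is repeated at every call site touched by this change. A minimal, self-contained sketch of that guard, with simplified stand-in types rather than the project's real headers:

    #include <iostream>
    #include <memory>

    // Simplified stand-in for the Debugger singleton used throughout this commit.
    class Debugger {
     public:
      static std::shared_ptr<Debugger> GetInstance() {
        static auto instance = std::shared_ptr<Debugger>(new Debugger());
        return instance;
      }
      // True when the online debugger or debugger-based dump is active.
      bool DebuggerBackendEnabled() const { return debugger_enabled_ || dump_enabled_; }

     private:
      Debugger() = default;
      bool debugger_enabled_ = false;  // would be driven by ENABLE_MS_DEBUGGER
      bool dump_enabled_ = false;      // would be driven by the dump configuration
    };

    int main() {
      auto debugger = Debugger::GetInstance();
      if (debugger->DebuggerBackendEnabled()) {
        // watchpoint lookups, tensor loading and dump bookkeeping happen only here
        std::cout << "debugger backend active" << std::endl;
      } else {
        std::cout << "debugger backend inactive, skip the extra work" << std::endl;
      }
      return 0;
    }
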
@@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
 #ifdef ENABLE_DEBUGGER
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  DebugServices *debug_services = debugger_->debug_services();
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
-  // TensorData will be freed up here
-  tensor_loader->EmptyTensor();
-  uint32_t iter_num = tensor_loader->GetIterNum();
-  tensor_loader->set_iter_num(++iter_num);
-  (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
-  tensor_loader->EmptyPrevTensor();
+  if (debugger_->DebuggerBackendEnabled()) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    DebugServices *debug_services = debugger_->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    // TensorData will be freed up here
+    tensor_loader->EmptyTensor();
+    uint32_t iter_num = tensor_loader->GetIterNum();
+    tensor_loader->set_iter_num(++iter_num);
+    (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
+    tensor_loader->EmptyPrevTensor();
+  }
 #endif
   MS_LOG(INFO) << "Finish!";
 }

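Note: the guarded block keeps the existing per-step order: clear the previous step's tensors, bump the iteration counter, load the new step's data, then drop the stale previous-tensor copies. A rough sketch of that ordering using a hypothetical loader type (not the real TensorLoader API):

    #include <cstdint>
    #include <iostream>

    // Hypothetical, simplified stand-in for the tensor loader used above.
    struct FakeTensorLoader {
      uint32_t iter_num = 0;
      void EmptyTensor() { std::cout << "drop tensors of the previous iteration\n"; }
      void EmptyPrevTensor() { std::cout << "drop stale previous-tensor copies\n"; }
      void set_iter_num(uint32_t n) { iter_num = n; }
      uint32_t GetIterNum() const { return iter_num; }
    };

    void LoadTensorStep(FakeTensorLoader *loader, bool backend_enabled) {
      if (!backend_enabled) return;  // same gate as LoadTensor above
      loader->EmptyTensor();
      uint32_t iter = loader->GetIterNum();
      loader->set_iter_num(++iter);
      // runtime_instance->LoadData(graph, debugger) would run here
      loader->EmptyPrevTensor();
    }

    int main() {
      FakeTensorLoader loader;
      LoadTensorStep(&loader, true);
      std::cout << "iteration is now " << loader.GetIterNum() << std::endl;
      return 0;
    }
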
@@ -26,9 +26,6 @@
 #include "backend/optimizer/common/optimizer.h"
 #include "backend/optimizer/common/pass_manager.h"
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
-#ifdef ENABLE_DEBUGGER
-#include "debug/debugger/debugger.h"
-#endif
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
 #include "frontend/parallel/ps/util.h"
 #endif
@@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     summary_outputs = kernel_graph->summary_nodes();
     runtime_.IncreaseSummaryRefCount(summary_outputs);
   }
-#ifdef ENABLE_DEBUGGER
-  // debugger pre-execution processing
-  if (debugger_) {
-    debugger_->PreExecute(kernel_graph);
-  }
-#endif
+
   bool ret = runtime_.Run(kernel_graph.get(), false);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Run graph failed";
@@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     runtime_.DecreaseSummaryRefCount(summary_outputs);
   }
 
-#ifdef ENABLE_DEBUGGER
-  // debugger post-execution processing
-  if (debugger_) {
-    debugger_->PostExecute();
-  }
-#endif
   MS_LOG(INFO) << "Run graph end";
 }
 
@@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info
 #ifdef ENABLE_DEBUGGER
 void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
 #ifdef ENABLE_DUMP_E2E
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
+  if (debugger_->DebuggerBackendEnabled()) {
+    MS_EXCEPTION_IF_NULL(kernel_graph);
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
+  }
 #endif
 }
 
@@ -80,25 +80,16 @@ void Debugger::EnableDebugger() {
   grpc_client_ = nullptr;
   debug_services_ = nullptr;
 
-  // see if dump is enabled
-  bool dump_enabled = false;
-  if (device_target_ == kGPUDevice) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    dump_enabled = runtime_instance->DumpDataEnabled();
-  }
+  // see if dump using debugger backend is enabled
+  bool dump_enabled = CheckDebuggerDumpEnabled();
+  MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;
 
-  // get env variables to configure debugger
-  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
-  if (env_enable_str != nullptr) {
-    MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str;
-    if (std::strcmp(env_enable_str, "1") == 0) {
-      debugger_enabled_ = true;
-    }
-  }
+  // check if debugger enabled
+  debugger_enabled_ = CheckDebuggerEnabled();
+  MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;
 
   if (!debugger_enabled_ && !dump_enabled) {
-    MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
+    MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
     return;
   }
 
@@ -109,7 +100,7 @@ void Debugger::EnableDebugger() {
     MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
     host = std::string(env_host_str);
   } else {
-    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
+    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
     host = "localhost";
   }
   // configure grpc port
@@ -119,7 +110,7 @@ void Debugger::EnableDebugger() {
     MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
     port = std::string(env_port_str);
   } else {
-    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
+    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
     port = "50051";
   }
 
@@ -140,8 +131,8 @@ void Debugger::EnableDebugger() {
     MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
                        "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
   } else {
-    MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
-                       "usage for large models.";
+    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                    "usage for large models.";
   }
 #ifdef ENABLE_D
   // set operation overflow info
@@ -180,6 +171,29 @@ void Debugger::EnableDebugger() {
   debug_services_ = std::make_unique<DebugServices>();
 }
 
+bool Debugger::CheckDebuggerDumpEnabled() {
+  // see if dump is enabled
+  if (device_target_ == kGPUDevice) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    return runtime_instance->DumpDataEnabled();
+  }
+  return false;
+}
+
+bool Debugger::CheckDebuggerEnabled() {
+  // get env variables to configure debugger
+  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
+  if (env_enable_str != nullptr) {
+    if (std::strcmp(env_enable_str, "1") == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
+
 void Debugger::Reset() {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -201,25 +215,29 @@ void Debugger::Reset() {
 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
-  // check and save graph_ptr, suspend if graph is new
-  CheckGraphPtr(graph_ptr);
+  if (debugger_->DebuggerBackendEnabled()) {
+    // check and save graph_ptr, suspend if graph is new
+    CheckGraphPtr(graph_ptr);
+  }
 }
 
 void Debugger::PostExecute() {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
-  // analyze tensor data and send the watchpoints been hit
-  if (run_level_ == "node") {
-    MS_LOG(INFO) << "Debugger is in node level mode ";
-    return;
-  }
-  if (debugger_enabled_ && !is_dataset_graph_) {
-    if (device_target_ != kGPUDevice) {
-      num_step_++;
-      MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
-      SendWatchpointsAndSuspend(CheckWatchpoints());
-    } else {
-      CommandLoop();
+  if (debugger_->DebuggerBackendEnabled()) {
+    // analyze tensor data and send the watchpoints been hit
+    if (run_level_ == "node") {
+      MS_LOG(INFO) << "Debugger is in node level mode ";
+      return;
+    }
+    if (debugger_enabled_ && !is_dataset_graph_) {
+      if (device_target_ != kGPUDevice) {
+        num_step_++;
+        MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
+        SendWatchpointsAndSuspend(CheckWatchpoints());
+      } else {
+        CommandLoop();
+      }
     }
   }
 }
@@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() {
     auto node_name = AnfAlgo::GetCNodeName(node);
     MS_LOG(INFO) << "node: " << node->fullname_with_scope();
     if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
-      MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
-                      << node_name;
+      MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
+                   << node_name;
       is_dataset_graph_ = true;
       return;
     }

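Note: the configuration logic above is a plain getenv-with-fallback pattern, and a missing variable is now logged at INFO rather than WARNING because falling back to the default is the normal case. A self-contained sketch of the same pattern (the variable names come from the diff; everything else is simplified):

    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <string>

    // Read an environment variable, falling back to a default when it is unset.
    std::string GetEnvOrDefault(const char *name, const std::string &fallback) {
      const char *value = std::getenv(name);
      if (value != nullptr) {
        return std::string(value);
      }
      // Informational only: using the default is the expected case.
      std::cout << name << " not set, using default: " << fallback << "\n";
      return fallback;
    }

    int main() {
      bool enabled = false;
      if (const char *e = std::getenv("ENABLE_MS_DEBUGGER")) {
        enabled = (std::strcmp(e, "1") == 0);
      }
      std::string host = GetEnvOrDefault("MS_DEBUGGER_HOST", "localhost");
      std::string port = GetEnvOrDefault("MS_DEBUGGER_PORT", "50051");
      std::cout << "debugger enabled: " << enabled << ", target: " << host << ":" << port << "\n";
      return 0;
    }
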
@@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();
 
+  // check if any feature that uses the debugger backend is enabled
+  bool DebuggerBackendEnabled();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // read env variable for grpc client
   void EnableDebugger();
 
+  // check if dump using debugger backend is enabled
+  bool CheckDebuggerDumpEnabled();
+
+  // check if debugger enabled
+  bool CheckDebuggerEnabled();
+
   // check and save graph pointer
   void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
 
@@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime {
   ~AscendKernelRuntime() override;
   bool Init() override;
   bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
-  bool LoadData(session::KernelGraph *graph, Debugger *debugger);
+  bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
   bool GenTask(const session::KernelGraph *graph);
   bool LoadTask(const session::KernelGraph *graph);
   bool RunTask(const session::KernelGraph *graph);

@@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() {
 #ifdef ENABLE_DEBUGGER
   auto debugger = mindspore::Debugger::GetInstance();
   MS_EXCEPTION_IF_NULL(debugger);
-  std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
-  // extract stream id, task id and opname from runtime_info_map for overflow detection
-  std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
-                 std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
-                 [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-                   -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
-                   return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
-                 });
+  if (debugger->DebuggerBackendEnabled()) {
+    std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
+    // extract stream id, task id and opname from runtime_info_map for overflow detection
+    std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
+                   std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
+                   [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
+                     -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
+                     return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
+                   });
+  }
 #endif
   MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
 }

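Note: the transform itself is unchanged; it copies stream/task keys and op-name values out of runtime_info_map_ into the debugger's map, and is now simply skipped when the debugger backend is off. A standalone illustration of that std::transform / std::inserter idiom with simplified types (not the real RuntimeInfo):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <iterator>
    #include <map>
    #include <string>
    #include <utility>

    int main() {
      // A simplified two-field stand-in for RuntimeInfo.
      using FakeRuntimeInfo = std::pair<uint32_t, uint32_t>;
      std::map<std::string, FakeRuntimeInfo> runtime_info_map = {
          {"Conv2D-op1", {7, 0}},
          {"MatMul-op2", {9, 1}},
      };

      // Keyed by a (stream, task)-style pair, valued by op name, as in LoadDumpInfo above.
      std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname;
      std::transform(runtime_info_map.begin(), runtime_info_map.end(),
                     std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
                     [](const std::pair<const std::string, FakeRuntimeInfo> &p)
                         -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
                       return {{std::get<1>(p.second), std::get<0>(p.second)}, p.first};
                     });

      for (const auto &entry : stream_task_to_opname) {
        std::cout << "(" << entry.first.first << ", " << entry.first.second << ") -> "
                  << entry.second << "\n";
      }
      return 0;
    }
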
@@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d
   return false;
 }
 
+bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; }
+
 bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
   MS_EXCEPTION_IF_NULL(kernel);
   if (AnfAlgo::OutputAddrExist(kernel, index)) {

@@ -59,6 +59,7 @@ class KernelRuntime {
   bool DumpDataEnabled();
   bool DumpDataEnabledIteration();
   virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
+  virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
   virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
   bool LaunchKernel(const session::KernelGraph *graph);

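Note: LoadData now mirrors DumpData: a virtual on the KernelRuntime base class with a do-nothing default, overridden by the Ascend runtime (see the override added above). A toy sketch of that base-default-plus-override shape, not the real runtime hierarchy:

    #include <iostream>

    struct Debugger;     // opaque in this sketch
    struct KernelGraph;  // opaque in this sketch

    class KernelRuntimeBase {
     public:
      virtual ~KernelRuntimeBase() = default;
      // Default: backends without debugger support simply report "nothing loaded".
      virtual bool LoadData(KernelGraph *graph, Debugger *debugger) { return false; }
    };

    class AscendLikeRuntime : public KernelRuntimeBase {
     public:
      // Explicitly marked override, matching the header change above.
      bool LoadData(KernelGraph *graph, Debugger *debugger) override {
        std::cout << "load kernel outputs into the tensor loader\n";
        return true;
      }
    };

    int main() {
      AscendLikeRuntime runtime;
      KernelRuntimeBase *base = &runtime;
      std::cout << "LoadData returned " << base->LoadData(nullptr, nullptr) << "\n";
      return 0;
    }
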
@@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_ENABLE_TASK_SINK, true);
   set_param<bool>(MS_CTX_IR_FUSION_FLAG, true);
   set_param<bool>(MS_CTX_ENABLE_HCCL, false);
-#ifdef ENABLE_DEBUGGER
-  set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
-#else
   set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true);
-#endif
   set_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY, true);
   set_param<bool>(MS_CTX_PRECOMPILE_ONLY, false);
   set_param<bool>(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);

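Note: with the debugger compiled in by default, the memory-reuse default no longer depends on a compile-time flag; it is always true, and reduced (partial) reuse for debugging is requested at run time via MS_DEBUGGER_PARTIAL_MEM (per the log message in debugger.cc above). A rough before/after sketch with hypothetical names:

    #include <cstdlib>
    #include <cstring>

    // Hypothetical context object; the real MsContext uses set_param<bool>(...).
    struct FakeContext {
      bool enable_mem_reuse = false;
    };

    void InitDefaults(FakeContext *ctx) {
      // Before this change the default depended on a compile-time flag:
      //   #ifdef ENABLE_DEBUGGER -> false, otherwise -> true.
      // After the change it is unconditionally true:
      ctx->enable_mem_reuse = true;

      // Reduced (partial) reuse for debugging is now a runtime decision.
      const char *partial = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
      if (partial != nullptr && std::strcmp(partial, "1") == 0) {
        // a debugger session would switch to partial memory reuse here
      }
    }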