Add debugger to new unified GPU runtime

This commit is contained in:
Parastoo Ashtari 2021-06-14 12:31:17 -04:00
parent dca5504fd4
commit 6ed17d52b1
11 changed files with 359 additions and 21 deletions

View File

@ -34,6 +34,15 @@
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "debug/env_config_parser.h"
#include "utils/comm_manager.h"
#include "runtime/framework/actor/actor_common.h"
#include "runtime/hardware/device_context_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif
using debugger::Chunk;
using debugger::EventReply;
@ -228,6 +237,9 @@ bool Debugger::CheckDebuggerDumpEnabled() const {
// see if dump is enabled
if (device_target_ == kGPUDevice) {
return device::KernelRuntime::DumpDataEnabled();
} else if (IsMindRTUsed()) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
return dump_json_parser.e2e_dump_enabled();
}
return false;
}
@ -289,8 +301,23 @@ void Debugger::Reset() {
graph_ptr_list_.clear();
}
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
// Only GPU is supported for MindRTBackend
if (device_target_ != kGPUDevice) {
return;
}
uint32_t graph_sum = graphs.size();
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
const auto &graph = graphs[graph_index];
if (debugger_) {
debugger_->PreExecute(graph, graph_sum);
}
DumpSetup(graph);
}
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
CheckDatasetSinkMode();
auto graph_id = graph_ptr->graph_id();
@ -313,7 +340,6 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
if (!debugger_enabled_) {
EnableDebugger();
}
if (debugger_enabled_) {
if (graph_proto_list_.size()) {
// only send compiled graphs once.
@ -323,7 +349,9 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
LoadParametersAndConst();
// revert graph ptr to original value
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
} else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
// stop only when receive the first sub run graph for each step
@ -351,6 +379,89 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
// resets for the new graph
suspended_at_last_kernel_ = 0;
}
// Returns true only when e2e dump is enabled in the dump json configuration and the parser's current dump
// iteration is one that IsDumpIter() accepts; returns false otherwise.
bool Debugger::DumpDataEnabledIteration() const {
  auto &parser = DumpJsonParser::GetInstance();
  // Short-circuiting keeps the iteration lookup from running while e2e dump is disabled.
  return parser.e2e_dump_enabled() && parser.IsDumpIter(parser.cur_dump_iter());
}
// Dumps the data of `kernel_graph` through E2eDump when the debugger backend is enabled; otherwise only
// advances the dump iteration counter of the dump json parser.
//
// The rank id passed to E2eDump is read from the device context that matches the device target and device id
// stored in MsContext.
//
// Raises an exception (via MS_EXCEPTION_IF_NULL) when MsContext or the device context is unavailable, or when
// `kernel_graph` is null while the debugger backend is enabled.
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  // Validate the context before asking it for the rank id.
  MS_EXCEPTION_IF_NULL(device_context);
  uint32_t rank_id = device_context->GetRankID();
  // Without a debugger instance there is nothing to dump through, so fall back to the iteration update.
  if (debugger_ && debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get());
  } else {
    DumpJsonParser::GetInstance().UpdateDumpIter();
  }
}
// Prepares e2e dump for `kernel_graph` by forwarding the graph and the current rank id to E2eDump::DumpSetup.
//
// The rank id is read from the device context that matches the device target and device id stored in
// MsContext.
//
// Raises an exception (via MS_EXCEPTION_IF_NULL) when MsContext, the device context or `kernel_graph` is null.
void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  // Validate the context before asking it for the rank id.
  MS_EXCEPTION_IF_NULL(device_context);
  uint32_t rank_id = device_context->GetRankID();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  E2eDump::DumpSetup(kernel_graph.get(), rank_id);
  MS_LOG(INFO) << "Finish!";
}
// Writes the graph artifacts (proto with source info, IR text and execution order csv) for `kernel_graph`
// when e2e dump is enabled. Called for the new GPU runtime using MindRTBackend.
//
// Output locations, relative to the dump path configured in the dump json:
//   <path>/rank_<rank_id>/graphs                                    - proto and IR files
//   <path>/rank_<rank_id>/ms_execution_order_graph_<graph_id>.csv   - execution order
//
// Side effect: the graph's root graph id is set to its own graph id.
// Does nothing when e2e dump is disabled. Raises an exception (via MS_EXCEPTION_IF_NULL) when dump is enabled
// and `kernel_graph`, MsContext or the device context is null.
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  auto &json_parser = DumpJsonParser::GetInstance();
  if (!json_parser.e2e_dump_enabled()) {
    return;
  }
  // The graph is dereferenced repeatedly below; reject a null graph up front.
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  MS_EXCEPTION_IF_NULL(device_context);
  uint32_t rank_id = device_context->GetRankID();

  kernel_graph->set_root_graph_id(kernel_graph->graph_id());
  const std::string graph_id_str = std::to_string(kernel_graph->graph_id());
  std::string final_graph = "trace_code_graph_" + graph_id_str;
  std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
  std::string target_dir = root_dir + "/graphs";
  std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
  DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
  DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
  DumpGraphExeOrder("ms_execution_order_graph_" + graph_id_str + ".csv", root_dir,
                    kernel_graph->execution_order());
}
void Debugger::PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
// Only GPU is supported for MindRTBackend
if (device_target_ != kGPUDevice) {
return;
}
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
const auto &graph = graphs[graph_index];
bool dump_enabled = debugger_->DumpDataEnabledIteration();
// debug used for dump
if (debugger_ && dump_enabled) {
debugger_->Dump(graph);
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();
}
if (debugger_) {
debugger_->PostExecute();
}
}
}
void Debugger::PostExecute() {
// access lock for public method
@ -365,6 +476,7 @@ void Debugger::PostExecute() {
num_step_++;
}
SendWatchpoints(CheckWatchpoints());
// no need to suspend at each graph for GPU, suspension happens in preExecute
if (device_target_ != kGPUDevice) {
CommandLoop();
@ -388,7 +500,6 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
}
return false;
}
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@ -405,6 +516,7 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
if (!hits.empty()) {
SendWatchpoints(hits);
CommandLoop();
hit_empty_flag = false;
}
}
@ -507,7 +619,6 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
return model.graph();
}
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
if (SendMetadata(true)) {
// send graph to Mindinsight server
@ -533,7 +644,9 @@ bool Debugger::SendMetadata(bool version_check) {
MS_LOG(INFO) << "Is training done?" << training_done_;
// set graph number to not_dataset_graph_sum_
metadata.set_graph_num(not_dataset_graph_sum_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
bool ret = false;
if (reply_metadata.status() == reply_metadata.OK) {
if (version_check) {
@ -575,6 +688,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
auto graph_size = graph.ByteSize();
if (graph_size > g_chunk_size) {
auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
chunk.set_buffer(sub_graph_str[i]);
chunked_graph_proto_list.push_back(chunk);
@ -834,7 +948,6 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
}
return tensor_list;
}
void Debugger::Exit() {
// clear resource before exit
// debugger will notify main thread to exit because main thread can only exit at step boundary
@ -1171,6 +1284,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
return;
}
// When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
if (IsMindRTUsed() && (device_target_ == kGPUDevice)) {
if (!anf_node->isa<ValueNode>() &&
!(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
return;
}
}
// for parameters and value nodes, set its execution order to be 0;
int exec_order = 0;
std::string node_name = anf_node->fullname_with_scope();
@ -1268,6 +1388,14 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
++num_step_;
}
}
// Advances the step counter by one on the GPU target when either the debugger is enabled or the current
// iteration is selected for dump. Used together with DebugActor::DebugOnStepEnd.
void Debugger::UpdateStepNumGPU() {
  if (device_target_ != kGPUDevice) {
    return;
  }
  // The dump-iteration query only runs when the debugger itself is not enabled.
  if (!debugger_enabled_ && !DumpDataEnabledIteration()) {
    return;
  }
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  ++num_step_;
}
void Debugger::ClearCurrentData() {
if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))

View File

@ -73,6 +73,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// reset debugger
void Reset();
void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
// enable debugger
// send graph and wait for command
// do nothing if graph is set already
@ -82,6 +83,16 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute();
bool DumpDataEnabledIteration() const;
void Dump(const KernelGraphPtr &kernel_graph) const;
void DumpSetup(const KernelGraphPtr &kernel_graph) const;
void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);
void PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
bool ReadNodeDataRequired(const CNodePtr &kernel) const;
void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
@ -132,6 +143,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void UpdateStepNum(const session::KernelGraph *graph);
void UpdateStepNumGPU();
void ClearCurrentData();
void LoadGraphOutputs();
@ -194,7 +207,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void ProcessKSetCMD(const EventReply &reply);
// Process the KViewCMD
void ProcessKViewCMD(const EventReply &reply);
// set what nodes and conditions to watch
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
const ProtoVector<WatchCondition_Parameter> &parameters);
@ -228,6 +240,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index);
// class members
std::unique_ptr<GrpcClient> grpc_client_;
std::unique_ptr<DebugServices> debug_services_;
KernelGraphPtr graph_ptr_;
@ -249,6 +262,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::map<uint32_t, std::string> overflow_bin_path_;
// flag to keep track of the very first suspension of debugger
bool initial_suspend_;
std::list<GraphProto> graph_proto_list_;
std::list<KernelGraphPtr> graph_ptr_list_;
@ -261,9 +275,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
};
using DebuggerPtr = std::shared_ptr<Debugger>;
// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
// for getting proto DataType from Type of Tensor
@ -282,7 +296,6 @@ int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);
// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);

View File

@ -167,7 +167,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
}
// Asynchronously dispatches DebugActor::Debug to the debug actor for this actor's data kernel, passing this
// actor's AID along with the op context.
// NOTE(review): two Async dispatches with different argument lists are visible here; the first one (without
// &launch_info_) looks like the pre-change call — confirm that only one of them is meant to remain.
void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *context) {
Async(*debug_aid_, &DebugActor::Debug, data_kernel_, device_context_, context, &GetAID());
Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
}
void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

View File

@ -15,20 +15,134 @@
*/
#include "runtime/framework/actor/debug_actor.h"
#include <vector>
#include <memory>
#include <string>
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_GPU
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"
using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
#endif
namespace mindspore {
namespace runtime {
void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
const AID *from_aid) {
#ifdef ENABLE_GPU
static const size_t PARAMETER_OUTPUT_INDEX = 0;
// Returns the output indices that should be loaded for a node.
//   node_name   - operator name of the node.
//   output_size - number of outputs of the node.
// "BatchNorm" (P.BatchNorm, used for training and inference) is restricted to outputs 0, 3 and 4; every other
// operator gets all indices in [0, output_size). More operators can be added to the filter here.
std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "loading node named " << node_name;
    return std::vector<int>{0, 3, 4};
  }
  // Default: TensorLoader loads every output.
  std::vector<int> all_outputs;
  size_t index = 0;
  while (index < output_size) {
    all_outputs.push_back(static_cast<int>(index));
    ++index;
  }
  return all_outputs;
}
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
// get inputs
auto kernel_inputs = launch_info_->inputs_;
auto input_size = AnfAlgo::GetInputTensorNum(cnode);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = cnode->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
auto addr = kernel_inputs[j];
auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
string input_tensor_name = input_kernel_name + ':' + "0";
ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
}
}
}
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
// get outputs
auto kernel_outputs = launch_info_->outputs_;
auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
auto node_name = AnfAlgo::GetCNodeName(cnode);
std::string kernel_name = cnode->fullname_with_scope();
std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);
for (int j : real_outputs) {
auto addr = kernel_outputs[j];
auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
}
#endif
// Per-node debug hook.
//   node           - node that was just launched.
//   launch_info_   - launch info of the node; its addresses are read when tensors are loaded.
//   device_context - device context of the node.
//   op_context     - op context handed back to the calling actor.
//   from_aid       - AID of the calling actor, which receives OnDebugFinish once this hook is done.
// With ENABLE_GPU, for a CNode with a debugger instance present: records the node as the current node,
// decides whether its tensors must be read (dump configuration first, debugger watch requirements second),
// loads inputs/outputs to the host and runs the debugger's per-node post-execution step. The execution order
// counter advances for every CNode, whether or not a debugger instance exists.
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
                       const DeviceContext *device_context, OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);

#ifdef ENABLE_GPU
  if (node->isa<CNode>()) {
    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);

      auto &parser = DumpJsonParser::GetInstance();
      bool need_load = false;
      if (debugger->DumpDataEnabledIteration()) {
        // dump_mode 0 selects all kernels; dump_mode 1 selects only the kernels in the configured list.
        const auto mode = parser.dump_mode();
        need_load = (mode == 0) || ((mode == 1) && parser.NeedDump(kernel_name));
      } else if (debugger->debugger_enabled()) {
        need_load = debugger->ReadNodeDataRequired(cnode);
      }

      if (need_load) {
        // An enabled debugger needs both directions; dump-only runs follow the dump json settings.
        if (debugger->debugger_enabled() || parser.InputNeedDump()) {
          LoadInputs(cnode, launch_info_, exec_order_);
        }
        if (debugger->debugger_enabled() || parser.OutputNeedDump()) {
          LoadOutputs(cnode, launch_info_, exec_order_);
        }
        // The last-kernel flag is the negation of IsInplaceNode(cnode, "skip").
        const bool is_last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
        debugger->PostExecuteNode(cnode, is_last_kernel);
      }
    }
    ++exec_order_;
  }
#endif

  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
@ -36,8 +150,16 @@ void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_conte
// Step-end debug hook.
//   op_context - op context handed back to the calling actor.
//   from_aid   - AID of the calling actor, which receives OnDebugFinish once this hook is done.
// With ENABLE_GPU and a debugger instance present: updates the debugger step number, loads parameters and
// constants, and restarts the execution order counter for the next step.
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);

#ifdef ENABLE_GPU
  const auto debugger = Debugger::GetInstance();
  if (debugger != nullptr) {
    debugger->Debugger::UpdateStepNumGPU();
    debugger->Debugger::LoadParametersAndConst();
    // The next step starts counting kernels from zero again.
    exec_order_ = 0;
  }
#endif

  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

View File

@ -24,6 +24,7 @@
namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::kernel::KernelLaunchInfo;
// The debug actor is used to debug and dump kernel info, it gets the kernel real time execution info in the device, so
// it is synchronous and blocked.
@ -33,12 +34,17 @@ class DebugActor : public ActorBase {
~DebugActor() override = default;
// The debug of each node.
void Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
const AID *from_aid);
void Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
OpContext<DeviceTensor> *op_context, const AID *from_aid);
// The debug on step end.
void DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid);
private:
// class members
uint32_t exec_order_ = 0;
};
} // namespace runtime
} // namespace mindspore

View File

@ -169,7 +169,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {
}
// Asynchronously dispatches DebugActor::Debug to the debug actor for this actor's kernel, passing this
// actor's AID along with the op context.
// NOTE(review): two Async dispatches with different argument lists are visible here; the first one (without
// &launch_info_) looks like the pre-change call — confirm that only one of them is meant to remain.
void KernelActor::SendDebugReq(OpContext<DeviceTensor> *context) {
Async(*debug_aid_, &DebugActor::Debug, kernel_, device_context_, context, &GetAID());
Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
}
void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

View File

@ -24,6 +24,10 @@
#include "ir/tensor.h"
#include "backend/optimizer/common/helper.h"
#include "base/base_ref_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"
namespace mindspore {
namespace runtime {
@ -278,6 +282,9 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(device_context);
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
// Execute optimization pass.
auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
device_context->OptimizeGraph(graph);
@ -297,13 +304,20 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
}
graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
debugger->DumpInGraphCompiler(graph);
#endif
MS_EXCEPTION_IF_NULL(session_);
session_->InitAllBucket(graph, device_context);
session_->SetSummaryNodes(graph.get());
SetSummaryNodesRefCount(graph.get());
#ifdef ENABLE_DEBUGGER
if (debugger && debugger->DebuggerBackendEnabled()) {
debugger->LoadGraphs(graph);
}
#endif
return graph->graph_id();
}

View File

@ -31,7 +31,9 @@
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {
@ -371,6 +373,18 @@ void GraphScheduler::Initialize() {
(void)actorMgr->Spawn(base_recorder_actor, true);
}
#endif
// Create and schedule debug actor.
#ifdef ENABLE_DEBUGGER
auto debugger = mindspore::Debugger::GetInstance();
if (debugger->DebuggerBackendEnabled()) {
auto debug_actor = std::make_shared<DebugActor>();
MS_EXCEPTION_IF_NULL(debug_actor);
debug_aid_ = &(debug_actor->GetAID());
auto base_debug_actor = static_cast<ActorReference>(debug_actor);
base_debug_actor->set_thread_pool(thread_pool_);
(void)actorMgr->Spawn(base_debug_actor, true);
}
#endif
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info, GraphExecutionStrategy strategy) {

View File

@ -37,6 +37,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/running_data_recorder.h"
#include "utils/comm_manager.h"
#include "debug/debugger/debugger.h"
namespace mindspore {
namespace device {
@ -91,6 +92,12 @@ bool GPUDeviceContext::Initialize() {
(*init_nccl_comm_funcptr)();
}
auto rank_id = GetRankID();
auto &json_parser = DumpJsonParser::GetInstance();
// Dump json config file if dump is enabled
json_parser.CopyJsonToDir(rank_id);
json_parser.CopyMSCfgJsonToDir(rank_id);
initialized_ = true;
return ret;
}
@ -125,6 +132,12 @@ bool GPUDeviceContext::InitDevice() {
void GPUDeviceContext::Destroy() {
// Release GPU buffer manager resource
auto debugger = Debugger::GetInstance();
if (debugger && debugger->debugger_enabled()) {
debugger->SetTrainingDone(true);
debugger->SendMetadata(false);
}
if (GpuBufferMgr::GetInstance().IsInit()) {
if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
MS_LOG(EXCEPTION) << "Could not close gpu data queue.";

View File

@ -36,7 +36,9 @@
#ifdef ENABLE_GE
#include "utils/callbacks_ge.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@ -577,10 +579,24 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
MS_EXCEPTION_IF_NULL(actor_set);
runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
// PreExecuteGraph
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger) {
debugger->Debugger::PreExecuteGraphDebugger(graph_compiler_info.graphs_);
}
#endif
if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
}
// PostExecuteGraph
#ifdef ENABLE_DEBUGGER
if (debugger) {
debugger->Debugger::PostExecuteGraphDebugger(graph_compiler_info.graphs_);
}
#endif
// Sync device stream.
const auto &first_device_context = graph_compiler_info.device_contexts_[0];
MS_EXCEPTION_IF_NULL(first_device_context);
@ -644,6 +660,15 @@ void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node,
}
}
#ifdef ENABLE_DEBUGGER
// Initializes the debugger instance with this backend's device id and the device target stored in MsContext.
// Raises an exception (via MS_EXCEPTION_IF_NULL) when the debugger instance or MsContext is unavailable.
void MindRTBackend::SetDebugger() {
  // Plain local name: a trailing underscore would read as a class member.
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  debugger->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
MS_EXCEPTION_IF_NULL(root_graph);
MS_EXCEPTION_IF_NULL(graph_compiler_);

View File

@ -118,6 +118,9 @@ class MindRTBackend : public Backend {
// Run Graph in the pyNative mode.
void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
void SetDebugger() override;
#endif
private:
// The parameter func_graph is a graph, it can be either a root graph or a sub graph,