!18448 Add debugger to new GPU runtime
Merge pull request !18448 from parastooashtari/new_unified_gpu
This commit is contained in:
commit d4aca69981
@@ -34,6 +34,15 @@
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "debug/env_config_parser.h"
#include "utils/comm_manager.h"
#include "runtime/framework/actor/actor_common.h"
#include "runtime/hardware/device_context_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif

using debugger::Chunk;
using debugger::EventReply;

@@ -228,6 +237,9 @@ bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  if (device_target_ == kGPUDevice) {
    return device::KernelRuntime::DumpDataEnabled();
  } else if (IsMindRTUsed()) {
    auto &dump_json_parser = DumpJsonParser::GetInstance();
    return dump_json_parser.e2e_dump_enabled();
  }
  return false;
}

@@ -289,8 +301,23 @@ void Debugger::Reset() {
  graph_ptr_list_.clear();
}

void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  uint32_t graph_sum = graphs.size();
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    if (debugger_) {
      debugger_->PreExecute(graph, graph_sum);
    }
    DumpSetup(graph);
  }
}

void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  CheckDatasetSinkMode();
  auto graph_id = graph_ptr->graph_id();

@@ -313,7 +340,6 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  if (!debugger_enabled_) {
    EnableDebugger();
  }

  if (debugger_enabled_) {
    if (graph_proto_list_.size()) {
      // only send compiled graphs once.

@@ -323,7 +349,9 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
      LoadParametersAndConst();
      // revert graph ptr to original value
      graph_ptr_ = dbg_graph_ptr;

      SendMultiGraphsAndSuspend(graph_proto_list_);

      graph_proto_list_.clear();
    } else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
      // stop only when receive the first sub run graph for each step

@@ -351,6 +379,89 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // resets for the new graph
  suspended_at_last_kernel_ = 0;
}

bool Debugger::DumpDataEnabledIteration() const {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (!dump_json_parser.e2e_dump_enabled()) {
    return false;
  }

  auto cur_iter = dump_json_parser.cur_dump_iter();
  if (dump_json_parser.IsDumpIter(cur_iter)) {
    return true;
  }
  return false;
}

void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  if (debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get());
  } else {
    DumpJsonParser::GetInstance().UpdateDumpIter();
  }
}

void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  E2eDump::DumpSetup(kernel_graph.get(), rank_id);
  MS_LOG(INFO) << "Finish!";
}

void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  // This function will be called for new GPU runtime using MindRTBackend
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    auto ms_context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(ms_context);
    std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    const auto &device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
    uint32_t rank_id = device_context->GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}

void Debugger::PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    bool dump_enabled = debugger_->DumpDataEnabledIteration();
    // debug used for dump
    if (debugger_ && dump_enabled) {
      debugger_->Dump(graph);
    } else {
      DumpJsonParser::GetInstance().UpdateDumpIter();
    }
    if (debugger_) {
      debugger_->PostExecute();
    }
  }
}

void Debugger::PostExecute() {
  // access lock for public method

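For orientation, the two per-step hooks added above are meant to be driven from the MindRT backend. The sketch below simply mirrors the MindRTBackend::RunGraph changes that appear later in this diff (names are taken from that hunk; error handling and the ENABLE_DEBUGGER guard are omitted), so it is a reading aid rather than additional code from the patch.

  // Sketch only: per-step debugger wiring on the new GPU runtime (see the backend.cc hunk below).
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->PreExecuteGraphDebugger(graph_compiler_info.graphs_);   // per graph: PreExecute + DumpSetup
  }
  runtime::GraphScheduler::GetInstance().Run(actor_set);              // kernels run; DebugActor::Debug fires per node
  if (debugger) {
    debugger->PostExecuteGraphDebugger(graph_compiler_info.graphs_);  // per graph: Dump or UpdateDumpIter, then PostExecute
  }
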
@@ -365,6 +476,7 @@ void Debugger::PostExecute() {
    num_step_++;
  }
  SendWatchpoints(CheckWatchpoints());

  // no need to suspend at each graph for GPU, suspension happens in preExecute
  if (device_target_ != kGPUDevice) {
    CommandLoop();

@@ -388,7 +500,6 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  }
  return false;
}

void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);

@@ -405,6 +516,7 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
    if (!hits.empty()) {
      SendWatchpoints(hits);
      CommandLoop();

      hit_empty_flag = false;
    }
  }

@@ -507,7 +619,6 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}

void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (SendMetadata(true)) {
    // send graph to Mindinsight server

@@ -533,7 +644,9 @@ bool Debugger::SendMetadata(bool version_check) {
  MS_LOG(INFO) << "Is training done?" << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);

  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);

  bool ret = false;
  if (reply_metadata.status() == reply_metadata.OK) {
    if (version_check) {

@@ -575,6 +688,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);

      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        chunked_graph_proto_list.push_back(chunk);

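As a rough worked example of the chunking above (the concrete g_chunk_size constant is defined elsewhere in debugger.cc and is not shown in this diff, so the 3 MB figure is only illustrative): if g_chunk_size were 3 MB, a 10 MB serialized graph proto would be cut by grpc_client_->ChunkString into four pieces, and each piece would be sent to MindInsight as its own Chunk message, with the last chunk only partially filled.
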
@@ -834,7 +948,6 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
  }
  return tensor_list;
}

void Debugger::Exit() {
  // clear resource before exit
  // debugger will notify main thread to exit because main thread can only exit at step boundary

@@ -1171,6 +1284,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  if (IsMindRTUsed() && (device_target_ == kGPUDevice)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // for parameters and value nodes, set its execution order to be 0;
  int exec_order = 0;
  std::string node_name = anf_node->fullname_with_scope();

@@ -1268,6 +1388,14 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
    ++num_step_;
  }
}

void Debugger::UpdateStepNumGPU() {
  // UpdateStepNum with DebugActor::DebugOnStepEnd
  if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}

void Debugger::ClearCurrentData() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))

@@ -73,6 +73,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // reset debugger
  void Reset();

  void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  // enable debugger
  // send graph and wait for command
  // do nothing if graph is set already

@@ -82,6 +83,16 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // don't need a graph_ptr because it is saved during pre_execute
  void PostExecute();

  bool DumpDataEnabledIteration() const;

  void Dump(const KernelGraphPtr &kernel_graph) const;

  void DumpSetup(const KernelGraphPtr &kernel_graph) const;

  void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);

  void PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);

  bool ReadNodeDataRequired(const CNodePtr &kernel) const;

  void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);

@@ -132,6 +143,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void UpdateStepNum(const session::KernelGraph *graph);

  void UpdateStepNumGPU();

  void ClearCurrentData();

  void LoadGraphOutputs();

@@ -194,7 +207,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void ProcessKSetCMD(const EventReply &reply);
  // Process the KViewCMD
  void ProcessKViewCMD(const EventReply &reply);

  // set what nodes and conditions to watch
  void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                     const ProtoVector<WatchCondition_Parameter> &parameters);

@@ -228,6 +240,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index);

  // class members

  std::unique_ptr<GrpcClient> grpc_client_;
  std::unique_ptr<DebugServices> debug_services_;
  KernelGraphPtr graph_ptr_;

@@ -249,6 +262,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::map<uint32_t, std::string> overflow_bin_path_;
  // flag to keep track of the very first suspension of debugger
  bool initial_suspend_;

  std::list<GraphProto> graph_proto_list_;
  std::list<KernelGraphPtr> graph_ptr_list_;

@@ -261,9 +275,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
};

using DebuggerPtr = std::shared_ptr<Debugger>;

// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);

ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);

// for getting proto DataType from Type of Tensor

@@ -282,7 +296,6 @@ int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);

// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);

@@ -168,7 +168,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
}

void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
}

void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

@@ -15,20 +15,134 @@
 */

#include "runtime/framework/actor/debug_actor.h"
#include <vector>
#include <memory>
#include <string>
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_GPU
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"

using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
#endif

namespace mindspore {
namespace runtime {
void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
                       const AID *from_aid) {

#ifdef ENABLE_GPU
static const size_t PARAMETER_OUTPUT_INDEX = 0;

std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  // define a vector containing real output number
  std::vector<int> real_outputs;
  // P.BatchNorm is used for training and inference
  // can add the filter list for more operators here....
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "loading node named " << node_name;
    real_outputs.insert(real_outputs.end(), {0, 3, 4});
  } else {
    // by default, TensorLoader will load all outputs
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}

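A quick illustration of what CheckRealOutput returns; the values follow directly from the branch above, and the operator names are just examples:

  std::vector<int> bn_outputs = CheckRealOutput("BatchNorm", 5);   // {0, 3, 4}: skip the training-only outputs
  std::vector<int> all_outputs = CheckRealOutput("Conv2D", 2);     // {0, 1}: the default path loads every output
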
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get inputs
  auto kernel_inputs = launch_info_->inputs_;
  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get outputs
  auto kernel_outputs = launch_info_->outputs_;
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = cnode->fullname_with_scope();
  std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);

  for (int j : real_outputs) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}
#endif

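Note the tensor naming convention produced by LoadInputs/LoadOutputs above: an input is stored under the full name of its producing node with the suffix ":0", and each output is stored under the kernel's full name plus the output index, which is the same name later used to look the tensor up (see GetTensorFullName in debugger.h). For a hypothetical kernel Default/Conv2D-op1 with two outputs this yields, illustratively:

  // names loaded for one kernel launch (examples, not from this patch)
  //   outputs: "Default/Conv2D-op1:0", "Default/Conv2D-op1:1"
  //   inputs:  "<input node fullname_with_scope>:0" for each real input
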
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
                       const DeviceContext *device_context, OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  if (node->isa<CNode>()) {
    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);
      bool read_data = false;
      auto &dump_json_parser = DumpJsonParser::GetInstance();
      bool dump_enabled = debugger->DumpDataEnabledIteration();
      if (dump_enabled) {
        auto dump_mode = dump_json_parser.dump_mode();
        // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
        if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
          read_data = true;
        }
      } else if (debugger->debugger_enabled()) {
        read_data = debugger->ReadNodeDataRequired(cnode);
      }
      if (read_data) {
        if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
          LoadInputs(cnode, launch_info_, exec_order_);
        }
        if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
          LoadOutputs(cnode, launch_info_, exec_order_);
        }
        // check if the node is last kernel
        bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
        debugger->PostExecuteNode(cnode, last_kernel);
      }
    }
    exec_order_ += 1;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

@@ -36,8 +150,16 @@ void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_conte
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::UpdateStepNumGPU();
    debugger->Debugger::LoadParametersAndConst();
    // Reset exec_order for the next step
    exec_order_ = 0;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

@@ -24,6 +24,7 @@
namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::kernel::KernelLaunchInfo;

// The debug actor is used to debug and dump kernel info, it gets the kernel real time execution info in the device, so
// it is synchronous and blocked.

@@ -33,12 +34,17 @@ class DebugActor : public ActorBase {
  ~DebugActor() override = default;

  // The debug of each node.
  void Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
             const AID *from_aid);
  void Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
             OpContext<DeviceTensor> *op_context, const AID *from_aid);

  // The debug on step end.
  void DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid);

 private:
  // class members
  uint32_t exec_order_ = 0;
};

}  // namespace runtime
}  // namespace mindspore

@@ -170,7 +170,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {
}

void KernelActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
}

void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {

@@ -24,6 +24,10 @@
#include "ir/tensor.h"
#include "backend/optimizer/common/helper.h"
#include "base/base_ref_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"

namespace mindspore {
namespace runtime {

@@ -278,6 +282,9 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);

  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();

  // Execute optimization pass.
  auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
  device_context->OptimizeGraph(graph);

@@ -297,13 +304,20 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  }

  graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));

#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  debugger->DumpInGraphCompiler(graph);
#endif
  MS_EXCEPTION_IF_NULL(session_);
  session_->InitAllBucket(graph, device_context);

  session_->SetSummaryNodes(graph.get());
  SetSummaryNodesRefCount(graph.get());

#ifdef ENABLE_DEBUGGER
  if (debugger && debugger->DebuggerBackendEnabled()) {
    debugger->LoadGraphs(graph);
  }
#endif
  return graph->graph_id();
}

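For reference, the DumpInGraphCompiler call added above writes the graph-level artifacts under the dump path configured in the dump JSON. With a hypothetical dump path of /tmp/dump, rank 0 and graph id 0, the path construction shown earlier in this diff produces:

  /tmp/dump/rank_0/graphs/ms_output_trace_code_graph_0.ir      (readable IR with source info, from DumpIR)
  /tmp/dump/rank_0/ms_execution_order_graph_0.csv              (kernel execution order, from DumpGraphExeOrder)

plus a proto export of the same graph written by DumpIRProtoWithSrcInfo into the graphs/ directory (its exact file name is determined by that helper and is not visible in this diff).
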
@@ -31,7 +31,9 @@
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif

#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {

@@ -371,6 +373,18 @@ void GraphScheduler::Initialize() {
    (void)actorMgr->Spawn(base_recorder_actor, true);
  }
#endif
  // Create and schedule debug actor.
#ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  if (debugger->DebuggerBackendEnabled()) {
    auto debug_actor = std::make_shared<DebugActor>();
    MS_EXCEPTION_IF_NULL(debug_actor);
    debug_aid_ = &(debug_actor->GetAID());
    auto base_debug_actor = static_cast<ActorReference>(debug_actor);
    base_debug_actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(base_debug_actor, true);
  }
#endif
}

ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info, GraphExecutionStrategy strategy) {

@@ -37,6 +37,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/running_data_recorder.h"
#include "utils/comm_manager.h"
#include "debug/debugger/debugger.h"

namespace mindspore {
namespace device {

@@ -91,6 +92,12 @@ bool GPUDeviceContext::Initialize() {
    (*init_nccl_comm_funcptr)();
  }

  auto rank_id = GetRankID();
  auto &json_parser = DumpJsonParser::GetInstance();
  // Dump json config file if dump is enabled
  json_parser.CopyJsonToDir(rank_id);
  json_parser.CopyMSCfgJsonToDir(rank_id);

  initialized_ = true;
  return ret;
}

@@ -125,6 +132,12 @@ bool GPUDeviceContext::InitDevice() {

void GPUDeviceContext::Destroy() {
  // Release GPU buffer manager resource
  auto debugger = Debugger::GetInstance();
  if (debugger && debugger->debugger_enabled()) {
    debugger->SetTrainingDone(true);
    debugger->SendMetadata(false);
  }

  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
      MS_LOG(EXCEPTION) << "Could not close gpu data queue.";

@@ -36,7 +36,9 @@
#ifdef ENABLE_GE
#include "utils/callbacks_ge.h"
#endif

#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }

@@ -577,10 +579,24 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
  const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
  MS_EXCEPTION_IF_NULL(actor_set);
  runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);

  // PreExecuteGraph
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::PreExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
    MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
  }

  // PostExecuteGraph
#ifdef ENABLE_DEBUGGER
  if (debugger) {
    debugger->Debugger::PostExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  // Sync device stream.
  const auto &first_device_context = graph_compiler_info.device_contexts_[0];
  MS_EXCEPTION_IF_NULL(first_device_context);

@@ -658,6 +674,15 @@ void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node,
  }
}

#ifdef ENABLE_DEBUGGER
void MindRTBackend::SetDebugger() {
  auto debugger_ = Debugger::GetInstance();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  debugger_->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif

std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
  MS_EXCEPTION_IF_NULL(root_graph);
  MS_EXCEPTION_IF_NULL(graph_compiler_);

@@ -118,6 +118,9 @@ class MindRTBackend : public Backend {
  // Run Graph in the pyNative mode.
  void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
                const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
  void SetDebugger() override;
#endif

 private:
  // The parameter func_graph is a graph, it can be either a root graph or a sub graph,