actor runtime support CPU dump

limingqi107 2021-06-30 18:09:54 +08:00
parent 8008103050
commit e761655a42
11 changed files with 99 additions and 26 deletions

View File

@ -134,6 +134,9 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
}
auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
MS_EXCEPTION_IF_NULL(addr);
if (addr->GetPtr() == nullptr) {
return;
}
ShapeVector int_shapes;
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
@ -164,4 +167,11 @@ void CPUE2eDump::DumpParametersAndConst(const session::KernelGraph *graph, uint3
DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, &const_map);
}
}
void CPUE2eDump::DumpParametersAndConst() {
auto &graphs = DumpJsonParser::GetInstance().graphs();
for (auto graph : graphs) {
DumpParametersAndConst(graph, graph->graph_id());
}
}
} // namespace mindspore

View File

@ -31,6 +31,8 @@ class CPUE2eDump {
// Dump data when task error.
static void DumpParametersAndConst(const session::KernelGraph *graph, uint32_t graph_id);
static void DumpParametersAndConst();
static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);
private:

View File

@ -21,6 +21,7 @@
#include <map>
#include <set>
#include <mutex>
#include <vector>
#include "nlohmann/json.hpp"
#include "utils/ms_utils.h"
#include "backend/session/kernel_graph.h"
@ -62,6 +63,10 @@ class DumpJsonParser {
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
void ClearGraph() { graphs_.clear(); }
void SaveGraph(session::KernelGraph *graph) { graphs_.emplace_back(graph); }
std::vector<session::KernelGraph *> &graphs() { return graphs_; }
private:
DumpJsonParser() = default;
~DumpJsonParser() = default;
@ -82,6 +87,9 @@ class DumpJsonParser {
uint32_t cur_dump_iter_{0};
bool already_parsed_{false};
// Save graphs for dump.
std::vector<session::KernelGraph *> graphs_;
void ParseCommonDumpSetting(const nlohmann::json &content);
void ParseE2eDumpSetting(const nlohmann::json &content);
bool IsDumpEnabled();
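
Taken together, ClearGraph / SaveGraph / graphs() make DumpJsonParser a small registry of the kernel graphs that need CPU e2e dump. A minimal lifecycle sketch, assuming only what this commit shows (the wrapper function is illustrative, not part of the diff):

// Sketch only: the DumpJsonParser calls are from this commit; the wrapper is illustrative.
void RegisterCpuGraphsForDump(const std::vector<KernelGraphPtr> &graphs) {
  auto &parser = DumpJsonParser::GetInstance();
  parser.ClearGraph();              // Drop the graphs registered for the previous run.
  for (const auto &graph : graphs) {
    parser.SaveGraph(graph.get());  // The registry stores raw pointers; it does not own the graphs.
  }
  // At step end, CPUE2eDump::DumpParametersAndConst() iterates parser.graphs()
  // and dumps parameters and const values for every registered graph.
}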

View File

@ -21,6 +21,7 @@
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#include "debug/data_dump/cpu_e2e_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#include "debug/debugger/debugger_utils.h"
@ -35,10 +36,22 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
MS_EXCEPTION_IF_NULL(device_context);
MS_EXCEPTION_IF_NULL(op_context);
MS_EXCEPTION_IF_NULL(from_aid);
if (!node->isa<CNode>()) {
// Call back to the from actor to process after debug finished.
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
return;
}
const auto &cnode = node->cast<CNodePtr>();
if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
MS_EXCEPTION_IF_NULL(kernel_graph);
CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
}
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger) {
std::string kernel_name = cnode->fullname_with_scope();
@ -49,8 +62,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
}
}
exec_order_ += 1;
#endif
}
// Call back to the from actor to process after debug finished.
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
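
DumpCNodeData is only declared in this commit; its body lives outside the shown hunks. A rough sketch of the per-kernel dump shape it implies, reusing the helpers from DumpSingleAnfNode above — the helper name and the output loop are assumptions, not the actual implementation:

// Hypothetical helper, not the real DumpCNodeData: assumes the kernel's outputs
// are walked with the same guards and helpers as DumpSingleAnfNode above.
void DumpCNodeOutputs(const CNodePtr &node, const std::string &dump_path) {
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(node); ++i) {
    auto addr = AnfAlgo::GetOutputAddr(node, i);
    if (addr == nullptr || addr->GetPtr() == nullptr) {
      continue;  // Same guard as above: no allocated memory means nothing to dump.
    }
    ShapeVector int_shapes;
    GetDumpIntShape(node, i, NOT_NULL(&int_shapes));
    auto type = AnfAlgo::GetOutputInferDataType(node, i);
    // Write the buffer behind addr->GetPtr() with shape int_shapes and dtype
    // `type` to a file under dump_path (assumed; the format is not shown in this diff).
    (void)type;
  }
}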
@ -58,7 +72,11 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
MS_EXCEPTION_IF_NULL(op_context);
MS_EXCEPTION_IF_NULL(from_aid);
if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
CPUE2eDump::DumpParametersAndConst();
}
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger) {
@ -67,7 +85,10 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *
exec_order_ = 0;
debugger->Debugger::PostExecuteGraphDebugger();
}
#else
DumpJsonParser::GetInstance().UpdateDumpIter();
#endif
// Call back to the from actor to process after debug finished.
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}

View File

@ -299,12 +299,9 @@ GraphId GraphCompiler::CompileGraph(const AnfNodePtrList &nodes, const AnfNodePt
GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(device_context);
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
const auto &ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
bool save_graphs = ms_context->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
// Dump .pb graph before graph optimization.
if (save_graphs) {

View File

@ -28,12 +28,14 @@
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "common/trans.h"
#include "debug/data_dump/dump_json_parser.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {
@ -480,10 +482,14 @@ void GraphScheduler::Initialize() {
(void)actorMgr->Spawn(base_recorder_actor, true);
}
#endif
// Create and schedule debug actor.
bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
#ifdef ENABLE_DEBUGGER
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
debugger_actor_need = true;
}
#endif
if (debugger_actor_need) {
auto debug_actor = std::make_shared<DebugActor>();
MS_EXCEPTION_IF_NULL(debug_actor);
debug_aid_ = &(debug_actor->GetAID());
@ -491,7 +497,6 @@ void GraphScheduler::Initialize() {
base_debug_actor->set_thread_pool(thread_pool_);
(void)actorMgr->Spawn(base_debug_actor, true);
}
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
@ -1343,7 +1348,8 @@ void GraphScheduler::LinkDataArrow(KernelActor *to_actor, const GraphCompilerInf
const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel->fullname_with_scope()));
LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
} else if (IsInternalParameter(from_kernel, graph)) {
// Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to
// link.
LinkDataArrowForInternalParameter(from_kernel, graph_compiler_info.origin_parameters_order_, graph, to_actor,
to_kernel_with_input_idx);
} else if (IsPersistentDeviceTensor(from_kernel)) {
@ -1602,8 +1608,8 @@ void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const An
if (AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimDepend) ||
AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimLoad)) {
real_depend_inputs.push_back(input_cnode->input(kDependAttachNodeIndex));
// The real input may be this scene: depend/load --> load/depend, so need add the control arrow for real input
// node in this scene.
if (AnfAlgo::IsOneOfPrimitiveCNode(input_cnode->input(kRealInputIndexInDepend), recursion_prims)) {
real_depend_inputs.push_back(input_cnode->input(kRealInputIndexInDepend));
}
@ -1707,8 +1713,8 @@ void GraphScheduler::LinkControlArrowBySendRecvNodes(const KernelGraphPtr &graph
output_actor->input_controls_num_++;
}
// In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be
// reused only when the recv node runs finished, which is expressed by the reference count increased.
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(from_allreduce_node); ++i) {
auto device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(from_allreduce_node, i, false);
MS_EXCEPTION_IF_NULL(device_tensor);

View File

@ -29,6 +29,7 @@
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#include "backend/optimizer/pass/erase_visit_attr.h"
#include "profiler/device/cpu/cpu_profiling.h"
#include "debug/data_dump/dump_json_parser.h"
namespace mindspore {
namespace device {
@ -37,8 +38,17 @@ bool CPUDeviceContext::Initialize() {
if (initialized_) {
return true;
}
mem_manager_ = std::make_shared<CPUMemoryManager>();
MS_EXCEPTION_IF_NULL(mem_manager_);
// Dump json config file if dump is enabled.
auto rank_id = GetRankID();
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
json_parser.CopyJsonToDir(rank_id);
json_parser.CopyMSCfgJsonToDir(rank_id);
initialized_ = true;
return true;
}

View File

@ -26,6 +26,9 @@ const size_t kKBToByte = 1024;
const size_t kLineMaxSize = 1024;
size_t GetSystemMemorySize(const std::string &key) {
#if defined(_WIN32) || defined(_WIN64)
return SIZE_MAX;
#else
FILE *file = fopen("/proc/meminfo", "r");
if (file == nullptr) {
MS_LOG(EXCEPTION) << "Get system meminfo failed.";
@ -53,6 +56,7 @@ size_t GetSystemMemorySize(const std::string &key) {
fclose(file);
return mem_size * kKBToByte;
#endif
}
} // namespace
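
The non-Windows branch elided above scans /proc/meminfo for the requested key. A self-contained sketch of that pattern — the sscanf format is an assumption; only fopen/fclose, kLineMaxSize, and the kB-to-byte scaling appear in this hunk:

#include <cstdio>
#include <cstring>
#include <string>

// Returns the value of `key` from /proc/meminfo in bytes, or 0 on failure
// (the real code raises MS_LOG(EXCEPTION) when the file cannot be opened).
size_t ReadMemInfoBytes(const std::string &key) {
  FILE *file = fopen("/proc/meminfo", "r");
  if (file == nullptr) {
    return 0;
  }
  size_t mem_kb = 0;
  char line[1024];  // Mirrors kLineMaxSize.
  while (fgets(line, sizeof(line), file) != nullptr) {
    // Lines look like "MemAvailable:   16384256 kB".
    if (strncmp(line, key.c_str(), key.size()) == 0) {
      (void)sscanf(line + key.size(), ": %zu", &mem_kb);
      break;
    }
  }
  fclose(file);
  return mem_kb * 1024;  // kKBToByte
}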

View File

@ -92,9 +92,10 @@ bool GPUDeviceContext::Initialize() {
(*init_nccl_comm_funcptr)();
}
// Dump json config file if dump is enabled.
auto rank_id = GetRankID();
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
json_parser.CopyJsonToDir(rank_id);
json_parser.CopyMSCfgJsonToDir(rank_id);

View File

@ -39,6 +39,8 @@
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@ -602,6 +604,23 @@ bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &graph_output, const Vec
}
return false;
}
void PrepareForDebugger(const GraphCompilerInfo &graph_compiler_info) {
#ifdef ENABLE_DEBUGGER
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
}
#endif
if (DumpJsonParser::GetInstance().e2e_dump_enabled()) {
DumpJsonParser::GetInstance().ClearGraph();
for (size_t i = 0; i < graph_compiler_info.graphs_.size(); ++i) {
if (graph_compiler_info.device_contexts_[i]->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
DumpJsonParser::GetInstance().SaveGraph(graph_compiler_info.graphs_[i].get());
}
}
}
}
} // namespace
void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
@ -719,12 +738,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
MS_EXCEPTION_IF_NULL(actor_set);
runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
// Debugger pre-execute graph.
PrepareForDebugger(graph_compiler_info);
if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
}

View File

@ -116,7 +116,6 @@ def test_cpu_e2e_dump():
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
run_e2e_dump()
class ReluReduceMeanDenseRelu(Cell):
def __init__(self, kernel, bias, in_channel, num_class):
super().__init__()