!19169 actor runtime support CPU dump
Merge pull request !19169 from limingqi107/actor_runtime2
This commit is contained in:
commit
fa0d79478d
|
@ -134,6 +134,9 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
|
|||
}
|
||||
auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
|
||||
MS_EXCEPTION_IF_NULL(addr);
|
||||
if (addr->GetPtr() == nullptr) {
|
||||
return;
|
||||
}
|
||||
ShapeVector int_shapes;
|
||||
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
|
||||
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
|
||||
|
@ -164,4 +167,11 @@ void CPUE2eDump::DumpParametersAndConst(const session::KernelGraph *graph, uint3
|
|||
DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, &const_map);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUE2eDump::DumpParametersAndConst() {
|
||||
auto &graphs = DumpJsonParser::GetInstance().graphs();
|
||||
for (auto graph : graphs) {
|
||||
DumpParametersAndConst(graph, graph->graph_id());
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -31,6 +31,8 @@ class CPUE2eDump {
|
|||
// Dump data when task error.
|
||||
static void DumpParametersAndConst(const session::KernelGraph *graph, uint32_t graph_id);
|
||||
|
||||
static void DumpParametersAndConst();
|
||||
|
||||
static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);
|
||||
|
||||
private:
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <map>
|
||||
#include <set>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/session/kernel_graph.h"
|
||||
|
@ -62,6 +63,10 @@ class DumpJsonParser {
|
|||
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
|
||||
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
|
||||
|
||||
// Remove every kernel graph pointer currently held in graphs_.
void ClearGraph() {
  graphs_.clear();
}
|
||||
// Append a kernel graph pointer to graphs_. Only the raw pointer is stored.
void SaveGraph(session::KernelGraph *graph) {
  graphs_.push_back(graph);
}
|
||||
std::vector<session::KernelGraph *> &graphs() { return graphs_; }
|
||||
|
||||
private:
|
||||
DumpJsonParser() = default;
|
||||
~DumpJsonParser() = default;
|
||||
|
@ -82,6 +87,9 @@ class DumpJsonParser {
|
|||
uint32_t cur_dump_iter_{0};
|
||||
bool already_parsed_{false};
|
||||
|
||||
// Save graphs for dump.
|
||||
std::vector<session::KernelGraph *> graphs_;
|
||||
|
||||
void ParseCommonDumpSetting(const nlohmann::json &content);
|
||||
void ParseE2eDumpSetting(const nlohmann::json &content);
|
||||
bool IsDumpEnabled();
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "runtime/framework/actor/debug_aware_actor.h"
|
||||
#include "mindrt/include/async/async.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "debug/data_dump/cpu_e2e_dump.h"
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
#include "debug/debugger/debugger.h"
|
||||
#include "debug/debugger/debugger_utils.h"
|
||||
|
@ -35,10 +36,22 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
MS_EXCEPTION_IF_NULL(device_context);
|
||||
MS_EXCEPTION_IF_NULL(op_context);
|
||||
MS_EXCEPTION_IF_NULL(from_aid);
|
||||
// todo debug.
|
||||
|
||||
if (!node->isa<CNode>()) {
|
||||
// Call back to the from actor to process after debug finished.
|
||||
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto &cnode = node->cast<CNodePtr>();
|
||||
if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
|
||||
if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
|
||||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
|
||||
}
|
||||
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
if (node->isa<CNode>()) {
|
||||
const auto &cnode = node->cast<CNodePtr>();
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger) {
|
||||
std::string kernel_name = cnode->fullname_with_scope();
|
||||
|
@ -49,8 +62,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
}
|
||||
}
|
||||
exec_order_ += 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Call back to the from actor to process after debug finished.
|
||||
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
|
||||
}
|
||||
|
@ -58,7 +72,11 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
|
||||
MS_EXCEPTION_IF_NULL(op_context);
|
||||
MS_EXCEPTION_IF_NULL(from_aid);
|
||||
// todo debug.
|
||||
|
||||
if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
|
||||
CPUE2eDump::DumpParametersAndConst();
|
||||
}
|
||||
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger) {
|
||||
|
@ -67,7 +85,10 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *
|
|||
exec_order_ = 0;
|
||||
debugger->Debugger::PostExecuteGraphDebugger();
|
||||
}
|
||||
#else
|
||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
||||
#endif
|
||||
|
||||
// Call back to the from actor to process after debug finished.
|
||||
Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
|
||||
}
|
||||
|
|
|
@ -299,12 +299,9 @@ GraphId GraphCompiler::CompileGraph(const AnfNodePtrList &nodes, const AnfNodePt
|
|||
GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(device_context);
|
||||
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
json_parser.Parse();
|
||||
|
||||
const auto &ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
|
||||
bool save_graphs = ms_context->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
|
||||
// Dump .pb graph before graph optimization.
|
||||
if (save_graphs) {
|
||||
|
|
|
@ -28,12 +28,14 @@
|
|||
#include "utils/convert_utils.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "common/trans.h"
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "debug/rdr/recorder_manager.h"
|
||||
#endif
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
#include "debug/debugger/debugger.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace runtime {
|
||||
namespace {
|
||||
|
@ -480,10 +482,14 @@ void GraphScheduler::Initialize() {
|
|||
(void)actorMgr->Spawn(base_recorder_actor, true);
|
||||
}
|
||||
#endif
|
||||
// Create and schedule debug actor.
|
||||
// Create and schedule debug actor.
|
||||
bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = mindspore::Debugger::GetInstance();
|
||||
if (debugger->DebuggerBackendEnabled()) {
|
||||
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
|
||||
debugger_actor_need = true;
|
||||
}
|
||||
#endif
|
||||
if (debugger_actor_need) {
|
||||
auto debug_actor = std::make_shared<DebugActor>();
|
||||
MS_EXCEPTION_IF_NULL(debug_actor);
|
||||
debug_aid_ = &(debug_actor->GetAID());
|
||||
|
@ -491,7 +497,6 @@ void GraphScheduler::Initialize() {
|
|||
base_debug_actor->set_thread_pool(thread_pool_);
|
||||
(void)actorMgr->Spawn(base_debug_actor, true);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
|
||||
|
@ -1343,7 +1348,8 @@ void GraphScheduler::LinkDataArrow(KernelActor *to_actor, const GraphCompilerInf
|
|||
const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel->fullname_with_scope()));
|
||||
LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
|
||||
} else if (IsInternalParameter(from_kernel, graph)) {
|
||||
// Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to link.
|
||||
// Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to
|
||||
// link.
|
||||
LinkDataArrowForInternalParameter(from_kernel, graph_compiler_info.origin_parameters_order_, graph, to_actor,
|
||||
to_kernel_with_input_idx);
|
||||
} else if (IsPersistentDeviceTensor(from_kernel)) {
|
||||
|
@ -1602,8 +1608,8 @@ void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const An
|
|||
if (AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimDepend) ||
|
||||
AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimLoad)) {
|
||||
real_depend_inputs.push_back(input_cnode->input(kDependAttachNodeIndex));
|
||||
// The real input may be this scene: depend/load --> load/depend, so need add the control arrow for real input node
|
||||
// in this scene.
|
||||
// The real input may itself be a depend/load node (depend/load --> load/depend), so a control
|
||||
// arrow must also be added for that real input node.
|
||||
if (AnfAlgo::IsOneOfPrimitiveCNode(input_cnode->input(kRealInputIndexInDepend), recursion_prims)) {
|
||||
real_depend_inputs.push_back(input_cnode->input(kRealInputIndexInDepend));
|
||||
}
|
||||
|
@ -1707,8 +1713,8 @@ void GraphScheduler::LinkControlArrowBySendRecvNodes(const KernelGraphPtr &graph
|
|||
output_actor->input_controls_num_++;
|
||||
}
|
||||
|
||||
// In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be reused
|
||||
// only when the recv node runs finished, which is expressed by the reference count increased.
|
||||
// In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be
|
||||
// reused only when the recv node runs finished, which is expressed by the reference count increased.
|
||||
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(from_allreduce_node); ++i) {
|
||||
auto device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(from_allreduce_node, i, false);
|
||||
MS_EXCEPTION_IF_NULL(device_tensor);
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "backend/optimizer/pass/replace_node_by_proxy.h"
|
||||
#include "backend/optimizer/pass/erase_visit_attr.h"
|
||||
#include "profiler/device/cpu/cpu_profiling.h"
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
|
@ -37,8 +38,17 @@ bool CPUDeviceContext::Initialize() {
|
|||
if (initialized_) {
|
||||
return true;
|
||||
}
|
||||
|
||||
mem_manager_ = std::make_shared<CPUMemoryManager>();
|
||||
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||
|
||||
// Dump json config file if dump is enabled.
|
||||
auto rank_id = GetRankID();
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
json_parser.Parse();
|
||||
json_parser.CopyJsonToDir(rank_id);
|
||||
json_parser.CopyMSCfgJsonToDir(rank_id);
|
||||
|
||||
initialized_ = true;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,9 @@ const size_t kKBToByte = 1024;
|
|||
const size_t kLineMaxSize = 1024;
|
||||
|
||||
size_t GetSystemMemorySize(const std::string &key) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return SIZE_MAX;
|
||||
#else
|
||||
FILE *file = fopen("/proc/meminfo", "r");
|
||||
if (file == nullptr) {
|
||||
MS_LOG(EXCEPTION) << "Get system meminfo failed.";
|
||||
|
@ -53,6 +56,7 @@ size_t GetSystemMemorySize(const std::string &key) {
|
|||
|
||||
fclose(file);
|
||||
return mem_size * kKBToByte;
|
||||
#endif
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
|
|
@ -92,9 +92,10 @@ bool GPUDeviceContext::Initialize() {
|
|||
(*init_nccl_comm_funcptr)();
|
||||
}
|
||||
|
||||
// Dump json config file if dump is enabled.
|
||||
auto rank_id = GetRankID();
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
// Dump json config file if dump is enabled
|
||||
json_parser.Parse();
|
||||
json_parser.CopyJsonToDir(rank_id);
|
||||
json_parser.CopyMSCfgJsonToDir(rank_id);
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@
|
|||
#ifdef ENABLE_DEBUGGER
|
||||
#include "debug/debugger/debugger.h"
|
||||
#endif
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace compile {
|
||||
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
|
||||
|
@ -602,6 +604,23 @@ bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &graph_output, const Vec
|
|||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void PrepareForDebuggr(const GraphCompilerInfo &graph_compiler_info) {
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
|
||||
Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (DumpJsonParser::GetInstance().e2e_dump_enabled()) {
|
||||
DumpJsonParser::GetInstance().ClearGraph();
|
||||
for (size_t i = 0; i < graph_compiler_info.graphs_.size(); ++i) {
|
||||
if (graph_compiler_info.device_contexts_[i]->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
|
||||
DumpJsonParser::GetInstance().SaveGraph(graph_compiler_info.graphs_[i].get());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
|
||||
|
@ -719,12 +738,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
|
|||
const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
|
||||
MS_EXCEPTION_IF_NULL(actor_set);
|
||||
runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
|
||||
// Debugger pre-execute graph.
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
|
||||
Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
|
||||
}
|
||||
#endif
|
||||
// Debugger pre-execute graph.
|
||||
PrepareForDebuggr(graph_compiler_info);
|
||||
if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
|
||||
MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
|
||||
}
|
||||
|
|
|
@ -116,7 +116,6 @@ def test_cpu_e2e_dump():
|
|||
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
|
||||
run_e2e_dump()
|
||||
|
||||
|
||||
class ReluReduceMeanDenseRelu(Cell):
|
||||
def __init__(self, kernel, bias, in_channel, num_class):
|
||||
super().__init__()
|
||||
|
|
Loading…
Reference in New Issue