diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index e7ee0c2eb04..8422d133b4a 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -922,7 +922,11 @@ void AscendSession::InitRuntimeResource() {
   if (!runtime_instance->Init()) {
     MS_LOG(EXCEPTION) << "Kernel runtime init error.";
   }
-  DumpInit(device_id_);
+  auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
+  if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
+    rank_id_ = GetRankId();
+  }
+  DumpInit(rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

@@ -1218,14 +1222,14 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
 void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  E2eDump::DumpSetup(kernel_graph.get(), device_id_);
+  E2eDump::DumpSetup(kernel_graph.get(), rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

 void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  E2eDump::DumpData(kernel_graph.get(), device_id_);
+  E2eDump::DumpData(kernel_graph.get(), rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

@@ -1242,7 +1246,6 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
   }
   auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(kernel_runtime);
-  uint32_t device_id = kernel_runtime->device_id();
   for (auto &graph : all_graphs) {
     MS_EXCEPTION_IF_NULL(graph);
     std::string name = "graph_build." + std::to_string(graph->graph_id());
@@ -1256,7 +1259,7 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
     }
     std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
     if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
-      std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
+      std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
       std::string target_dir = root_dir + "/graphs";
       std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
       DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc
index f8a453dee35..73d9c0da7c2 100644
--- a/mindspore/ccsrc/backend/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/session/cpu_session.cc
@@ -49,7 +49,7 @@ void CPUSession::Init(uint32_t device_id) {
   // Dump json config file if dump is enabled
   auto &json_parser = DumpJsonParser::GetInstance();
   json_parser.Parse();
-  json_parser.CopyMSCfgJsonToDir(device_id);
+  json_parser.CopyMSCfgJsonToDir(rank_id_);
   InitExecutor(kCPUDevice, device_id);
 }

diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 5186edb0647..4b06ae4b511 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -61,6 +61,7 @@
 #include "debug/debugger/proto_exporter_stub.h"
 #endif
 #include "debug/data_dump/dump_json_parser.h"
+#include "debug/data_dump/dump_utils.h"
 #include "debug/tensor_load.h"
 #include "debug/dump_proto.h"
 #include "runtime/device/gpu/gpu_kernel_build.h"
@@ -108,10 +109,13 @@ void GPUSession::Init(uint32_t device_id) {
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
+  if (collective_inited) {
+    rank_id_ = GetRankId();
+  }
   auto &json_parser = DumpJsonParser::GetInstance();
   // Dump json config file if dump is enabled
-  json_parser.CopyJsonToDir(device_id);
-  json_parser.CopyMSCfgJsonToDir(device_id);
+  json_parser.CopyJsonToDir(rank_id_);
+  json_parser.CopyMSCfgJsonToDir(rank_id_);
   MS_LOG(INFO) << "Set device id " << device_id << " for gpu session.";
   InitExecutor(kGPUDevice, device_id);
 }
@@ -346,7 +350,6 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  uint32_t device_id = runtime_instance->device_id();
   auto &json_parser = DumpJsonParser::GetInstance();
   json_parser.Parse();
   // Dump .pb graph before graph optimization
@@ -400,7 +403,7 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
   }
   if (json_parser.e2e_dump_enabled()) {
     std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
-    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
+    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
     std::string target_dir = root_dir + "/graphs";
     std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
     DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
@@ -597,7 +600,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
 void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   if (debugger_->DebuggerBackendEnabled()) {
     MS_EXCEPTION_IF_NULL(kernel_graph);
-    E2eDump::DumpData(kernel_graph.get(), device_id_, debugger_.get());
+    E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
   } else {
     DumpJsonParser::GetInstance().UpdateDumpIter();
   }
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index b0dde397842..d21806475e7 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -2575,4 +2575,23 @@ void DumpGraphExeOrder(const std::string &file_name, const std::string &target_d
   // set file mode to read only by user
   ChangeFileMode(file_path, S_IRUSR);
 }
+
+uint32_t GetRankId() {
+  uint32_t rank_id = 0;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  std::string world_group;
+  std::string backend = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  if (backend == kAscendDevice) {
+    world_group = kHcclWorldGroup;
+  } else if (backend == kGPUDevice) {
+    world_group = kNcclWorldGroup;
+  } else {
+    MS_LOG(ERROR) << "Invalid backend: " << backend;
+  }
+  if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
+    MS_LOG(INFO) << "Failed to get rank id.";
+  }
+  return rank_id;
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index 2a3e22b1430..2959d91d4b3 100644
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -287,6 +287,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   CallBackFunc summary_callback_;
   static GraphId graph_sum_;
   uint32_t device_id_;
+  // rank id of physical device
+  uint32_t rank_id_{0};
   std::shared_ptr<Executor> executor_;
 #if !defined(_WIN32) && !defined(_WIN64)
   std::shared_ptr<Debugger> debugger_;
@@ -301,5 +303,6 @@ using NamedSummaryOutputs = std::map<std::string, std::pair<AnfNodePtr, int>>;
 }  // namespace session
 void DumpGraphExeOrder(const std::string &file_name, const std::string &target_dir,
                        const std::vector<CNodePtr> &execution_order);
+uint32_t GetRankId();
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_SESSION_SESSION_BASIC_H
diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
index 16ced75e5b4..0f7c3265397 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
@@ -112,7 +112,7 @@ void DumpJsonParser::Parse() {
   JudgeDumpEnabled();
 }

-void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyJsonToDir(uint32_t rank_id) {
   this->Parse();
   if (!IsDumpEnabled()) {
     return;
   }
@@ -123,8 +123,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
   }
   std::ifstream json_file(dump_config_file.value());
   if (async_dump_enabled_ || e2e_dump_enabled_) {
-    auto realpath =
-      Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/data_dump.json");
+    auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
     if (!realpath.has_value()) {
       MS_LOG(ERROR) << "Get real path failed in CopyJsonDir.";
     }
@@ -135,7 +134,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
   }
 }

-void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
   if (!IsDumpEnabled()) {
     return;
   }
@@ -148,7 +147,7 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
     }
   }
   std::ifstream json_file(config_path);
-  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json");
+  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
   if (!realpath.has_value()) {
     MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
   } else {
@@ -159,11 +158,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
   }
 }

-void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
   if (!IsDumpEnabled()) {
     return;
   }
-  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/config.json");
+  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
   if (!realpath.has_value()) {
     MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
   } else {
diff --git a/mindspore/ccsrc/debug/data_dump/dump_utils.cc b/mindspore/ccsrc/debug/data_dump/dump_utils.cc
index 279e9566e93..fd7eeba3c72 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_utils.cc
+++ b/mindspore/ccsrc/debug/data_dump/dump_utils.cc
@@ -34,7 +34,7 @@ uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
   return kernel_runtime->device_id();
 }

-std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
+std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id) {
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   std::string net_name = dump_json_parser.net_name();
   std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
@@ -42,9 +42,7 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
   if (dump_path.back() != '/') {
     dump_path += "/";
   }
-  uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id);
-  dump_path +=
-    ("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
+  dump_path += ("rank_" + std::to_string(rank_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
   return dump_path;
 }
diff --git a/mindspore/ccsrc/debug/data_dump/dump_utils.h b/mindspore/ccsrc/debug/data_dump/dump_utils.h
index 1a887755ebd..66f904d3a66 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_utils.h
+++ b/mindspore/ccsrc/debug/data_dump/dump_utils.h
@@ -27,7 +27,7 @@ namespace mindspore {
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
 static const size_t VALUE_NODE_OUTPUT_INDEX = 0;

-std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id = nullptr);
+std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id = 0);

 void GetFileKernelName(NotNull<std::string *> kernel_name);

diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
index 7c4361c40c8..3f7e86db25f 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@@ -236,14 +236,14 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
   }
 }

-void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
+void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   uint32_t cur_iter = dump_json_parser.cur_dump_iter();
   if (dump_json_parser.AsyncDumpEnabled() && dump_json_parser.IsDumpIter(cur_iter)) {
     auto zero_dir_dump_path =
-      dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
+      dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

-    auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
+    auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
                                    dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id());
     auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);
@@ -275,7 +275,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
   }
 }

-bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
+bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
   bool success = false;
   auto &dump_json_parser = DumpJsonParser::GetInstance();
@@ -284,7 +284,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
   if (dump_json_parser.GetIterDumpFlag()) {
     MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
     MS_LOG(INFO) << "Current graph id is " << graph_id;
-    std::string dump_path = GenerateDumpPath(graph_id, &device_id);
+    std::string dump_path = GenerateDumpPath(graph_id, rank_id);
     DumpInput(graph, dump_path, debugger);
     DumpOutput(graph, dump_path, debugger);
@@ -294,9 +294,9 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
   uint32_t current_iter = dump_json_parser.cur_dump_iter();

   auto zero_dir_dump_path =
-    dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
+    dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

-  auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
+  auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
                             dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
                             std::to_string(current_iter);

diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
index c7e1543aaf6..15b60136c27 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
@@ -34,8 +34,8 @@ class E2eDump {
  public:
   E2eDump() = default;
   ~E2eDump() = default;
-  static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
-  static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
+  static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id);
+  static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
   // Dump data when task error.
   static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
                             std::string *kernel_name, const Debugger *debugger);
diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
index f4404cd7d60..124c938f9b7 100644
--- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
@@ -26,6 +26,7 @@
 #include "runtime/rt_model.h"
 #include "runtime/device/ascend/ge_types_convert.h"
 #include "proto/op_mapping_info.pb.h"
+#include "utils/comm_manager.h"
 #include "utils/ms_context.h"
 #include "debug/data_dump/dump_json_parser.h"
 #ifdef ENABLE_DEBUGGER
@@ -138,8 +139,15 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
     MS_LOG(EXCEPTION) << "Dump path invalid";
   }
   uint32_t graph_id = kernel_graph_->graph_id();
-  auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-  dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
+  uint32_t rank_id = 0;
+  auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
+  if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
+    // get the actual rank id if hccl has been initialized.
+    if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
+      MS_LOG(INFO) << "Failed to get rank id.";
+    }
+  }
+  dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(rank_id) + "/");
   MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
   dump_info->set_model_name("_");
   dump_info->set_dump_step("0");
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index 6394c41c149..b5f9bd4ee90 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -65,8 +65,7 @@ def test_async_dump():
     dump_path = pwd + "/async_dump"
     change_current_dump_json('async_dump.json', dump_path)
     os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + "/async_dump.json"
-    device_id = context.get_context("device_id")
-    dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
+    dump_file_path = dump_path + '/rank_0/Net/0/0/'
     if os.path.isdir(dump_path):
         shutil.rmtree(dump_path)
     add = Net()
@@ -82,11 +81,7 @@ def run_e2e_dump():
     dump_path = pwd + '/e2e_dump'
     change_current_dump_json('e2e_dump.json', dump_path)
    os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + '/e2e_dump.json'
-    if context.get_context("device_target") == "Ascend":
-        device_id = context.get_context("device_id")
-    else:
-        device_id = 0
-    dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
+    dump_file_path = dump_path + '/rank_0/Net/0/0/'
     if os.path.isdir(dump_path):
         shutil.rmtree(dump_path)
     add = Net()
@@ -159,8 +154,8 @@ def test_async_dump_net_multi_layer_mode1():
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     test_name = "test_async_dump_net_multi_layer_mode1"
     json_file = os.path.join(os.getcwd(), "{}.json".format(test_name))
-    device_id = context.get_context("device_id")
-    dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, device_id))
+    rank_id = 0
+    dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, rank_id))
     os.system("rm -rf {}/*".format(dump_full_path))
     os.environ["MINDSPORE_DUMP_CONFIG"] = json_file
     weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
@@ -176,7 +171,7 @@ def test_async_dump_net_multi_layer_mode1():
     label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
     net_dict = train_network(inputs, label)

-    dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, device_id)
+    dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, rank_id)
     dump_file = os.listdir(dump_path)
     dump_file_name = ""
     for file in dump_file: