forked from mindspore-Ecosystem/mindspore
!17859 Change rank id in dump path
From: @tina_mengting_zhang  Reviewed-by: @ouwenchang, @yelihua, @zhoufeng54  Signed-off-by: @zhoufeng54, @ouwenchang
Commit: 9193b4d997
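The effect of this change: every dump output path is keyed by the global rank id instead of the physical device id, so distributed (HCCL/NCCL) jobs write to rank_<rank_id> directories and single-process runs keep writing to rank_0. A minimal standalone sketch of the resulting layout, using a hypothetical BuildDumpRoot helper rather than the framework's GenerateDumpPath:

// Hypothetical sketch, not MindSpore code: how the dump root directory is
// assembled after this change. rank_id comes from the collective runtime and
// falls back to 0 for single-process runs.
#include <cstdint>
#include <iostream>
#include <string>

std::string BuildDumpRoot(const std::string &dump_path, uint32_t rank_id,
                          const std::string &net_name, uint32_t graph_id, uint32_t iteration) {
  // Layout: <dump_path>/rank_<rank_id>/<net_name>/<graph_id>/<iteration>
  return dump_path + "/rank_" + std::to_string(rank_id) + "/" + net_name + "/" +
         std::to_string(graph_id) + "/" + std::to_string(iteration);
}

int main() {
  std::cout << BuildDumpRoot("/tmp/async_dump", 0, "Net", 0, 0) << "\n";  // non-distributed run
  std::cout << BuildDumpRoot("/tmp/async_dump", 5, "Net", 0, 0) << "\n";  // 6th process of a distributed job
}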
@@ -922,7 +922,11 @@ void AscendSession::InitRuntimeResource() {
   if (!runtime_instance->Init()) {
     MS_LOG(EXCEPTION) << "Kernel runtime init error.";
   }
-  DumpInit(device_id_);
+  auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
+  if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
+    rank_id_ = GetRankId();
+  }
+  DumpInit(rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

@@ -1218,14 +1222,14 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
 void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  E2eDump::DumpSetup(kernel_graph.get(), device_id_);
+  E2eDump::DumpSetup(kernel_graph.get(), rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

 void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  E2eDump::DumpData(kernel_graph.get(), device_id_);
+  E2eDump::DumpData(kernel_graph.get(), rank_id_);
   MS_LOG(INFO) << "Finish!";
 }

@@ -1242,7 +1246,6 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
   }
   auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(kernel_runtime);
-  uint32_t device_id = kernel_runtime->device_id();
   for (auto &graph : all_graphs) {
     MS_EXCEPTION_IF_NULL(graph);
     std::string name = "graph_build." + std::to_string(graph->graph_id());

@@ -1256,7 +1259,7 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
     }
     std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
     if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
-      std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
+      std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
       std::string target_dir = root_dir + "/graphs";
       std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
       DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
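On Ascend the rank id is only refreshed when MS_ENABLE_HCCL is set to a non-zero value; otherwise rank_id_ keeps its default of 0 and dumps stay under rank_0. A minimal standalone sketch of that gating, assuming plain std::getenv and a hypothetical ResolveRankId helper in place of common::GetEnv and GetRankId:

// Hypothetical sketch of the MS_ENABLE_HCCL gate added in InitRuntimeResource.
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

uint32_t ResolveRankId(uint32_t rank_from_comm_manager) {
  const char *env = std::getenv("MS_ENABLE_HCCL");
  const std::string hccl_mode = (env == nullptr) ? "" : env;
  if (!hccl_mode.empty() && hccl_mode != "0") {
    return rank_from_comm_manager;  // distributed: use the global rank
  }
  return 0;  // single-process: keep the default rank 0
}

int main() {
  std::cout << "dump dir: rank_" << ResolveRankId(3) << "\n";
}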
@@ -49,7 +49,7 @@ void CPUSession::Init(uint32_t device_id) {
   // Dump json config file if dump is enabled
   auto &json_parser = DumpJsonParser::GetInstance();
   json_parser.Parse();
-  json_parser.CopyMSCfgJsonToDir(device_id);
+  json_parser.CopyMSCfgJsonToDir(rank_id_);
   InitExecutor(kCPUDevice, device_id);
 }
@@ -62,6 +62,7 @@
 #include "debug/debugger/proto_exporter_stub.h"
 #endif
 #include "debug/data_dump/dump_json_parser.h"
+#include "debug/data_dump/dump_utils.h"
 #include "debug/tensor_load.h"
 #include "debug/dump_proto.h"
 #include "runtime/device/gpu/gpu_kernel_build.h"

@@ -109,10 +110,13 @@ void GPUSession::Init(uint32_t device_id) {
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
+  if (collective_inited) {
+    rank_id_ = GetRankId();
+  }
   auto &json_parser = DumpJsonParser::GetInstance();
   // Dump json config file if dump is enabled
-  json_parser.CopyJsonToDir(device_id);
-  json_parser.CopyMSCfgJsonToDir(device_id);
+  json_parser.CopyJsonToDir(rank_id_);
+  json_parser.CopyMSCfgJsonToDir(rank_id_);
   MS_LOG(INFO) << "Set device id " << device_id << " for gpu session.";
   InitExecutor(kGPUDevice, device_id);
 }

@@ -349,7 +353,6 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  uint32_t device_id = runtime_instance->device_id();
   auto &json_parser = DumpJsonParser::GetInstance();
   json_parser.Parse();
   // Dump .pb graph before graph optimization

@@ -403,7 +406,7 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
   }
   if (json_parser.e2e_dump_enabled()) {
     std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
-    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
+    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
     std::string target_dir = root_dir + "/graphs";
     std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
     DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);

@@ -600,7 +603,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
 void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   if (debugger_->DebuggerBackendEnabled()) {
     MS_EXCEPTION_IF_NULL(kernel_graph);
-    E2eDump::DumpData(kernel_graph.get(), device_id_, debugger_.get());
+    E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
   } else {
     DumpJsonParser::GetInstance().UpdateDumpIter();
   }
@@ -2575,4 +2575,23 @@ void DumpGraphExeOrder(const std::string &file_name, const std::string &target_d
   // set file mode to read only by user
   ChangeFileMode(file_path, S_IRUSR);
 }
+
+uint32_t GetRankId() {
+  uint32_t rank_id = 0;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  std::string world_group;
+  std::string backend = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  if (backend == kAscendDevice) {
+    world_group = kHcclWorldGroup;
+  } else if (backend == kGPUDevice) {
+    world_group = kNcclWorldGroup;
+  } else {
+    MS_LOG(ERROR) << "Invalid backend: " << backend;
+  }
+  if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
+    MS_LOG(INFO) << "Failed to get rank id.";
+  }
+  return rank_id;
+}
 } // namespace mindspore
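GetRankId selects the world group from the device target (kHcclWorldGroup for Ascend, kNcclWorldGroup for GPU) and leaves the rank at 0 when the query fails, so non-distributed runs still dump under rank_0. A standalone behavioral sketch with a stubbed communication manager (the helper name, stub, and world-group strings are illustrative, not the real CommManager API):

// Hypothetical sketch of the fallback behavior of GetRankId().
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

using RankQuery = std::function<bool(const std::string &, uint32_t *)>;

uint32_t QueryRankId(const std::string &backend, const RankQuery &get_rank_id) {
  uint32_t rank_id = 0;
  std::string world_group;
  if (backend == "Ascend") {
    world_group = "hccl_world_group";  // stands in for kHcclWorldGroup
  } else if (backend == "GPU") {
    world_group = "nccl_world_group";  // stands in for kNcclWorldGroup
  }
  if (!get_rank_id(world_group, &rank_id)) {
    rank_id = 0;  // comm manager unavailable: fall back to rank 0
  }
  return rank_id;
}

int main() {
  RankQuery comm_ok = [](const std::string &, uint32_t *rank) { *rank = 2; return true; };
  RankQuery comm_uninitialized = [](const std::string &, uint32_t *) { return false; };
  std::cout << QueryRankId("GPU", comm_ok) << "\n";                // 2
  std::cout << QueryRankId("Ascend", comm_uninitialized) << "\n";  // 0
}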
@@ -287,6 +287,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   CallBackFunc summary_callback_;
   static GraphId graph_sum_;
   uint32_t device_id_;
+  // rank id of physical device
+  uint32_t rank_id_{0};
   std::shared_ptr<Executor> executor_;
 #if !defined(_WIN32) && !defined(_WIN64)
   std::shared_ptr<Debugger> debugger_;

@@ -301,5 +303,6 @@ using NamedSummaryOutputs = std::map<std::string, std::pair<AnfNodePtr, int>>;
 } // namespace session
 void DumpGraphExeOrder(const std::string &file_name, const std::string &target_dir,
                        const std::vector<CNodePtr> &execution_order);
+uint32_t GetRankId();
 } // namespace mindspore
 #endif // MINDSPORE_CCSRC_BACKEND_SESSION_SESSION_BASIC_H
@@ -112,7 +112,7 @@ void DumpJsonParser::Parse() {
   JudgeDumpEnabled();
 }

-void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyJsonToDir(uint32_t rank_id) {
   this->Parse();
   if (!IsDumpEnabled()) {
     return;

@@ -123,8 +123,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
   }
   std::ifstream json_file(dump_config_file.value());
   if (async_dump_enabled_ || e2e_dump_enabled_) {
-    auto realpath =
-      Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/data_dump.json");
+    auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
     if (!realpath.has_value()) {
       MS_LOG(ERROR) << "Get real path failed in CopyJsonDir.";
     }

@@ -135,7 +134,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
   }
 }

-void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
   if (!IsDumpEnabled()) {
     return;
   }

@@ -148,7 +147,7 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
     }
   }
   std::ifstream json_file(config_path);
-  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json");
+  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
   if (!realpath.has_value()) {
     MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
   } else {

@@ -159,11 +158,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
   }
 }

-void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t device_id) {
+void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
   if (!IsDumpEnabled()) {
     return;
   }
-  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/config.json");
+  auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
   if (!realpath.has_value()) {
     MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
   } else {
@@ -34,7 +34,7 @@ uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
   return kernel_runtime->device_id();
 }

-std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
+std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id) {
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   std::string net_name = dump_json_parser.net_name();
   std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());

@@ -42,9 +42,7 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
   if (dump_path.back() != '/') {
     dump_path += "/";
   }
-  uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id);
-  dump_path +=
-    ("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
+  dump_path += ("rank_" + std::to_string(rank_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
   return dump_path;
 }
@@ -27,7 +27,7 @@ namespace mindspore {
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
 static const size_t VALUE_NODE_OUTPUT_INDEX = 0;

-std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id = nullptr);
+std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id = 0);

 void GetFileKernelName(NotNull<std::string *> kernel_name);
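The GenerateDumpPath change above replaces the optional device-id pointer (and the physical-device conversion) with a rank id passed by value that defaults to 0, so existing single-device call sites keep working unchanged. A side-by-side sketch under hypothetical names, simplified to drop the physical-device lookup:

// Hypothetical sketch contrasting the old and new GenerateDumpPath signatures.
#include <cstdint>
#include <iostream>
#include <string>

std::string GenerateDumpPathOld(uint32_t graph_id, const uint32_t *device_id = nullptr) {
  uint32_t physical_device = (device_id == nullptr) ? 0 : *device_id;  // old null-check
  return "dump/rank_" + std::to_string(physical_device) + "/" + std::to_string(graph_id);
}

std::string GenerateDumpPathNew(uint32_t graph_id, uint32_t rank_id = 0) {
  return "dump/rank_" + std::to_string(rank_id) + "/" + std::to_string(graph_id);
}

int main() {
  std::cout << GenerateDumpPathOld(0) << "\n";     // dump/rank_0/0
  std::cout << GenerateDumpPathNew(0) << "\n";     // dump/rank_0/0 (default rank)
  std::cout << GenerateDumpPathNew(0, 7) << "\n";  // dump/rank_7/0
}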
@@ -236,14 +236,14 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
   }
 }

-void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
+void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   uint32_t cur_iter = dump_json_parser.cur_dump_iter();
   if (dump_json_parser.AsyncDumpEnabled() && dump_json_parser.IsDumpIter(cur_iter)) {
     auto zero_dir_dump_path =
-      dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
+      dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

-    auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
+    auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
                                    dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id());

     auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);

@@ -275,7 +275,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
   }
 }

-bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
+bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
   bool success = false;
   auto &dump_json_parser = DumpJsonParser::GetInstance();

@@ -284,7 +284,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
   if (dump_json_parser.GetIterDumpFlag()) {
     MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
     MS_LOG(INFO) << "Current graph id is " << graph_id;
-    std::string dump_path = GenerateDumpPath(graph_id, &device_id);
+    std::string dump_path = GenerateDumpPath(graph_id, rank_id);

     DumpInput(graph, dump_path, debugger);
     DumpOutput(graph, dump_path, debugger);

@@ -294,9 +294,9 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
   uint32_t current_iter = dump_json_parser.cur_dump_iter();

   auto zero_dir_dump_path =
-    dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
+    dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

-  auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
+  auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
                             dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
                             std::to_string(current_iter);
@@ -34,8 +34,8 @@ class E2eDump {
  public:
   E2eDump() = default;
   ~E2eDump() = default;
-  static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
-  static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
+  static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id);
+  static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
   // Dump data when task error.
   static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
                             std::string *kernel_name, const Debugger *debugger);
@@ -26,6 +26,7 @@
 #include "runtime/rt_model.h"
 #include "runtime/device/ascend/ge_types_convert.h"
 #include "proto/op_mapping_info.pb.h"
+#include "utils/comm_manager.h"
 #include "utils/ms_context.h"
 #include "debug/data_dump/dump_json_parser.h"
 #ifdef ENABLE_DEBUGGER

@@ -138,8 +139,15 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
     MS_LOG(EXCEPTION) << "Dump path invalid";
   }
   uint32_t graph_id = kernel_graph_->graph_id();
-  auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-  dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
+  uint32_t rank_id = 0;
+  auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
+  if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
+    // get actual rank id if hccl is initiated.
+    if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
+      MS_LOG(INFO) << "Failed to get rank id.";
+    }
+  }
+  dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(rank_id) + "/");
   MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
   dump_info->set_model_name("_");
   dump_info->set_dump_step("0");
@@ -65,8 +65,7 @@ def test_async_dump():
     dump_path = pwd + "/async_dump"
     change_current_dump_json('async_dump.json', dump_path)
     os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + "/async_dump.json"
-    device_id = context.get_context("device_id")
-    dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
+    dump_file_path = dump_path + '/rank_0/Net/0/0/'
     if os.path.isdir(dump_path):
         shutil.rmtree(dump_path)
     add = Net()

@@ -82,11 +81,7 @@ def run_e2e_dump():
     dump_path = pwd + '/e2e_dump'
     change_current_dump_json('e2e_dump.json', dump_path)
     os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + '/e2e_dump.json'
-    if context.get_context("device_target") == "Ascend":
-        device_id = context.get_context("device_id")
-    else:
-        device_id = 0
-    dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
+    dump_file_path = dump_path + '/rank_0/Net/0/0/'
     if os.path.isdir(dump_path):
         shutil.rmtree(dump_path)
     add = Net()

@@ -159,8 +154,8 @@ def test_async_dump_net_multi_layer_mode1():
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     test_name = "test_async_dump_net_multi_layer_mode1"
    json_file = os.path.join(os.getcwd(), "{}.json".format(test_name))
-    device_id = context.get_context("device_id")
-    dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, device_id))
+    rank_id = 0
+    dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, rank_id))
     os.system("rm -rf {}/*".format(dump_full_path))
     os.environ["MINDSPORE_DUMP_CONFIG"] = json_file
     weight = Tensor(np.ones((1000, 2048)).astype(np.float32))

@@ -176,7 +171,7 @@ def test_async_dump_net_multi_layer_mode1():
     label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
     net_dict = train_network(inputs, label)

-    dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, device_id)
+    dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, rank_id)
     dump_file = os.listdir(dump_path)
     dump_file_name = ""
     for file in dump_file: