!17859 Change rank id in dump path

From: @tina_mengting_zhang
Reviewed-by: @ouwenchang,@yelihua,@zhoufeng54,@yelihua,@ouwenchang
Signed-off-by: @zhoufeng54,@ouwenchang
This commit is contained in:
mindspore-ci-bot 2021-06-07 16:36:39 +08:00 committed by Gitee
commit 9193b4d997
12 changed files with 72 additions and 44 deletions

View File

@ -922,7 +922,11 @@ void AscendSession::InitRuntimeResource() {
if (!runtime_instance->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
DumpInit(device_id_);
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
rank_id_ = GetRankId();
}
DumpInit(rank_id_);
MS_LOG(INFO) << "Finish!";
}
@ -1218,14 +1222,14 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
// Forwards `kernel_graph` to E2eDump::DumpSetup, bracketed by "Start!"/"Finish!" info logs.
// The graph pointer is null-checked before its raw pointer is handed on.
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
// NOTE(review): this text looks like a rendered diff without +/- markers, so the
// device_id_ call is presumably the removed line and the rank_id_ call its
// replacement (the dump directory is then keyed by rank id) -- confirm against the real file.
E2eDump::DumpSetup(kernel_graph.get(), device_id_);
E2eDump::DumpSetup(kernel_graph.get(), rank_id_);
MS_LOG(INFO) << "Finish!";
}
// Forwards `kernel_graph` to E2eDump::DumpData, bracketed by "Start!"/"Finish!" info logs.
// The graph pointer is null-checked first; the bool returned by DumpData is not inspected here.
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
// NOTE(review): this text looks like a rendered diff without +/- markers, so the
// device_id_ call is presumably the removed line and the rank_id_ call its
// replacement -- confirm against the real file.
E2eDump::DumpData(kernel_graph.get(), device_id_);
E2eDump::DumpData(kernel_graph.get(), rank_id_);
MS_LOG(INFO) << "Finish!";
}
@ -1242,7 +1246,6 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
}
auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(kernel_runtime);
uint32_t device_id = kernel_runtime->device_id();
for (auto &graph : all_graphs) {
MS_EXCEPTION_IF_NULL(graph);
std::string name = "graph_build." + std::to_string(graph->graph_id());
@ -1256,7 +1259,7 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
}
std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
std::string target_dir = root_dir + "/graphs";
std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);

View File

@ -49,7 +49,7 @@ void CPUSession::Init(uint32_t device_id) {
// Dump json config file if dump is enabled
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
json_parser.CopyMSCfgJsonToDir(device_id);
json_parser.CopyMSCfgJsonToDir(rank_id_);
InitExecutor(kCPUDevice, device_id);
}

View File

@ -62,6 +62,7 @@
#include "debug/debugger/proto_exporter_stub.h"
#endif
#include "debug/data_dump/dump_json_parser.h"
#include "debug/data_dump/dump_utils.h"
#include "debug/tensor_load.h"
#include "debug/dump_proto.h"
#include "runtime/device/gpu/gpu_kernel_build.h"
@ -109,10 +110,13 @@ void GPUSession::Init(uint32_t device_id) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
if (collective_inited) {
rank_id_ = GetRankId();
}
auto &json_parser = DumpJsonParser::GetInstance();
// Dump json config file if dump is enabled
json_parser.CopyJsonToDir(device_id);
json_parser.CopyMSCfgJsonToDir(device_id);
json_parser.CopyJsonToDir(rank_id_);
json_parser.CopyMSCfgJsonToDir(rank_id_);
MS_LOG(INFO) << "Set device id " << device_id << " for gpu session.";
InitExecutor(kGPUDevice, device_id);
}
@ -349,7 +353,6 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
uint32_t device_id = runtime_instance->device_id();
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
// Dump .pb graph before graph optimization
@ -403,7 +406,7 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
}
if (json_parser.e2e_dump_enabled()) {
std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
std::string root_dir = json_parser.path() + "/rank_" + std::to_string(device_id);
std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
std::string target_dir = root_dir + "/graphs";
std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
@ -600,7 +603,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpData(kernel_graph.get(), device_id_, debugger_.get());
E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();
}

View File

@ -2575,4 +2575,23 @@ void DumpGraphExeOrder(const std::string &file_name, const std::string &target_d
// set file mode to read only by user
ChangeFileMode(file_path, S_IRUSR);
}
// Returns the rank id of the current process within the world communication group.
//
// The world group is chosen from the configured device target (MS_CTX_DEVICE_TARGET):
// kHcclWorldGroup for Ascend and kNcclWorldGroup for GPU. The result defaults to 0 and is
// returned unchanged when the backend is unsupported or when the communication manager
// cannot provide a rank id.
uint32_t GetRankId() {
  uint32_t rank_id = 0;
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const std::string backend = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  std::string world_group;
  if (backend == kAscendDevice) {
    world_group = kHcclWorldGroup;
  } else if (backend == kGPUDevice) {
    world_group = kNcclWorldGroup;
  } else {
    // No world group exists for this backend; do not query the communication manager
    // with an empty group name, just fall back to the default rank.
    MS_LOG(ERROR) << "Invalid backend: " << backend;
    return rank_id;
  }
  if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
    // Best effort: callers use the default rank 0 when the query fails.
    MS_LOG(INFO) << "Failed to get rank id.";
  }
  return rank_id;
}
} // namespace mindspore

View File

@ -287,6 +287,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
CallBackFunc summary_callback_;
static GraphId graph_sum_;
uint32_t device_id_;
// rank id of physical device
uint32_t rank_id_{0};
std::shared_ptr<Executor> executor_;
#if !defined(_WIN32) && !defined(_WIN64)
std::shared_ptr<Debugger> debugger_;
@ -301,5 +303,6 @@ using NamedSummaryOutputs = std::map<std::string, std::pair<AnfNodePtr, int>>;
} // namespace session
void DumpGraphExeOrder(const std::string &file_name, const std::string &target_dir,
const std::vector<CNodePtr> &execution_order);
uint32_t GetRankId();
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_SESSION_SESSION_BASIC_H

View File

@ -112,7 +112,7 @@ void DumpJsonParser::Parse() {
JudgeDumpEnabled();
}
void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
void DumpJsonParser::CopyJsonToDir(uint32_t rank_id) {
this->Parse();
if (!IsDumpEnabled()) {
return;
@ -123,8 +123,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
}
std::ifstream json_file(dump_config_file.value());
if (async_dump_enabled_ || e2e_dump_enabled_) {
auto realpath =
Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/data_dump.json");
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
if (!realpath.has_value()) {
MS_LOG(ERROR) << "Get real path failed in CopyJsonDir.";
}
@ -135,7 +134,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t device_id) {
}
}
void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
if (!IsDumpEnabled()) {
return;
}
@ -148,7 +147,7 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
}
}
std::ifstream json_file(config_path);
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json");
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
if (!realpath.has_value()) {
MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
} else {
@ -159,11 +158,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
}
}
void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t device_id) {
void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
if (!IsDumpEnabled()) {
return;
}
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/config.json");
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
if (!realpath.has_value()) {
MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
} else {

View File

@ -34,7 +34,7 @@ uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
return kernel_runtime->device_id();
}
std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
std::string net_name = dump_json_parser.net_name();
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
@ -42,9 +42,7 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
if (dump_path.back() != '/') {
dump_path += "/";
}
uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id);
dump_path +=
("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
dump_path += ("rank_" + std::to_string(rank_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
return dump_path;
}

View File

@ -27,7 +27,7 @@ namespace mindspore {
static const size_t PARAMETER_OUTPUT_INDEX = 0;
static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id = nullptr);
std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id = 0);
void GetFileKernelName(NotNull<std::string *> kernel_name);

View File

@ -236,14 +236,14 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
}
}
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
uint32_t cur_iter = dump_json_parser.cur_dump_iter();
if (dump_json_parser.AsyncDumpEnabled() && dump_json_parser.IsDumpIter(cur_iter)) {
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id());
auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);
@ -275,7 +275,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
}
}
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
bool success = false;
auto &dump_json_parser = DumpJsonParser::GetInstance();
@ -284,7 +284,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
if (dump_json_parser.GetIterDumpFlag()) {
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
MS_LOG(INFO) << "Current graph id is " << graph_id;
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
DumpInput(graph, dump_path, debugger);
DumpOutput(graph, dump_path, debugger);
@ -294,9 +294,9 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
uint32_t current_iter = dump_json_parser.cur_dump_iter();
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/" +
dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
std::to_string(current_iter);

View File

@ -34,8 +34,8 @@ class E2eDump {
public:
E2eDump() = default;
~E2eDump() = default;
static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id);
static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger);

View File

@ -26,6 +26,7 @@
#include "runtime/rt_model.h"
#include "runtime/device/ascend/ge_types_convert.h"
#include "proto/op_mapping_info.pb.h"
#include "utils/comm_manager.h"
#include "utils/ms_context.h"
#include "debug/data_dump/dump_json_parser.h"
#ifdef ENABLE_DEBUGGER
@ -138,8 +139,15 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
MS_LOG(EXCEPTION) << "Dump path invalid";
}
uint32_t graph_id = kernel_graph_->graph_id();
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
uint32_t rank_id = 0;
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
// get actual rank id if hccl is initialized.
if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
MS_LOG(INFO) << "Failed to get rank id.";
}
}
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(rank_id) + "/");
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
dump_info->set_model_name("_");
dump_info->set_dump_step("0");

View File

@ -65,8 +65,7 @@ def test_async_dump():
dump_path = pwd + "/async_dump"
change_current_dump_json('async_dump.json', dump_path)
os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + "/async_dump.json"
device_id = context.get_context("device_id")
dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
dump_file_path = dump_path + '/rank_0/Net/0/0/'
if os.path.isdir(dump_path):
shutil.rmtree(dump_path)
add = Net()
@ -82,11 +81,7 @@ def run_e2e_dump():
dump_path = pwd + '/e2e_dump'
change_current_dump_json('e2e_dump.json', dump_path)
os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + '/e2e_dump.json'
if context.get_context("device_target") == "Ascend":
device_id = context.get_context("device_id")
else:
device_id = 0
dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
dump_file_path = dump_path + '/rank_0/Net/0/0/'
if os.path.isdir(dump_path):
shutil.rmtree(dump_path)
add = Net()
@ -159,8 +154,8 @@ def test_async_dump_net_multi_layer_mode1():
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
test_name = "test_async_dump_net_multi_layer_mode1"
json_file = os.path.join(os.getcwd(), "{}.json".format(test_name))
device_id = context.get_context("device_id")
dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, device_id))
rank_id = 0
dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, rank_id))
os.system("rm -rf {}/*".format(dump_full_path))
os.environ["MINDSPORE_DUMP_CONFIG"] = json_file
weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
@ -176,7 +171,7 @@ def test_async_dump_net_multi_layer_mode1():
label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
net_dict = train_network(inputs, label)
dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, device_id)
dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, rank_id)
dump_file = os.listdir(dump_path)
dump_file_name = ""
for file in dump_file: