forked from mindspore-Ecosystem/mindspore
!17376 Unified dump entry, async dump multiple steps in different dirs, and async dump multiple specific steps in one training session - finer grain control
From: @john_tzanakakis Reviewed-by: @mikef,@pandoublefeng Signed-off-by: @pandoublefeng
This commit is contained in:
commit
103869fbc4
|
@ -3,17 +3,10 @@
|
|||
"dump_mode": 0,
|
||||
"path": "/test",
|
||||
"net_name": "ResNet50",
|
||||
"iteration": 0,
|
||||
"iteration": "0",
|
||||
"input_output": 2,
|
||||
"kernels": ["Default/Conv-op12"],
|
||||
"support_device": [0,1,2,3,4,5,6,7]
|
||||
},
|
||||
"e2e_dump_settings": {
|
||||
"enable": false,
|
||||
"trans_flag": false
|
||||
},
|
||||
"async_dump_settings": {
|
||||
"enable": false,
|
||||
"support_device": [0,1,2,3,4,5,6,7],
|
||||
"op_debug_mode": 0
|
||||
}
|
||||
}
|
|
@ -1198,6 +1198,7 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
|||
}
|
||||
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance);
|
||||
DumpSetup(kernel_graph);
|
||||
bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
|
||||
Dump(kernel_graph);
|
||||
if (!ret_ok) {
|
||||
|
@ -1209,6 +1210,13 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
|||
MS_LOG(INFO) << "Finish!";
|
||||
}
|
||||
|
||||
// Runs the dump setup step for a kernel graph by forwarding the raw graph pointer and this
// session's device id to E2eDump::DumpSetup. Raises if kernel_graph is null.
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
  const auto *graph_ptr = kernel_graph.get();
  E2eDump::DumpSetup(graph_ptr, device_id_);
  MS_LOG(INFO) << "Finish!";
}
|
||||
|
||||
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||
MS_LOG(INFO) << "Start!";
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
|
|
|
@ -89,6 +89,7 @@ class AscendSession : public SessionBasic {
|
|||
void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
|
||||
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||
void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
|
||||
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||
// below functions are used for run op
|
||||
|
|
|
@ -107,9 +107,8 @@ void DumpJsonParser::Parse() {
|
|||
std::string cfg = ss.str();
|
||||
MS_LOG(INFO) << "Dump json:" << cfg;
|
||||
|
||||
ParseCommonDumpSetting(j);
|
||||
ParseAsyncDumpSetting(j);
|
||||
ParseE2eDumpSetting(j);
|
||||
ParseCommonDumpSetting(j);
|
||||
JudgeDumpEnabled();
|
||||
}
|
||||
|
||||
|
@ -214,6 +213,14 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
|
|||
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
|
||||
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
|
||||
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
|
||||
auto op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
|
||||
|
||||
// async_dump is enabled by default, if e2e dump is enabled it will override this
|
||||
auto context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
|
||||
async_dump_enabled_ = true;
|
||||
}
|
||||
|
||||
ParseDumpMode(*dump_mode);
|
||||
ParseDumpPath(*path);
|
||||
|
@ -222,34 +229,29 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
|
|||
ParseInputOutput(*input_output);
|
||||
ParseKernels(*kernels);
|
||||
ParseSupportDevice(*support_device);
|
||||
}
|
||||
|
||||
// Reads the optional "async_dump_settings" section of the dump config.
// When the section is absent nothing is changed; otherwise both the enable flag and the
// op debug mode keys are looked up (via CheckJsonKeyExist) and then applied.
void DumpJsonParser::ParseAsyncDumpSetting(const nlohmann::json &content) {
  const auto settings_iter = content.find(kAsyncDumpSettings);
  if (settings_iter == content.end()) {
    // The section is optional, so its absence is only worth an informational log.
    MS_LOG(INFO) << "No async_dump_settings";
    return;
  }

  const auto &settings = *settings_iter;
  auto enable_iter = CheckJsonKeyExist(settings, kEnable);
  auto debug_mode_iter = CheckJsonKeyExist(settings, kOpDebugMode);

  async_dump_enabled_ = ParseEnable(*enable_iter);
  ParseOpDebugMode(*debug_mode_iter);
}
|
||||
|
||||
// Reads the "e2e_dump_settings" section of the dump config.
//
// The section is mandatory when the device target is GPU (an exception is raised if it is
// missing) and optional otherwise. When present, the enable flag and trans_flag are parsed
// into e2e_dump_enabled_ and trans_flag_. Enabling e2e dump on Ascend only emits a
// deprecation warning.
void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
  auto e2e_dump_setting = content.find(kE2eDumpSettings);
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (e2e_dump_setting == content.end()) {
    // The device check must run before returning, otherwise a GPU config without
    // e2e_dump_settings would be silently accepted.
    if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
      MS_LOG(EXCEPTION) << "e2e_dump_settings needed for GPU dump";
    } else {
      MS_LOG(INFO) << "No e2e_dump_settings";
      return;
    }
  }

  auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
  auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);

  e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
  if (e2e_dump_enabled_ && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
    MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
  }
  trans_flag_ = ParseEnable(*trans_flag);
}
|
||||
|
||||
|
@ -304,8 +306,68 @@ void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
|
|||
}
|
||||
|
||||
// Parses the "iteration" entry of the common dump settings, which must be a JSON string.
//
// - e2e (sync) dump enabled: only a single iteration number is accepted; "all", ranges
//   ("a-b") and lists ("a|b") raise an exception. The number is stored in iteration_.
// - otherwise, async dump enabled: the raw string is stored in async_iteration_ and must
//   not be empty.
// - neither mode enabled: an exception is raised.
void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
  // The value is a string so that "all", ranges and lists can be expressed; requiring an
  // unsigned number here as well would reject every valid config.
  CheckJsonStringType(content, kIteration);
  if (e2e_dump_enabled_) {
    std::string temp_iter = content;
    // is this a single iteration
    if (temp_iter != "all" && temp_iter.find("-") == std::string::npos && temp_iter.find("|") == std::string::npos) {
      // std::stoul yields unsigned long; make the narrowing to the 32-bit member explicit.
      iteration_ = static_cast<uint32_t>(std::stoul(temp_iter));
    } else {
      MS_LOG(EXCEPTION) << "Can only use a single value for the iteration in sync mode.";
    }
  } else if (async_dump_enabled_) {
    async_iteration_ = content;
    if (async_iteration_.empty()) {
      MS_LOG(EXCEPTION) << "In async dump settings json file, iteration is empty";
    }
  } else {
    MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
  }
}
|
||||
|
||||
bool DumpJsonParser::IsDumpIter(uint32_t iteration) {
|
||||
// bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
|
||||
if (async_iteration_ == "all") {
|
||||
return true;
|
||||
}
|
||||
int start = 0;
|
||||
int end = async_iteration_.find("|");
|
||||
while (end != -1) {
|
||||
std::string temp = async_iteration_.substr(start, end - start);
|
||||
int range_idx = temp.find("-");
|
||||
if (range_idx != -1) {
|
||||
uint32_t low_range = std::stoul(temp.substr(0, range_idx));
|
||||
uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
|
||||
if ((low_range <= iteration) && (iteration <= high_range)) {
|
||||
return true;
|
||||
}
|
||||
} else if (iteration == std::stoul(temp)) {
|
||||
return true;
|
||||
}
|
||||
start = end + 1;
|
||||
end = async_iteration_.find("|", start);
|
||||
}
|
||||
std::string temp = async_iteration_.substr(start, end - start);
|
||||
int range_idx = temp.find("-");
|
||||
if (range_idx != -1) {
|
||||
uint32_t low_range = std::stoul(temp.substr(0, range_idx));
|
||||
uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
|
||||
if ((low_range <= iteration) && (iteration <= high_range)) {
|
||||
return true;
|
||||
}
|
||||
} else if (iteration == std::stoul(temp)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool DumpJsonParser::IsSingleIter() {
|
||||
// bool DumpJsonParser::IsSingleIter() --> checks if iteration in json dump file is single or not.
|
||||
if (async_iteration_ != "all" && async_iteration_.find("-") == std::string::npos &&
|
||||
async_iteration_.find("|") == std::string::npos) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
|
||||
|
|
|
@ -41,6 +41,9 @@ class DumpJsonParser {
|
|||
bool NeedDump(const std::string &op_full_name) const;
|
||||
void MatchKernel(const std::string &kernel_name);
|
||||
void PrintUnusedKernel();
|
||||
bool IsDumpIter(uint32_t iteration);
|
||||
bool DumpAllIter();
|
||||
bool IsSingleIter();
|
||||
|
||||
bool async_dump_enabled() const { return async_dump_enabled_; }
|
||||
bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
|
||||
|
@ -58,6 +61,7 @@ class DumpJsonParser {
|
|||
bool OutputNeedDump() const;
|
||||
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
|
||||
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
|
||||
bool AsyncDumpEnabled() const { return async_dump_enabled_; }
|
||||
|
||||
private:
|
||||
DumpJsonParser() = default;
|
||||
|
@ -71,6 +75,7 @@ class DumpJsonParser {
|
|||
std::string path_;
|
||||
std::string net_name_;
|
||||
uint32_t iteration_{0};
|
||||
std::string async_iteration_;
|
||||
uint32_t input_output_{0};
|
||||
std::map<std::string, uint32_t> kernels_;
|
||||
std::set<uint32_t> support_devices_;
|
||||
|
@ -80,7 +85,6 @@ class DumpJsonParser {
|
|||
bool already_parsed_{false};
|
||||
|
||||
void ParseCommonDumpSetting(const nlohmann::json &content);
|
||||
void ParseAsyncDumpSetting(const nlohmann::json &content);
|
||||
void ParseE2eDumpSetting(const nlohmann::json &content);
|
||||
bool IsDumpEnabled();
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#include "common/trans.h"
|
||||
#include "debug/common.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/device/kernel_runtime_manager.h"
|
||||
|
@ -235,6 +236,38 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
|
|||
}
|
||||
}
|
||||
|
||||
// Prepares the dump directory layout for the current iteration of an async dump.
//
// Does nothing unless async dump is enabled and the current dump iteration is selected by the
// configured iteration specification. Otherwise it:
//   1. creates <path>/rank_<device_id>/<net_name>/<graph_id>, and
//   2. symlinks <path>/rank_<device_id>/<net_name>/<graph_id>/<cur_iter> to
//      <path>/rank_<device_id>/_/<graph_id>/0.
// Raises an exception if graph is null (when setup is needed), if the directory cannot be
// created, or if the symlink command fails.
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  uint32_t cur_iter = dump_json_parser.cur_dump_iter();
  if (!dump_json_parser.AsyncDumpEnabled() || !dump_json_parser.IsDumpIter(cur_iter)) {
    return;
  }
  // graph is dereferenced below for its id, so reject a null pointer up front.
  if (graph == nullptr) {
    MS_LOG(EXCEPTION) << "The pointer[graph] is null.";
  }

  const std::string rank_dir = dump_json_parser.path() + "/rank_" + std::to_string(device_id);
  const std::string graph_id_str = std::to_string(graph->graph_id());

  // NOTE(review): presumably the directory the device runtime writes into ("_" model name,
  // step "0") — confirm against the data dumper settings.
  auto zero_dir_dump_path = rank_dir + "/_/" + graph_id_str + "/0";
  auto root_cur_iter_dump_path = rank_dir + "/" + dump_json_parser.net_name() + "/" + graph_id_str;
  auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);

  MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
  MS_LOG(INFO) << "root_cur_iter_dump_path: " << root_cur_iter_dump_path;
  MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;

  // create the parent directory that will hold the per-iteration entries
  bool status = Common::CreateNotExistDirs(root_cur_iter_dump_path);
  if (!status) {
    MS_LOG(EXCEPTION) << "Failed at CreateNotExistDirs for " << root_cur_iter_dump_path;
  }

  // create symlink to active dump dir for the iteration in final dump dir
  // NOTE(review): the paths are interpolated into a shell command without quoting, so a dump
  // path or net name containing spaces or shell metacharacters will break (or alter) the
  // command; consider a direct filesystem call instead of system().
  std::string command = "ln -fs " + zero_dir_dump_path + " " + cur_iter_dump_path;
  MS_LOG(INFO) << "ln command: " << command;
  if (system(command.c_str())) {
    MS_LOG(EXCEPTION) << "failed to create symlink to active dump dir for the iteration in final dump dir.";
  }
}
|
||||
|
||||
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||
|
@ -245,16 +278,60 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
|
|||
if (starting_graph_id == graph_id) {
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
}
|
||||
if (!dump_json_parser.GetIterDumpFlag()) {
|
||||
|
||||
if (dump_json_parser.GetIterDumpFlag()) {
|
||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
||||
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
|
||||
|
||||
DumpInput(graph, dump_path, debugger);
|
||||
DumpOutput(graph, dump_path, debugger);
|
||||
DumpParametersAndConst(graph, dump_path, debugger);
|
||||
return true;
|
||||
} else if (dump_json_parser.AsyncDumpEnabled()) {
|
||||
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1;
|
||||
|
||||
auto zero_dir_dump_path =
|
||||
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
|
||||
|
||||
auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
|
||||
dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
|
||||
std::to_string(prev_dump_iter);
|
||||
|
||||
MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
|
||||
MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;
|
||||
|
||||
if (dump_json_parser.IsDumpIter(prev_dump_iter)) {
|
||||
// remove symlink to active dump dir
|
||||
std::string command = "rm -f " + cur_iter_dump_path;
|
||||
MS_LOG(INFO) << "rm command: " << command;
|
||||
if (system(command.c_str())) {
|
||||
MS_LOG(EXCEPTION) << "failed to remove symlink to active dump dir.";
|
||||
}
|
||||
|
||||
// create actual dir for iteration in final dump dir
|
||||
bool status = Common::CreateNotExistDirs(cur_iter_dump_path);
|
||||
if (!status) {
|
||||
MS_LOG(EXCEPTION) << "failed at CreateNotExistDirs for " << cur_iter_dump_path;
|
||||
}
|
||||
|
||||
// move contents from active dump dir to final dump dir
|
||||
command = "mv " + zero_dir_dump_path + "/* " + cur_iter_dump_path + "/.";
|
||||
MS_LOG(INFO) << "mv command: " << command;
|
||||
if (system(command.c_str())) {
|
||||
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
|
||||
}
|
||||
} else {
|
||||
// delete contents from active dump dir
|
||||
std::string command = "rm -f " + zero_dir_dump_path + "/*";
|
||||
MS_LOG(INFO) << "rm command: " << command;
|
||||
if (system(command.c_str())) {
|
||||
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
||||
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
|
||||
|
||||
DumpInput(graph, dump_path, debugger);
|
||||
DumpOutput(graph, dump_path, debugger);
|
||||
DumpParametersAndConst(graph, dump_path, debugger);
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
||||
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
|
@ -33,6 +34,7 @@ class E2eDump {
|
|||
public:
|
||||
E2eDump() = default;
|
||||
~E2eDump() = default;
|
||||
static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
|
||||
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
|
||||
// Dump data when task error.
|
||||
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
|
|
|
@ -141,8 +141,8 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
|
|||
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
|
||||
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
|
||||
dump_info->set_model_name(DumpJsonParser::GetInstance().net_name());
|
||||
dump_info->set_dump_step(std::to_string(DumpJsonParser::GetInstance().iteration()));
|
||||
dump_info->set_model_name("_");
|
||||
dump_info->set_dump_step("0");
|
||||
dump_info->set_model_id(graph_id);
|
||||
dump_info->set_flag(kAicpuLoadFlag);
|
||||
|
||||
|
|
|
@ -3,13 +3,10 @@
|
|||
"dump_mode": 0,
|
||||
"path": "/test",
|
||||
"net_name": "Net",
|
||||
"iteration": 0,
|
||||
"iteration": "0",
|
||||
"input_output": 2,
|
||||
"kernels": ["Default/TensorAdd-op3"],
|
||||
"support_device": [0,1,2,3,4,5,6,7]
|
||||
},
|
||||
"async_dump_settings": {
|
||||
"enable": true,
|
||||
"support_device": [0,1,2,3,4,5,6,7],
|
||||
"op_debug_mode": 0
|
||||
}
|
||||
}
|
|
@ -3,10 +3,11 @@
|
|||
"dump_mode": 0,
|
||||
"path": "/test",
|
||||
"net_name": "Net",
|
||||
"iteration": 0,
|
||||
"iteration": "0",
|
||||
"input_output": 0,
|
||||
"kernels": ["Default/Conv-op12"],
|
||||
"support_device": [0,1,2,3,4,5,6,7]
|
||||
"support_device": [0,1,2,3,4,5,6,7],
|
||||
"op_debug_mode": 0
|
||||
},
|
||||
"e2e_dump_settings": {
|
||||
"enable": true,
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
"dump_mode": 0,
|
||||
"path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
|
||||
"net_name": "test",
|
||||
"iteration": 0,
|
||||
"iteration": "0",
|
||||
"input_output": 2,
|
||||
"kernels": [
|
||||
"default/TensorAdd-op10",
|
||||
|
@ -12,14 +12,7 @@
|
|||
"Default/optimizer-Momentum/tuple_getitem-op29",
|
||||
"Default/optimizer-Momentum/ApplyMomentum-op12"
|
||||
],
|
||||
"support_device": [0,1,2,3,4,5,6,7]
|
||||
},
|
||||
"async_dump_settings": {
|
||||
"enable": true,
|
||||
"support_device": [0,1,2,3,4,5,6,7],
|
||||
"op_debug_mode": 0
|
||||
},
|
||||
"e2e_dump_settings": {
|
||||
"enable": false,
|
||||
"trans_flag": false
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue