Unified dump entry, async dump of multiple steps into different dirs, and async dump of multiple specific steps in one training session (finer-grained control)

This commit is contained in:
John Tzanakakis 2021-04-21 16:24:47 -04:00
parent 583857799f
commit 1d62e1653e
11 changed files with 195 additions and 57 deletions

View File

@ -3,17 +3,10 @@
"dump_mode": 0,
"path": "/test",
"net_name": "ResNet50",
"iteration": 0,
"iteration": "0",
"input_output": 2,
"kernels": ["Default/Conv-op12"],
"support_device": [0,1,2,3,4,5,6,7]
},
"e2e_dump_settings": {
"enable": false,
"trans_flag": false
},
"async_dump_settings": {
"enable": false,
"support_device": [0,1,2,3,4,5,6,7],
"op_debug_mode": 0
}
}

View File

@ -1198,6 +1198,7 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
}
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DumpSetup(kernel_graph);
bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
Dump(kernel_graph);
if (!ret_ok) {
@ -1209,6 +1210,13 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
MS_LOG(INFO) << "Finish!";
}
// Pre-execution dump hook: validates the graph handle and forwards the raw graph pointer together with this
// session's device id to E2eDump::DumpSetup. Start/finish are traced at INFO level.
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto *raw_graph = kernel_graph.get();
  E2eDump::DumpSetup(raw_graph, device_id_);
  MS_LOG(INFO) << "Finish!";
}
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);

View File

@ -89,6 +89,7 @@ class AscendSession : public SessionBasic {
void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
// below functions are used for run op

View File

@ -107,9 +107,8 @@ void DumpJsonParser::Parse() {
std::string cfg = ss.str();
MS_LOG(INFO) << "Dump json:" << cfg;
ParseCommonDumpSetting(j);
ParseAsyncDumpSetting(j);
ParseE2eDumpSetting(j);
ParseCommonDumpSetting(j);
JudgeDumpEnabled();
}
@ -214,6 +213,14 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
auto op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
// async_dump is enabled by default, if e2e dump is enabled it will override this
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
async_dump_enabled_ = true;
}
ParseDumpMode(*dump_mode);
ParseDumpPath(*path);
@ -222,34 +229,29 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
ParseInputOutput(*input_output);
ParseKernels(*kernels);
ParseSupportDevice(*support_device);
}
// Reads the optional async_dump_settings section of the dump config.
// When the section is absent the parser state is left untouched; otherwise both the enable key and the
// op_debug_mode key are looked up first and then applied in that order.
void DumpJsonParser::ParseAsyncDumpSetting(const nlohmann::json &content) {
  const auto section_iter = content.find(kAsyncDumpSettings);
  if (section_iter == content.end()) {
    MS_LOG(INFO) << "No async_dump_settings";
    return;
  }
  const auto &section = *section_iter;
  auto enable_iter = CheckJsonKeyExist(section, kEnable);
  auto debug_mode_iter = CheckJsonKeyExist(section, kOpDebugMode);
  async_dump_enabled_ = ParseEnable(*enable_iter);
  ParseOpDebugMode(*debug_mode_iter);
}
// Reads the e2e_dump_settings section of the dump config: e2e_dump_enabled_ is taken from its enable key and
// trans_flag_ from its trans_flag key.
void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
auto e2e_dump_setting = content.find(kE2eDumpSettings);
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
// Section missing from the config.
if (e2e_dump_setting == content.end()) {
MS_LOG(INFO) << "No e2e_dump_settings";
return;
// NOTE(review): as listed, the log/return just above exits unconditionally, so the device-target check below can
// never run. This looks like the old and new lines of a change shown together - confirm which pair is current.
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
MS_LOG(EXCEPTION) << "e2e_dump_settings needed for GPU dump";
} else {
MS_LOG(INFO) << "No e2e_dump_settings";
return;
}
}
// Both keys are looked up before either value is applied.
auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
// Enabling e2e (synchronous) dump on an Ascend target only produces a deprecation warning here.
if (e2e_dump_enabled_ && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
}
trans_flag_ = ParseEnable(*trans_flag);
}
@ -304,8 +306,68 @@ void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
}
// Stores the configured "iteration" value.
// With e2e dump enabled only a single number is accepted and it is stored in iteration_; otherwise, with async dump
// enabled, the raw string is stored in async_iteration_. If neither mode is enabled an exception is logged.
void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
// NOTE(review): as listed, the value is checked as an unsigned number and assigned to iteration_, then immediately
// checked as a string. These two checks contradict each other; this looks like the old and new lines of a change
// shown together - confirm which check is current.
CheckJsonUnsignedType(content, kIteration);
iteration_ = content;
CheckJsonStringType(content, kIteration);
if (e2e_dump_enabled_) {
std::string temp_iter = content;
// A single iteration is any value other than "all" that contains neither a range ('-') nor a list ('|') marker.
// NOTE(review): the text is not verified to be numeric before std::stoul; presumably a non-numeric string makes
// std::stoul throw - confirm and consider validating. async_iteration_ is not assigned in this branch.
if (temp_iter != "all" && temp_iter.find("-") == std::string::npos && temp_iter.find("|") == std::string::npos) {
iteration_ = std::stoul(temp_iter);
} else {
MS_LOG(EXCEPTION) << "Can only use a single value for the iteration in sync mode.";
}
} else if (async_dump_enabled_) {
// Async mode keeps the full specification string; only an empty string is rejected here.
async_iteration_ = content;
if (async_iteration_.empty()) {
MS_LOG(EXCEPTION) << "In async dump settings json file, iteration is empty";
}
} else {
MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
}
}
// Reads one unsigned decimal number from the front of `text`. Leading whitespace and an optional '+' are skipped and
// anything after the digits is ignored, which is what std::stoul (used by the previous implementation) accepted.
// Returns false, leaving *value untouched, when no digit is found. Values beyond the uint32_t range saturate just
// above it, so such a bound compares greater than every possible iteration instead of wrapping around.
static bool ParseIterationBound(const std::string &text, uint64_t *value) {
  size_t pos = text.find_first_not_of(" \t\n\v\f\r");
  if (pos == std::string::npos) {
    return false;
  }
  if (text[pos] == '+') {
    ++pos;
  }
  if (pos >= text.size() || text[pos] < '0' || text[pos] > '9') {
    return false;
  }
  const uint64_t kSaturated = 1ULL << 32;
  uint64_t parsed = 0;
  for (; pos < text.size() && text[pos] >= '0' && text[pos] <= '9'; ++pos) {
    parsed = parsed * 10 + static_cast<uint64_t>(text[pos] - '0');
    if (parsed > kSaturated) {
      parsed = kSaturated;
    }
  }
  *value = parsed;
  return true;
}

// Reports whether `iteration` is selected by a single item of the iteration specification: either one number "N"
// or an inclusive range "A-B". An item whose bounds cannot be parsed selects nothing.
static bool IterationMatchesItem(const std::string &item, uint32_t iteration) {
  uint64_t low = 0;
  const size_t dash = item.find('-');
  if (dash == std::string::npos) {
    return ParseIterationBound(item, &low) && low == iteration;
  }
  uint64_t high = 0;
  if (!ParseIterationBound(item.substr(0, dash), &low) || !ParseIterationBound(item.substr(dash + 1), &high)) {
    return false;
  }
  return low <= iteration && iteration <= high;
}

// Checks whether `iteration` should be dumped according to async_iteration_.
// The specification is either "all" or a '|'-separated list whose items are single numbers or inclusive "A-B"
// ranges, e.g. "0|5-8|100". Returns true as soon as one item selects `iteration`.
// Empty or malformed items (including an empty specification) select nothing instead of letting std::stoul throw.
bool DumpJsonParser::IsDumpIter(uint32_t iteration) {
  if (async_iteration_ == "all") {
    return true;
  }
  size_t start = 0;
  while (true) {
    // size_t is kept for the position so that std::string::npos is compared directly rather than narrowed to int.
    const size_t bar = async_iteration_.find('|', start);
    const size_t length = (bar == std::string::npos) ? std::string::npos : bar - start;
    if (IterationMatchesItem(async_iteration_.substr(start, length), iteration)) {
      return true;
    }
    if (bar == std::string::npos) {
      return false;
    }
    start = bar + 1;
  }
}
// Tells whether async_iteration_ names exactly one iteration, i.e. it is not "all" and contains neither a range
// marker ('-') nor a list separator ('|').
bool DumpJsonParser::IsSingleIter() {
  if (async_iteration_ == "all") {
    return false;
  }
  const bool has_range = async_iteration_.find("-") != std::string::npos;
  const bool has_list = async_iteration_.find("|") != std::string::npos;
  return !has_range && !has_list;
}
void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {

View File

@ -41,6 +41,9 @@ class DumpJsonParser {
bool NeedDump(const std::string &op_full_name) const;
void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();
bool IsDumpIter(uint32_t iteration);
bool DumpAllIter();
bool IsSingleIter();
bool async_dump_enabled() const { return async_dump_enabled_; }
bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
@ -58,6 +61,7 @@ class DumpJsonParser {
bool OutputNeedDump() const;
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
bool AsyncDumpEnabled() const { return async_dump_enabled_; }
private:
DumpJsonParser() = default;
@ -71,6 +75,7 @@ class DumpJsonParser {
std::string path_;
std::string net_name_;
uint32_t iteration_{0};
std::string async_iteration_;
uint32_t input_output_{0};
std::map<std::string, uint32_t> kernels_;
std::set<uint32_t> support_devices_;
@ -80,7 +85,6 @@ class DumpJsonParser {
bool already_parsed_{false};
void ParseCommonDumpSetting(const nlohmann::json &content);
void ParseAsyncDumpSetting(const nlohmann::json &content);
void ParseE2eDumpSetting(const nlohmann::json &content);
bool IsDumpEnabled();

View File

@ -22,6 +22,7 @@
#include "debug/data_dump/dump_json_parser.h"
#include "common/trans.h"
#include "debug/common.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
@ -235,6 +236,38 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
}
}
// Prepares the dump location for the current iteration before the graph is run.
// Does nothing unless async dump is enabled and the parser's current dump iteration is selected for dumping.
// Otherwise it creates <path>/rank_<device_id>/<net_name>/<graph_id> and, inside it, a symlink named after the
// current iteration that points at the active dump dir <path>/rank_<device_id>/_/<graph_id>/0.
// Failures to create the directory or the symlink are reported through MS_LOG(EXCEPTION).
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  uint32_t cur_iter = dump_json_parser.cur_dump_iter();
  if (!dump_json_parser.AsyncDumpEnabled() || !dump_json_parser.IsDumpIter(cur_iter)) {
    return;
  }
  // graph is dereferenced below; guard it the same way DumpData does.
  MS_EXCEPTION_IF_NULL(graph);

  const std::string rank_dir = dump_json_parser.path() + "/rank_" + std::to_string(device_id);
  const std::string graph_id_str = std::to_string(graph->graph_id());
  auto zero_dir_dump_path = rank_dir + "/_/" + graph_id_str + "/0";
  auto root_cur_iter_dump_path = rank_dir + "/" + dump_json_parser.net_name() + "/" + graph_id_str;
  auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);

  MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
  MS_LOG(INFO) << "root_cur_iter_dump_path: " << root_cur_iter_dump_path;
  MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;

  // create cur_iter_dump_path dirs
  bool status = Common::CreateNotExistDirs(root_cur_iter_dump_path);
  if (!status) {
    MS_LOG(EXCEPTION) << "Failed at CreateNotExistDirs for " << root_cur_iter_dump_path;
    return;
  }

  // create symlink to active dump dir for the iteration in final dump dir
  // NOTE(review): the paths come from the dump config and are spliced into a shell command unquoted; a path with
  // spaces or shell metacharacters will break or alter the command. Consider a direct symlink call instead.
  std::string command = "ln -fs " + zero_dir_dump_path + " " + cur_iter_dump_path;
  MS_LOG(INFO) << "ln command: " << command;
  if (system(command.c_str())) {
    MS_LOG(EXCEPTION) << "failed to create symlink to active dump dir for the iteration in final dump dir.";
  }
}
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
auto &dump_json_parser = DumpJsonParser::GetInstance();
@ -245,16 +278,60 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
if (starting_graph_id == graph_id) {
dump_json_parser.UpdateDumpIter();
}
if (!dump_json_parser.GetIterDumpFlag()) {
if (dump_json_parser.GetIterDumpFlag()) {
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
MS_LOG(INFO) << "Current graph id is " << graph_id;
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
DumpInput(graph, dump_path, debugger);
DumpOutput(graph, dump_path, debugger);
DumpParametersAndConst(graph, dump_path, debugger);
return true;
} else if (dump_json_parser.AsyncDumpEnabled()) {
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1;
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
std::to_string(prev_dump_iter);
MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;
if (dump_json_parser.IsDumpIter(prev_dump_iter)) {
// remove symlink to active dump dir
std::string command = "rm -f " + cur_iter_dump_path;
MS_LOG(INFO) << "rm command: " << command;
if (system(command.c_str())) {
MS_LOG(EXCEPTION) << "failed to remove symlink to active dump dir.";
}
// create actual dir for iteration in final dump dir
bool status = Common::CreateNotExistDirs(cur_iter_dump_path);
if (!status) {
MS_LOG(EXCEPTION) << "failed at CreateNotExistDirs for " << cur_iter_dump_path;
}
// move contents from active dump dir to final dump dir
command = "mv " + zero_dir_dump_path + "/* " + cur_iter_dump_path + "/.";
MS_LOG(INFO) << "mv command: " << command;
if (system(command.c_str())) {
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
}
} else {
// delete contents from active dump dir
std::string command = "rm -f " + zero_dir_dump_path + "/*";
MS_LOG(INFO) << "rm command: " << command;
if (system(command.c_str())) {
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
}
}
return true;
}
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
MS_LOG(INFO) << "Current graph id is " << graph_id;
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
DumpInput(graph, dump_path, debugger);
DumpOutput(graph, dump_path, debugger);
DumpParametersAndConst(graph, dump_path, debugger);
return true;
return false;
}
} // namespace mindspore

View File

@ -17,6 +17,7 @@
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
#include <stdlib.h>
#include <map>
#include <string>
@ -33,6 +34,7 @@ class E2eDump {
public:
E2eDump() = default;
~E2eDump() = default;
static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,

View File

@ -141,8 +141,8 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
dump_info->set_model_name(DumpJsonParser::GetInstance().net_name());
dump_info->set_dump_step(std::to_string(DumpJsonParser::GetInstance().iteration()));
dump_info->set_model_name("_");
dump_info->set_dump_step("0");
dump_info->set_model_id(graph_id);
dump_info->set_flag(kAicpuLoadFlag);

View File

@ -3,13 +3,10 @@
"dump_mode": 0,
"path": "/test",
"net_name": "Net",
"iteration": 0,
"iteration": "0",
"input_output": 2,
"kernels": ["Default/TensorAdd-op3"],
"support_device": [0,1,2,3,4,5,6,7]
},
"async_dump_settings": {
"enable": true,
"support_device": [0,1,2,3,4,5,6,7],
"op_debug_mode": 0
}
}

View File

@ -3,10 +3,11 @@
"dump_mode": 0,
"path": "/test",
"net_name": "Net",
"iteration": 0,
"iteration": "0",
"input_output": 0,
"kernels": ["Default/Conv-op12"],
"support_device": [0,1,2,3,4,5,6,7]
"support_device": [0,1,2,3,4,5,6,7],
"op_debug_mode": 0
},
"e2e_dump_settings": {
"enable": true,

View File

@ -3,7 +3,7 @@
"dump_mode": 0,
"path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
"net_name": "test",
"iteration": 0,
"iteration": "0",
"input_output": 2,
"kernels": [
"default/TensorAdd-op10",
@ -12,14 +12,7 @@
"Default/optimizer-Momentum/tuple_getitem-op29",
"Default/optimizer-Momentum/ApplyMomentum-op12"
],
"support_device": [0,1,2,3,4,5,6,7]
},
"async_dump_settings": {
"enable": true,
"support_device": [0,1,2,3,4,5,6,7],
"op_debug_mode": 0
},
"e2e_dump_settings": {
"enable": false,
"trans_flag": false
}
}