forked from mindspore-Ecosystem/mindspore
Unify the dump entry point; support async dump of multiple steps into separate directories, and async dump of multiple specific steps within one training session, for finer-grained control
This commit is contained in:
parent
583857799f
commit
1d62e1653e
|
@ -3,17 +3,10 @@
|
||||||
"dump_mode": 0,
|
"dump_mode": 0,
|
||||||
"path": "/test",
|
"path": "/test",
|
||||||
"net_name": "ResNet50",
|
"net_name": "ResNet50",
|
||||||
"iteration": 0,
|
"iteration": "0",
|
||||||
"input_output": 2,
|
"input_output": 2,
|
||||||
"kernels": ["Default/Conv-op12"],
|
"kernels": ["Default/Conv-op12"],
|
||||||
"support_device": [0,1,2,3,4,5,6,7]
|
"support_device": [0,1,2,3,4,5,6,7],
|
||||||
},
|
|
||||||
"e2e_dump_settings": {
|
|
||||||
"enable": false,
|
|
||||||
"trans_flag": false
|
|
||||||
},
|
|
||||||
"async_dump_settings": {
|
|
||||||
"enable": false,
|
|
||||||
"op_debug_mode": 0
|
"op_debug_mode": 0
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1198,6 +1198,7 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
||||||
}
|
}
|
||||||
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
|
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
|
||||||
MS_EXCEPTION_IF_NULL(runtime_instance);
|
MS_EXCEPTION_IF_NULL(runtime_instance);
|
||||||
|
DumpSetup(kernel_graph);
|
||||||
bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
|
bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
|
||||||
Dump(kernel_graph);
|
Dump(kernel_graph);
|
||||||
if (!ret_ok) {
|
if (!ret_ok) {
|
||||||
|
@ -1209,6 +1210,13 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
||||||
MS_LOG(INFO) << "Finish!";
|
MS_LOG(INFO) << "Finish!";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Runs the dump setup step for `kernel_graph` on this session's device.
// Called from Execute() right before the runtime runs the graph.
// Raises through MS_EXCEPTION_IF_NULL when `kernel_graph` is null.
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // E2eDump works on the raw graph pointer; ownership stays with the shared_ptr.
  const auto *raw_graph = kernel_graph.get();
  E2eDump::DumpSetup(raw_graph, device_id_);
  MS_LOG(INFO) << "Finish!";
}
|
||||||
|
|
||||||
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||||
MS_LOG(INFO) << "Start!";
|
MS_LOG(INFO) << "Start!";
|
||||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||||
|
|
|
@ -89,6 +89,7 @@ class AscendSession : public SessionBasic {
|
||||||
void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
|
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
|
||||||
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
|
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
|
void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
|
||||||
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
// below functions are used for run op
|
// below functions are used for run op
|
||||||
|
|
|
@ -107,9 +107,8 @@ void DumpJsonParser::Parse() {
|
||||||
std::string cfg = ss.str();
|
std::string cfg = ss.str();
|
||||||
MS_LOG(INFO) << "Dump json:" << cfg;
|
MS_LOG(INFO) << "Dump json:" << cfg;
|
||||||
|
|
||||||
ParseCommonDumpSetting(j);
|
|
||||||
ParseAsyncDumpSetting(j);
|
|
||||||
ParseE2eDumpSetting(j);
|
ParseE2eDumpSetting(j);
|
||||||
|
ParseCommonDumpSetting(j);
|
||||||
JudgeDumpEnabled();
|
JudgeDumpEnabled();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,6 +213,14 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
|
||||||
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
|
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
|
||||||
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
|
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
|
||||||
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
|
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
|
||||||
|
auto op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
|
||||||
|
|
||||||
|
// async_dump is enabled by default, if e2e dump is enabled it will override this
|
||||||
|
auto context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(context);
|
||||||
|
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
|
||||||
|
async_dump_enabled_ = true;
|
||||||
|
}
|
||||||
|
|
||||||
ParseDumpMode(*dump_mode);
|
ParseDumpMode(*dump_mode);
|
||||||
ParseDumpPath(*path);
|
ParseDumpPath(*path);
|
||||||
|
@ -222,34 +229,29 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
|
||||||
ParseInputOutput(*input_output);
|
ParseInputOutput(*input_output);
|
||||||
ParseKernels(*kernels);
|
ParseKernels(*kernels);
|
||||||
ParseSupportDevice(*support_device);
|
ParseSupportDevice(*support_device);
|
||||||
}
|
|
||||||
|
|
||||||
void DumpJsonParser::ParseAsyncDumpSetting(const nlohmann::json &content) {
|
|
||||||
// async dump setting is optional
|
|
||||||
auto async_dump_setting = content.find(kAsyncDumpSettings);
|
|
||||||
if (async_dump_setting == content.end()) {
|
|
||||||
MS_LOG(INFO) << "No async_dump_settings";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto async_dump_enable = CheckJsonKeyExist(*async_dump_setting, kEnable);
|
|
||||||
auto op_debug_mode = CheckJsonKeyExist(*async_dump_setting, kOpDebugMode);
|
|
||||||
|
|
||||||
async_dump_enabled_ = ParseEnable(*async_dump_enable);
|
|
||||||
ParseOpDebugMode(*op_debug_mode);
|
ParseOpDebugMode(*op_debug_mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
|
void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
|
||||||
auto e2e_dump_setting = content.find(kE2eDumpSettings);
|
auto e2e_dump_setting = content.find(kE2eDumpSettings);
|
||||||
|
auto context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(context);
|
||||||
if (e2e_dump_setting == content.end()) {
|
if (e2e_dump_setting == content.end()) {
|
||||||
MS_LOG(INFO) << "No e2e_dump_settings";
|
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
|
||||||
return;
|
MS_LOG(EXCEPTION) << "e2e_dump_settings needed for GPU dump";
|
||||||
|
} else {
|
||||||
|
MS_LOG(INFO) << "No e2e_dump_settings";
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
|
auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
|
||||||
auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
|
auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
|
||||||
|
|
||||||
e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
|
e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
|
||||||
|
if (e2e_dump_enabled_ && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
|
||||||
|
MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
|
||||||
|
}
|
||||||
trans_flag_ = ParseEnable(*trans_flag);
|
trans_flag_ = ParseEnable(*trans_flag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -304,8 +306,68 @@ void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
|
void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
|
||||||
CheckJsonUnsignedType(content, kIteration);
|
CheckJsonStringType(content, kIteration);
|
||||||
iteration_ = content;
|
if (e2e_dump_enabled_) {
|
||||||
|
std::string temp_iter = content;
|
||||||
|
// is this a single iteration
|
||||||
|
if (temp_iter != "all" && temp_iter.find("-") == std::string::npos && temp_iter.find("|") == std::string::npos) {
|
||||||
|
iteration_ = std::stoul(temp_iter);
|
||||||
|
} else {
|
||||||
|
MS_LOG(EXCEPTION) << "Can only use a single value for the iteration in sync mode.";
|
||||||
|
}
|
||||||
|
} else if (async_dump_enabled_) {
|
||||||
|
async_iteration_ = content;
|
||||||
|
if (async_iteration_.empty()) {
|
||||||
|
MS_LOG(EXCEPTION) << "In async dump settings json file, iteration is empty";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool DumpJsonParser::IsDumpIter(uint32_t iteration) {
|
||||||
|
// bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
|
||||||
|
if (async_iteration_ == "all") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
int start = 0;
|
||||||
|
int end = async_iteration_.find("|");
|
||||||
|
while (end != -1) {
|
||||||
|
std::string temp = async_iteration_.substr(start, end - start);
|
||||||
|
int range_idx = temp.find("-");
|
||||||
|
if (range_idx != -1) {
|
||||||
|
uint32_t low_range = std::stoul(temp.substr(0, range_idx));
|
||||||
|
uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
|
||||||
|
if ((low_range <= iteration) && (iteration <= high_range)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else if (iteration == std::stoul(temp)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
start = end + 1;
|
||||||
|
end = async_iteration_.find("|", start);
|
||||||
|
}
|
||||||
|
std::string temp = async_iteration_.substr(start, end - start);
|
||||||
|
int range_idx = temp.find("-");
|
||||||
|
if (range_idx != -1) {
|
||||||
|
uint32_t low_range = std::stoul(temp.substr(0, range_idx));
|
||||||
|
uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
|
||||||
|
if ((low_range <= iteration) && (iteration <= high_range)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else if (iteration == std::stoul(temp)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool DumpJsonParser::IsSingleIter() {
|
||||||
|
// bool DumpJsonParser::IsSingleIter() --> checks if iteration in json dump file is single or not.
|
||||||
|
if (async_iteration_ != "all" && async_iteration_.find("-") == std::string::npos &&
|
||||||
|
async_iteration_.find("|") == std::string::npos) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
|
void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
|
||||||
|
|
|
@ -41,6 +41,9 @@ class DumpJsonParser {
|
||||||
bool NeedDump(const std::string &op_full_name) const;
|
bool NeedDump(const std::string &op_full_name) const;
|
||||||
void MatchKernel(const std::string &kernel_name);
|
void MatchKernel(const std::string &kernel_name);
|
||||||
void PrintUnusedKernel();
|
void PrintUnusedKernel();
|
||||||
|
bool IsDumpIter(uint32_t iteration);
|
||||||
|
bool DumpAllIter();
|
||||||
|
bool IsSingleIter();
|
||||||
|
|
||||||
bool async_dump_enabled() const { return async_dump_enabled_; }
|
bool async_dump_enabled() const { return async_dump_enabled_; }
|
||||||
bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
|
bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
|
||||||
|
@ -58,6 +61,7 @@ class DumpJsonParser {
|
||||||
bool OutputNeedDump() const;
|
bool OutputNeedDump() const;
|
||||||
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
|
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
|
||||||
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
|
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
|
||||||
|
bool AsyncDumpEnabled() const { return async_dump_enabled_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DumpJsonParser() = default;
|
DumpJsonParser() = default;
|
||||||
|
@ -71,6 +75,7 @@ class DumpJsonParser {
|
||||||
std::string path_;
|
std::string path_;
|
||||||
std::string net_name_;
|
std::string net_name_;
|
||||||
uint32_t iteration_{0};
|
uint32_t iteration_{0};
|
||||||
|
std::string async_iteration_;
|
||||||
uint32_t input_output_{0};
|
uint32_t input_output_{0};
|
||||||
std::map<std::string, uint32_t> kernels_;
|
std::map<std::string, uint32_t> kernels_;
|
||||||
std::set<uint32_t> support_devices_;
|
std::set<uint32_t> support_devices_;
|
||||||
|
@ -80,7 +85,6 @@ class DumpJsonParser {
|
||||||
bool already_parsed_{false};
|
bool already_parsed_{false};
|
||||||
|
|
||||||
void ParseCommonDumpSetting(const nlohmann::json &content);
|
void ParseCommonDumpSetting(const nlohmann::json &content);
|
||||||
void ParseAsyncDumpSetting(const nlohmann::json &content);
|
|
||||||
void ParseE2eDumpSetting(const nlohmann::json &content);
|
void ParseE2eDumpSetting(const nlohmann::json &content);
|
||||||
bool IsDumpEnabled();
|
bool IsDumpEnabled();
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
|
|
||||||
#include "debug/data_dump/dump_json_parser.h"
|
#include "debug/data_dump/dump_json_parser.h"
|
||||||
#include "common/trans.h"
|
#include "common/trans.h"
|
||||||
|
#include "debug/common.h"
|
||||||
#include "backend/session/anf_runtime_algorithm.h"
|
#include "backend/session/anf_runtime_algorithm.h"
|
||||||
#include "utils/ms_context.h"
|
#include "utils/ms_context.h"
|
||||||
#include "runtime/device/kernel_runtime_manager.h"
|
#include "runtime/device/kernel_runtime_manager.h"
|
||||||
|
@ -235,6 +236,38 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepares the dump directory layout for the current iteration of `graph`.
//
// Does nothing unless async dump is enabled and the current dump iteration is
// one of the iterations selected for dumping. Otherwise it:
//   1. creates <path>/rank_<device_id>/<net_name>/<graph_id>, and
//   2. creates a symlink <that dir>/<cur_iter> pointing at the active dump dir
//      <path>/rank_<device_id>/_/<graph_id>/0.
//
// Raises (MS_LOG(EXCEPTION)) when the directory or the symlink cannot be
// created, and (MS_EXCEPTION_IF_NULL) when `graph` is null on the dump path.
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  uint32_t cur_iter = dump_json_parser.cur_dump_iter();
  if (!dump_json_parser.AsyncDumpEnabled() || !dump_json_parser.IsDumpIter(cur_iter)) {
    return;
  }

  // `graph` is dereferenced below; fail with a clear error instead of crashing.
  MS_EXCEPTION_IF_NULL(graph);

  const std::string rank_dir = dump_json_parser.path() + "/rank_" + std::to_string(device_id);
  const std::string graph_id_str = std::to_string(graph->graph_id());

  const std::string zero_dir_dump_path = rank_dir + "/_/" + graph_id_str + "/0";
  const std::string root_cur_iter_dump_path = rank_dir + "/" + dump_json_parser.net_name() + "/" + graph_id_str;
  const std::string cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);

  MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
  MS_LOG(INFO) << "root_cur_iter_dump_path: " << root_cur_iter_dump_path;
  MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;

  // create the parent directory that will hold the per-iteration entry
  bool status = Common::CreateNotExistDirs(root_cur_iter_dump_path);
  if (!status) {
    MS_LOG(EXCEPTION) << "Failed at CreateNotExistDirs for " << root_cur_iter_dump_path;
    return;
  }

  // create symlink to active dump dir for the iteration in final dump dir
  // NOTE(review): both paths are interpolated unquoted into a shell command, so
  // a configured dump path containing spaces or shell metacharacters will break
  // (or alter) the command. Consider a direct symlink call instead of system().
  const std::string command = "ln -fs " + zero_dir_dump_path + " " + cur_iter_dump_path;
  MS_LOG(INFO) << "ln command: " << command;
  if (system(command.c_str())) {
    MS_LOG(EXCEPTION) << "failed to create symlink to active dump dir for the iteration in final dump dir.";
  }
}
|
||||||
|
|
||||||
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
|
bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||||
|
@ -245,16 +278,60 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
|
||||||
if (starting_graph_id == graph_id) {
|
if (starting_graph_id == graph_id) {
|
||||||
dump_json_parser.UpdateDumpIter();
|
dump_json_parser.UpdateDumpIter();
|
||||||
}
|
}
|
||||||
if (!dump_json_parser.GetIterDumpFlag()) {
|
|
||||||
|
if (dump_json_parser.GetIterDumpFlag()) {
|
||||||
|
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||||
|
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
||||||
|
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
|
||||||
|
|
||||||
|
DumpInput(graph, dump_path, debugger);
|
||||||
|
DumpOutput(graph, dump_path, debugger);
|
||||||
|
DumpParametersAndConst(graph, dump_path, debugger);
|
||||||
|
return true;
|
||||||
|
} else if (dump_json_parser.AsyncDumpEnabled()) {
|
||||||
|
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1;
|
||||||
|
|
||||||
|
auto zero_dir_dump_path =
|
||||||
|
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
|
||||||
|
|
||||||
|
auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
|
||||||
|
dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" +
|
||||||
|
std::to_string(prev_dump_iter);
|
||||||
|
|
||||||
|
MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
|
||||||
|
MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;
|
||||||
|
|
||||||
|
if (dump_json_parser.IsDumpIter(prev_dump_iter)) {
|
||||||
|
// remove symlink to active dump dir
|
||||||
|
std::string command = "rm -f " + cur_iter_dump_path;
|
||||||
|
MS_LOG(INFO) << "rm command: " << command;
|
||||||
|
if (system(command.c_str())) {
|
||||||
|
MS_LOG(EXCEPTION) << "failed to remove symlink to active dump dir.";
|
||||||
|
}
|
||||||
|
|
||||||
|
// create actual dir for iteration in final dump dir
|
||||||
|
bool status = Common::CreateNotExistDirs(cur_iter_dump_path);
|
||||||
|
if (!status) {
|
||||||
|
MS_LOG(EXCEPTION) << "failed at CreateNotExistDirs for " << cur_iter_dump_path;
|
||||||
|
}
|
||||||
|
|
||||||
|
// move contents from active dump dir to final dump dir
|
||||||
|
command = "mv " + zero_dir_dump_path + "/* " + cur_iter_dump_path + "/.";
|
||||||
|
MS_LOG(INFO) << "mv command: " << command;
|
||||||
|
if (system(command.c_str())) {
|
||||||
|
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// delete contents from active dump dir
|
||||||
|
std::string command = "rm -f " + zero_dir_dump_path + "/*";
|
||||||
|
MS_LOG(INFO) << "rm command: " << command;
|
||||||
|
if (system(command.c_str())) {
|
||||||
|
MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
return false;
|
||||||
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
|
||||||
std::string dump_path = GenerateDumpPath(graph_id, &device_id);
|
|
||||||
|
|
||||||
DumpInput(graph, dump_path, debugger);
|
|
||||||
DumpOutput(graph, dump_path, debugger);
|
|
||||||
DumpParametersAndConst(graph, dump_path, debugger);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
||||||
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
@ -33,6 +34,7 @@ class E2eDump {
|
||||||
public:
|
public:
|
||||||
E2eDump() = default;
|
E2eDump() = default;
|
||||||
~E2eDump() = default;
|
~E2eDump() = default;
|
||||||
|
static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id);
|
||||||
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
|
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr);
|
||||||
// Dump data when task error.
|
// Dump data when task error.
|
||||||
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||||
|
|
|
@ -141,8 +141,8 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
|
||||||
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||||
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
|
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/");
|
||||||
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
|
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
|
||||||
dump_info->set_model_name(DumpJsonParser::GetInstance().net_name());
|
dump_info->set_model_name("_");
|
||||||
dump_info->set_dump_step(std::to_string(DumpJsonParser::GetInstance().iteration()));
|
dump_info->set_dump_step("0");
|
||||||
dump_info->set_model_id(graph_id);
|
dump_info->set_model_id(graph_id);
|
||||||
dump_info->set_flag(kAicpuLoadFlag);
|
dump_info->set_flag(kAicpuLoadFlag);
|
||||||
|
|
||||||
|
|
|
@ -3,13 +3,10 @@
|
||||||
"dump_mode": 0,
|
"dump_mode": 0,
|
||||||
"path": "/test",
|
"path": "/test",
|
||||||
"net_name": "Net",
|
"net_name": "Net",
|
||||||
"iteration": 0,
|
"iteration": "0",
|
||||||
"input_output": 2,
|
"input_output": 2,
|
||||||
"kernels": ["Default/TensorAdd-op3"],
|
"kernels": ["Default/TensorAdd-op3"],
|
||||||
"support_device": [0,1,2,3,4,5,6,7]
|
"support_device": [0,1,2,3,4,5,6,7],
|
||||||
},
|
|
||||||
"async_dump_settings": {
|
|
||||||
"enable": true,
|
|
||||||
"op_debug_mode": 0
|
"op_debug_mode": 0
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -3,10 +3,11 @@
|
||||||
"dump_mode": 0,
|
"dump_mode": 0,
|
||||||
"path": "/test",
|
"path": "/test",
|
||||||
"net_name": "Net",
|
"net_name": "Net",
|
||||||
"iteration": 0,
|
"iteration": "0",
|
||||||
"input_output": 0,
|
"input_output": 0,
|
||||||
"kernels": ["Default/Conv-op12"],
|
"kernels": ["Default/Conv-op12"],
|
||||||
"support_device": [0,1,2,3,4,5,6,7]
|
"support_device": [0,1,2,3,4,5,6,7],
|
||||||
|
"op_debug_mode": 0
|
||||||
},
|
},
|
||||||
"e2e_dump_settings": {
|
"e2e_dump_settings": {
|
||||||
"enable": true,
|
"enable": true,
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
"dump_mode": 0,
|
"dump_mode": 0,
|
||||||
"path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
|
"path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
|
||||||
"net_name": "test",
|
"net_name": "test",
|
||||||
"iteration": 0,
|
"iteration": "0",
|
||||||
"input_output": 2,
|
"input_output": 2,
|
||||||
"kernels": [
|
"kernels": [
|
||||||
"default/TensorAdd-op10",
|
"default/TensorAdd-op10",
|
||||||
|
@ -12,14 +12,7 @@
|
||||||
"Default/optimizer-Momentum/tuple_getitem-op29",
|
"Default/optimizer-Momentum/tuple_getitem-op29",
|
||||||
"Default/optimizer-Momentum/ApplyMomentum-op12"
|
"Default/optimizer-Momentum/ApplyMomentum-op12"
|
||||||
],
|
],
|
||||||
"support_device": [0,1,2,3,4,5,6,7]
|
"support_device": [0,1,2,3,4,5,6,7],
|
||||||
},
|
|
||||||
"async_dump_settings": {
|
|
||||||
"enable": true,
|
|
||||||
"op_debug_mode": 0
|
"op_debug_mode": 0
|
||||||
},
|
|
||||||
"e2e_dump_settings": {
|
|
||||||
"enable": false,
|
|
||||||
"trans_flag": false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue