forked from mindspore-Ecosystem/mindspore
!24520 Add graph run history to dump structure
Merge pull request !24520 from parastooashtari/multi_root_graph_dump
This commit is contained in:
commit
76f4f77cc2
|
@ -622,6 +622,9 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_
|
|||
debugger_->PostExecute();
|
||||
}
|
||||
#endif
|
||||
#ifndef ENABLE_SECURITY
|
||||
DumpSetup(kernel_graph);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Executes `kernel_graph` by forwarding to Execute() with the task flag set.
void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
  constexpr bool is_task = true;
  Execute(kernel_graph, is_task);
}
|
||||
|
@ -1347,11 +1350,6 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
|||
}
|
||||
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance);
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (is_task && is_task_sink) {
|
||||
DumpSetup(kernel_graph);
|
||||
}
|
||||
#endif
|
||||
bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink);
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (is_task && is_task_sink) {
|
||||
|
@ -1378,6 +1376,7 @@ void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph)
|
|||
// Dumps `kernel_graph` for this session's rank: first records the run iteration through
// E2eDump::DumpRunIter, then dumps the graph data through E2eDump::DumpData.
// Raises through MS_EXCEPTION_IF_NULL when `kernel_graph` is null.
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);

  // The run-history record takes the shared pointer, the data dump takes the raw pointer.
  auto *raw_graph = kernel_graph.get();
  E2eDump::DumpRunIter(kernel_graph, rank_id_);
  E2eDump::DumpData(raw_graph, rank_id_);

  MS_LOG(DEBUG) << "Finish!";
}
|
||||
|
|
|
@ -722,6 +722,7 @@ void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) con
|
|||
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||
if (debugger_->DebuggerBackendEnabled()) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
E2eDump::DumpRunIter(kernel_graph, rank_id_);
|
||||
E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
|
||||
} else {
|
||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <map>
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "debug/anf_ir_utils.h"
|
||||
#include "debug/common.h"
|
||||
|
||||
namespace mindspore {
|
||||
void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
|
||||
|
@ -39,6 +40,31 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
|
|||
}
|
||||
}
|
||||
|
||||
void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
if (!(json_parser.e2e_dump_enabled())) {
|
||||
return;
|
||||
}
|
||||
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
|
||||
std::string file_name_to_check =
|
||||
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
|
||||
auto real_path = Common::CreatePrefixPath(file_name_to_check);
|
||||
if (!real_path.has_value()) {
|
||||
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
|
||||
return;
|
||||
}
|
||||
std::string file_name = real_path.value();
|
||||
ChangeFileMode(file_name, S_IWUSR);
|
||||
std::ofstream fout(file_name, std::ofstream::app);
|
||||
if (!fout.is_open()) {
|
||||
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
|
||||
return;
|
||||
}
|
||||
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
|
||||
fout.close();
|
||||
ChangeFileMode(file_name, S_IRUSR);
|
||||
}
|
||||
|
||||
void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) {
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
std::string kernel_name = GetKernelNodeName(node);
|
||||
|
|
|
@ -35,6 +35,8 @@ class CPUE2eDump {
|
|||
|
||||
static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);
|
||||
|
||||
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
|
||||
|
||||
private:
|
||||
static void DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path);
|
||||
|
||||
|
|
|
@ -290,13 +290,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
|
|||
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
|
||||
return;
|
||||
}
|
||||
if (starting_graph_id == INT32_MAX) {
|
||||
// Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0).
|
||||
starting_graph_id = graph_id;
|
||||
} else {
|
||||
// In multi network scripts, dump iter is equal to the number of networks that have been run so far.
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
}
|
||||
|
||||
// In multi network scripts, dump iter is equal to the number of networks that have been executed so far.
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
}
|
||||
|
||||
void E2eDump::DumpSetup(const session::KernelGraph *graph) {
|
||||
|
@ -308,10 +304,31 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
|
|||
}
|
||||
}
|
||||
|
||||
void E2eDump::UpdateIterGPUDump() {
|
||||
if (starting_graph_id != INT32_MAX) {
|
||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
||||
void E2eDump::UpdateIterGPUDump() { DumpJsonParser::GetInstance().UpdateDumpIter(); }
|
||||
|
||||
void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
|
||||
return;
|
||||
}
|
||||
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
|
||||
std::string file_name_to_check =
|
||||
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
|
||||
auto real_path = Common::CreatePrefixPath(file_name_to_check);
|
||||
if (!real_path.has_value()) {
|
||||
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
|
||||
return;
|
||||
}
|
||||
std::string file_name = real_path.value();
|
||||
ChangeFileMode(file_name, S_IWUSR);
|
||||
std::ofstream fout(file_name, std::ofstream::app);
|
||||
if (!fout.is_open()) {
|
||||
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
|
||||
return;
|
||||
}
|
||||
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
|
||||
fout.close();
|
||||
ChangeFileMode(file_name, S_IRUSR);
|
||||
}
|
||||
|
||||
void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
|
||||
|
|
|
@ -39,6 +39,8 @@ class E2eDump {
|
|||
|
||||
static void UpdateIterGPUDump();
|
||||
|
||||
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
|
||||
|
||||
static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
|
||||
|
||||
static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <numeric>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <regex>
|
||||
#include "pybind11/embed.h"
|
||||
#include "pybind11/stl.h"
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
|
@ -33,8 +34,10 @@
|
|||
#include "debug/anf_ir_utils.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#endif
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "debug/debugger/tensor_summary.h"
|
||||
#include "utils/file_utils.h"
|
||||
#include "linux/limits.h"
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
namespace mindspore {
|
||||
#endif
|
||||
|
@ -172,23 +175,28 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
|
|||
}
|
||||
#ifdef OFFLINE_DBG_MODE
|
||||
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
|
||||
uint32_t *prev_num_elements) {
|
||||
uint32_t *prev_num_elements, bool *history_not_found) {
|
||||
MS_EXCEPTION_IF_NULL(tensor);
|
||||
const void *previous_tensor_ptr = nullptr;
|
||||
std::shared_ptr<TensorData> tensor_prev;
|
||||
if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
|
||||
std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
|
||||
if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
|
||||
*history_not_found = 1;
|
||||
MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
|
||||
} else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
|
||||
// when prev_tensor is not available, the prev iteration is set to UINT32_MAX
|
||||
// read data in offline mode
|
||||
std::vector<std::string> file_paths;
|
||||
if (!is_sync_mode_) {
|
||||
ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
|
||||
std::vector<unsigned int>{tensor->GetDeviceId()},
|
||||
std::vector<unsigned int>{tensor->GetIteration() - 1},
|
||||
std::vector<unsigned int>{tensor->GetPrevIteration()},
|
||||
std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
|
||||
}
|
||||
std::vector<std::shared_ptr<TensorData>> result_list_prev;
|
||||
ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
|
||||
std::vector<unsigned int>{tensor->GetDeviceId()},
|
||||
std::vector<unsigned int>{tensor->GetIteration() - 1},
|
||||
std::vector<unsigned int>{tensor->GetPrevIteration()},
|
||||
std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
|
||||
file_paths, &result_list_prev);
|
||||
tensor_prev = result_list_prev[0];
|
||||
|
@ -303,7 +311,7 @@ void DebugServices::ProcessCheckpointsOutofMemory(
|
|||
const std::vector<parameter_t> ¶meter_list) {
|
||||
if (no_mem_to_read) {
|
||||
// bit 3 denotes a failure to load the tensor because it is oversized and there is not enough memory to fit it in
|
||||
int32_t oversize_error_code = 8;
|
||||
int32_t oversize_error_code = ITensorSummary::OUT_OF_MEMORY;
|
||||
for (auto &wp : watchpoints_to_check) {
|
||||
SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
|
||||
chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
|
||||
|
@ -313,6 +321,18 @@ void DebugServices::ProcessCheckpointsOutofMemory(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
|
||||
// set the tensor into not-in-use status in tensor_loader.
|
||||
auto tensor_name = tensor->GetName();
|
||||
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
|
||||
std::to_string(tensor->GetRootGraphId()) + ":" +
|
||||
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
|
||||
AppendToCacheEvictQueue(key_name_in_cache);
|
||||
if (previous_tensor_ptr != nullptr) {
|
||||
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void DebugServices::CheckWatchpointsForTensor(
|
||||
|
@ -373,7 +393,8 @@ void DebugServices::CheckWatchpointsForTensor(
|
|||
uint32_t prev_num_elements = 0;
|
||||
const void *previous_tensor_ptr = nullptr;
|
||||
#ifdef OFFLINE_DBG_MODE
|
||||
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
|
||||
bool history_not_found = 0;
|
||||
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
|
||||
#else
|
||||
std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
|
||||
if (prev_tensor_data) {
|
||||
|
@ -400,6 +421,11 @@ void DebugServices::CheckWatchpointsForTensor(
|
|||
auto item = base_summary_ptr->IsWatchpointHit(wp);
|
||||
is_hit = std::get<ITensorSummary::eHitPos>(item);
|
||||
error_code = std::get<ITensorSummary::eErrorCodePos>(item);
|
||||
#ifdef OFFLINE_DBG_MODE
|
||||
if (history_not_found) {
|
||||
error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
|
||||
}
|
||||
#endif
|
||||
parameter_list = std::get<ITensorSummary::eParamListPos>(item);
|
||||
}
|
||||
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
|
||||
|
@ -413,14 +439,7 @@ void DebugServices::CheckWatchpointsForTensor(
|
|||
}
|
||||
|
||||
#ifdef OFFLINE_DBG_MODE
|
||||
// set the tensor into not-in-use status in tensor_loader.
|
||||
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
|
||||
std::to_string(tensor->GetRootGraphId()) + ":" +
|
||||
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
|
||||
AppendToCacheEvictQueue(key_name_in_cache);
|
||||
if (previous_tensor_ptr != nullptr) {
|
||||
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
|
||||
}
|
||||
SetTensorToNotInUse(tensor, previous_tensor_ptr);
|
||||
// in offline mode remove the need for the data
|
||||
tensor.reset();
|
||||
#endif
|
||||
|
@ -685,7 +704,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
|
|||
std::string real_dump_iter_dir = RealPath(dump_key);
|
||||
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
|
||||
if (d_handle == nullptr) {
|
||||
MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
|
||||
MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
|
||||
return;
|
||||
}
|
||||
struct dirent *dir = nullptr;
|
||||
|
@ -865,12 +884,153 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
|
|||
tensor_data->SetType("");
|
||||
tensor_data->SetShape(shape);
|
||||
tensor_data->SetIsOutput(output_flag);
|
||||
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
|
||||
|
||||
tensor_list->push_back(tensor_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts the numeric id encoded in a dump directory name.
//   mode == "rank":  `name` must be exactly "rank_<digits>"; returns the number after the prefix.
//   mode == "graph": `name` must be exactly "<digits>"; returns that number.
// Returns UINT32_MAX (the "not an id" marker) when the mode is unknown, the name does not have the
// expected form, or the number is UINT32_MAX or larger. Never throws: the digits are accumulated by
// hand so that an over-long numeric directory name is rejected instead of raising std::out_of_range.
uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  const std::string rank_prefix = "rank_";
  size_t digits_begin = 0;
  if (mode == "rank") {
    if (name.compare(0, rank_prefix.size(), rank_prefix) != 0) {
      return UINT32_MAX;
    }
    digits_begin = rank_prefix.size();
  } else if (mode != "graph") {
    return UINT32_MAX;
  }
  // At least one digit is required.
  if (digits_begin >= name.size()) {
    return UINT32_MAX;
  }
  uint64_t id = 0;
  for (size_t i = digits_begin; i < name.size(); ++i) {
    const char c = name[i];
    if (c < '0' || c > '9') {
      return UINT32_MAX;
    }
    // `id` is below UINT32_MAX here, so the multiply-add cannot overflow 64 bits.
    id = id * 10 + static_cast<uint64_t>(c - '0');
    if (id >= UINT32_MAX) {
      return UINT32_MAX;
    }
  }
  return static_cast<uint32_t>(id);
}
|
||||
|
||||
std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
|
||||
std::vector<uint32_t> rank_id_list;
|
||||
std::string dump_dir = GetDumpDir();
|
||||
DIR *d_handle = opendir(dump_dir.c_str());
|
||||
if (d_handle == nullptr) {
|
||||
MS_LOG(ERROR) << "Dump directory does not exist.";
|
||||
return rank_id_list;
|
||||
}
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d_handle)) != nullptr) {
|
||||
if (dir->d_type == DT_DIR) {
|
||||
std::string rank_dir_name = dir->d_name;
|
||||
if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
|
||||
rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
(void)closedir(d_handle);
|
||||
return rank_id_list;
|
||||
}
|
||||
|
||||
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
|
||||
std::string net_name = GetNetName();
|
||||
std::string dump_dir = GetDumpDir();
|
||||
for (uint32_t rank_id : rank_id_list) {
|
||||
std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
|
||||
std::string abspath = RealPath(path);
|
||||
DIR *d_handle_rank = opendir(abspath.c_str());
|
||||
if (d_handle_rank == nullptr) {
|
||||
MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
|
||||
continue;
|
||||
}
|
||||
struct dirent *direc = nullptr;
|
||||
while ((direc = readdir(d_handle_rank)) != nullptr) {
|
||||
if (direc->d_type == DT_DIR) {
|
||||
std::string graph_dir = direc->d_name;
|
||||
if (graph_dir == "." || graph_dir == "..") {
|
||||
continue;
|
||||
}
|
||||
if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
|
||||
uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
|
||||
ReadGraphsHistory(rank_id, graph_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
(void)closedir(d_handle_rank);
|
||||
}
|
||||
}
|
||||
|
||||
void DebugServices::SetGraphsHistory() {
|
||||
// extract rank_id_list
|
||||
std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
|
||||
// for each rank_id extract the graph_id list and set the dump version
|
||||
// and for each graph read the graph history file
|
||||
CheckDumpGraphIdList(rank_id_list);
|
||||
}
|
||||
|
||||
void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
|
||||
std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
|
||||
if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
|
||||
// graph history was already stored for this rank_id and graph_id
|
||||
return;
|
||||
}
|
||||
std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
|
||||
std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
|
||||
DIR *d_handle = opendir(exec_order_path.c_str());
|
||||
if (d_handle == nullptr) {
|
||||
MS_LOG(ERROR) << "Directory does not exist.";
|
||||
return;
|
||||
}
|
||||
// read file and store the info
|
||||
std::string full_path = exec_order_path + "/" + file_to_check;
|
||||
std::string checked_path = RealPath(full_path);
|
||||
if (!checked_path.empty()) {
|
||||
ReadGraphRunIter(checked_path, rank_and_graph);
|
||||
}
|
||||
(void)closedir(d_handle);
|
||||
}
|
||||
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
|
||||
for (auto w_table_item : watchpoint_table_) {
|
||||
auto wp = std::get<1>(w_table_item);
|
||||
unsigned int index = 0;
|
||||
for (auto check_node : wp.check_node_list) {
|
||||
std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
|
||||
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
|
||||
// graph represents root_graph for Ascend and kernel_graph for GPU
|
||||
for (auto rank : ranks) {
|
||||
for (auto graph : graphs) {
|
||||
std::tuple<uint32_t, uint32_t> key(rank, graph);
|
||||
(rank_and_graph_to_nodes)[key].push_back(check_node);
|
||||
}
|
||||
}
|
||||
index++;
|
||||
}
|
||||
}
|
||||
return rank_and_graph_to_nodes;
|
||||
}
|
||||
|
||||
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
|
||||
std::ifstream infile;
|
||||
std::string line;
|
||||
infile.open(file_path.c_str());
|
||||
if (!infile.is_open()) {
|
||||
MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
|
||||
const int kMaxFilenameLength = NAME_MAX;
|
||||
char err_info[kMaxFilenameLength];
|
||||
if (strerror_r(errno, err_info, sizeof(err_info)) != nullptr) {
|
||||
MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
std::vector<uint32_t> run_iters_vec;
|
||||
while (std::getline(infile, line)) {
|
||||
uint32_t iter;
|
||||
std::stringstream ss(line);
|
||||
ss >> iter;
|
||||
run_iters_vec.push_back(iter);
|
||||
}
|
||||
(void)graphs_run_history_.emplace(
|
||||
std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
|
||||
}
|
||||
|
||||
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
|
||||
const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
|
||||
const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
|
||||
|
@ -895,6 +1055,7 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
|
|||
tensor_data->SetType(type_name);
|
||||
tensor_data->SetShape(shape);
|
||||
tensor_data->SetTimeStamp(time_stamp);
|
||||
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
|
||||
if (data_size) {
|
||||
(void)tensor_loader_->LoadNewTensor(tensor_data, false);
|
||||
}
|
||||
|
@ -1089,34 +1250,19 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
|
|||
unsigned int iteration, std::vector<std::string> *const async_file_pool) {
|
||||
// get a list of nodes and the devices they are on to monitor
|
||||
std::vector<std::shared_ptr<TensorData>> tensor_list;
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
|
||||
for (auto w_table_item : watchpoint_table_) {
|
||||
auto wp = std::get<1>(w_table_item);
|
||||
unsigned int index = 0;
|
||||
for (auto check_node : wp.check_node_list) {
|
||||
std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
|
||||
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
|
||||
for (auto device : devices) {
|
||||
for (auto graph : graphs) {
|
||||
std::tuple<uint32_t, uint32_t> key(device, graph);
|
||||
device_and_graph_to_nodes[key].push_back(check_node);
|
||||
}
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
}
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
|
||||
GetAllWpNodes();
|
||||
|
||||
// scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
|
||||
// as they are found
|
||||
for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
|
||||
std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
|
||||
uint32_t device_id = std::get<0>(device_and_graph);
|
||||
uint32_t root_graph_id = std::get<1>(device_and_graph);
|
||||
std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
|
||||
for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
|
||||
std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
|
||||
uint32_t rank_id = std::get<0>(rank_and_graph);
|
||||
uint32_t root_graph_id = std::get<1>(rank_and_graph);
|
||||
std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
|
||||
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
|
||||
|
||||
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
|
||||
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
|
||||
std::to_string(root_graph_id) + "/" + IterationString(iteration);
|
||||
|
||||
// convert node names to dump style
|
||||
|
@ -1140,12 +1286,11 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
|
|||
if (is_sync_mode_) {
|
||||
// search files in dir for the one that meets the filename prefix and read the file into memory
|
||||
std::string abspath = RealPath(specific_dump_dir);
|
||||
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
|
||||
&tensor_list);
|
||||
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list);
|
||||
} else {
|
||||
// convert all files in proto_to_dump to npy and add to pool of async file names
|
||||
ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
|
||||
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
|
||||
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
|
||||
&tensor_list);
|
||||
}
|
||||
}
|
||||
|
@ -1285,6 +1430,32 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
|
|||
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
|
||||
}
|
||||
|
||||
// Returns the iteration that precedes the tensor's iteration in the run history of its
// (rank, root graph), and stores that value on the tensor via SetPrevIteration.
//
// Returns UINT32_MAX, without modifying the tensor, when
//   - no run history is stored for the tensor's (rank, root graph),
//   - the tensor's iteration does not appear in that history, or
//   - the tensor's iteration is the first entry of the history (no previous run exists).
// The history is looked up once and reused instead of re-indexing the map for every access.
uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  const uint32_t rank_id = tensor->GetDeviceId();
  const uint32_t root_graph_id = tensor->GetRootGraphId();
  const std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  const auto history_it = graphs_run_history_.find(rank_and_graph);
  if (history_it == graphs_run_history_.end()) {
    return UINT32_MAX;
  }
  const std::vector<uint32_t> &run_iters = history_it->second;
  const auto it = std::find(run_iters.begin(), run_iters.end(), tensor->GetIteration());
  if (it == run_iters.end()) {
    // The graph is not executed in that iteration
    return UINT32_MAX;
  }
  if (it == run_iters.begin()) {
    // current iteration is the first iteration that the graph was run
    // no prev iter is available
    MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
                  << " is the first run iteration for tensor: " << tensor->GetName();
    return UINT32_MAX;
  }
  const uint32_t prev_iter = *(it - 1);
  tensor->SetPrevIteration(prev_iter);
  return prev_iter;
}
|
||||
|
||||
void DebugServices::ResetLoadedTensors() {
|
||||
wp_id_cache_.clear();
|
||||
MS_LOG(INFO) << "Resetting loaded tensors";
|
||||
|
|
|
@ -292,6 +292,9 @@ class DebugServices {
|
|||
std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
|
||||
partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
|
||||
std::vector<unsigned int> *root_graph_id);
|
||||
#ifdef OFFLINE_DBG_MODE
|
||||
void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
|
||||
#endif
|
||||
|
||||
void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
|
||||
const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
|
||||
|
@ -350,7 +353,7 @@ class DebugServices {
|
|||
std::vector<std::string> *const async_file_pool);
|
||||
|
||||
const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
|
||||
uint32_t *prev_num_elements);
|
||||
uint32_t *prev_num_elements, bool *history_not_found);
|
||||
|
||||
void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
|
||||
std::size_t *const size, std::vector<int64_t> *const shape,
|
||||
|
@ -380,6 +383,18 @@ class DebugServices {
|
|||
uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
|
||||
std::vector<std::shared_ptr<TensorData>> *const tensor_list);
|
||||
|
||||
void SetGraphsHistory();
|
||||
|
||||
std::vector<uint32_t> GetDumpRankIdList();
|
||||
|
||||
void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);
|
||||
|
||||
void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);
|
||||
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();
|
||||
|
||||
void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);
|
||||
|
||||
std::string GetStrippedFilename(const std::string &file_name);
|
||||
|
||||
std::string IterationString(unsigned int iteration);
|
||||
|
@ -410,6 +425,8 @@ class DebugServices {
|
|||
|
||||
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
|
||||
|
||||
uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);
|
||||
|
||||
void ResetLoadedTensors();
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
|
||||
|
@ -458,6 +475,8 @@ class DebugServices {
|
|||
std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
|
||||
std::string net_name_;
|
||||
std::string dump_dir_;
|
||||
// store history of graphs that have been run (rank_id, graph_id)
|
||||
std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
|
||||
bool is_sync_mode_{false};
|
||||
|
||||
std::shared_ptr<TensorLoader> tensor_loader_;
|
||||
|
|
|
@ -282,7 +282,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
|
|||
if (device_target_ != kGPUDevice) {
|
||||
return;
|
||||
}
|
||||
E2eDump::UpdateIterGPUDump();
|
||||
// Store graphs that are run in one step.
|
||||
graph_ptr_step_vec_ = graphs;
|
||||
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
|
||||
|
@ -290,7 +289,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
|
|||
if (debugger_) {
|
||||
debugger_->PreExecute(graph);
|
||||
}
|
||||
DumpSetup(graph);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -390,6 +388,7 @@ uint32_t Debugger::GetRankID() {
|
|||
|
||||
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
|
||||
uint32_t rank_id = GetRankID();
|
||||
E2eDump::DumpRunIter(kernel_graph, rank_id);
|
||||
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
(void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get());
|
||||
|
@ -458,6 +457,7 @@ void Debugger::PostExecuteGraphDebugger() {
|
|||
if (debugger_) {
|
||||
debugger_->PostExecute();
|
||||
}
|
||||
E2eDump::UpdateIterGPUDump();
|
||||
}
|
||||
|
||||
void Debugger::PostExecute() {
|
||||
|
|
|
@ -62,6 +62,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du
|
|||
const uint64_t ratio_inversion = 2;
|
||||
const uint64_t memlimit = max_mem_usage * kMegabytesToBytes / ratio_inversion;
|
||||
debug_services_->SetMemLimit(memlimit);
|
||||
debug_services_->SetGraphsHistory();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -88,6 +88,7 @@ class VarianceAndMeanCalculator {
|
|||
class ITensorSummary {
|
||||
public:
|
||||
enum WatchpointPos { eHitPos = 0, eErrorCodePos = 1, eParamListPos = 2 };
|
||||
// Error codes that can accompany a watchpoint check result.
// INF_TENSOR, NULL_PREV_TENSOR, OUT_OF_MEMORY and HISTORY_NOT_FOUND are distinct single-bit values.
// NOTE(review): NAN_TENSOR = 0 is not a distinct bit, unlike the other codes — confirm whether 1 was intended.
enum ErrorCode { NAN_TENSOR = 0, INF_TENSOR = 2, NULL_PREV_TENSOR = 4, OUT_OF_MEMORY = 8, HISTORY_NOT_FOUND = 16 };
|
||||
virtual ~ITensorSummary() = default;
|
||||
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
|
||||
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
|
||||
|
|
|
@ -229,6 +229,10 @@ class TensorData {
|
|||
|
||||
// Stores `iteration` as this tensor's iteration.
void SetIteration(unsigned int iteration) { this->iteration_ = iteration; }
|
||||
|
||||
// Returns the previous-iteration value stored on this tensor (as last set by SetPrevIteration).
unsigned int GetPrevIteration() const { return this->prev_iteration_; }
|
||||
|
||||
// Stores `prev_iteration` as this tensor's previous-iteration value.
void SetPrevIteration(unsigned int prev_iteration) { this->prev_iteration_ = prev_iteration; }
|
||||
|
||||
// Returns the device id stored on this tensor.
unsigned int GetDeviceId() const { return this->device_id_; }
|
||||
|
||||
// Stores `device_id` as this tensor's device id.
void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; }
|
||||
|
@ -430,6 +434,7 @@ class TensorData {
|
|||
std::string name_;
|
||||
uint64_t slot_;
|
||||
unsigned int iteration_{0};
|
||||
unsigned int prev_iteration_{0};
|
||||
unsigned int device_id_{0};
|
||||
unsigned int root_graph_id_{0};
|
||||
bool is_output_{true};
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -90,7 +90,7 @@ class TensorLoader {
|
|||
key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) + ":" +
|
||||
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot()));
|
||||
if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
|
||||
tensor->GetIteration() == tensor_list_map_[key_name]->GetIteration() - 1) {
|
||||
tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
|
||||
key_name += ":prev";
|
||||
}
|
||||
auto iter = tensor_list_map_.find(key_name);
|
||||
|
|
|
@ -33,6 +33,9 @@
|
|||
#include "runtime/base.h"
|
||||
#include "runtime/device/ascend/ascend_stream_manager.h"
|
||||
#include "utils/shape_utils.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
constexpr auto kGradients = "Gradients";
|
||||
|
@ -1062,7 +1065,12 @@ void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr<session::K
|
|||
MS_LOG(INFO) << "Load device loop control data";
|
||||
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurLoopCountName, 0);
|
||||
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kNextLoopCountName, 0);
|
||||
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(kernel_graph_ptr->current_epoch()));
|
||||
#ifndef ENABLE_SECURITY
|
||||
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName,
|
||||
SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter()));
|
||||
#else
|
||||
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0);
|
||||
#endif
|
||||
|
||||
kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1);
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "utils/log_adapter.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/cpu_e2e_dump.h"
|
||||
#include "debug/data_dump/e2e_dump.h"
|
||||
#endif
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
#include "debug/debugger/debugger.h"
|
||||
|
@ -52,6 +53,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
|
||||
CPUE2eDump::DumpRunIter(kernel_graph);
|
||||
}
|
||||
#endif
|
||||
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
|
||||
|
|
|
@ -18,11 +18,14 @@ Utils for testing offline debugger.
|
|||
|
||||
import os
|
||||
import tempfile
|
||||
import bisect
|
||||
import csv
|
||||
import numpy as np
|
||||
|
||||
|
||||
def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
|
||||
"""Build dump file structure from tensor_list."""
|
||||
ranks_run_history = {}
|
||||
temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
|
||||
for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
|
||||
slot = str(tensor_info.slot)
|
||||
|
@ -30,6 +33,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
|
|||
rank_id = str(tensor_info.rank_id)
|
||||
root_graph_id = str(tensor_info.root_graph_id)
|
||||
is_output = str(tensor_info.is_output)
|
||||
if rank_id not in ranks_run_history:
|
||||
graphs_run_history = {}
|
||||
ranks_run_history[rank_id] = graphs_run_history
|
||||
if root_graph_id not in ranks_run_history[rank_id]:
|
||||
iter_list = []
|
||||
iter_list.append(iteration)
|
||||
graphs_run_history[root_graph_id] = iter_list
|
||||
elif iteration not in graphs_run_history[root_graph_id]:
|
||||
bisect.insort(graphs_run_history[root_graph_id], iteration)
|
||||
|
||||
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
if is_output == "True":
|
||||
|
@ -40,4 +53,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
|
|||
".DefaultFormat.npy", dir=path)
|
||||
full_path = file[1]
|
||||
np.save(full_path, tensor)
|
||||
build_global_execution_order(temp_dir, ranks_run_history)
|
||||
return temp_dir
|
||||
|
||||
|
||||
def build_global_execution_order(path, ranks_run_history):
    """Write the per-graph run history files for every rank.

    For each rank in ``ranks_run_history`` an ``execution_order`` directory is
    created under ``<path>/rank_<rank_id>``. For each root graph of that rank a
    ``ms_global_execution_order_graph_<graph>.csv`` file is written there with
    one iteration per row.

    Args:
        path (str): Root directory of the dump structure.
        ranks_run_history (dict): Maps a rank id (str) to a dict that maps a
            root graph id (str) to the list of iterations recorded for it.
    """
    for rank_id, graphs_run_history in ranks_run_history.items():
        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
        os.makedirs(exec_order_path, exist_ok=True)
        for graph, iterations in graphs_run_history.items():
            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
            with open(full_path, 'w', newline='') as csv_file:
                # writerows() treats every row as a sequence of fields, so a bare
                # string such as "10" would be split into the fields "1" and "0".
                # Wrap each iteration in a one-element row to keep it intact.
                csv.writer(csv_file).writerows([iteration] for iteration in iterations)
|
||||
|
|
|
@ -69,6 +69,35 @@ async_dump_dict_2 = {
|
|||
}
|
||||
}
|
||||
|
||||
# Dump configuration template with e2e dump enabled. "path" starts out empty
# and is expected to be filled in before the configuration is used.
e2e_dump_dict_2 = dict(
    common_dump_settings=dict(
        dump_mode=0,
        path="",
        net_name="Net",
        iteration="all",
        input_output=0,
        kernels=["Default/Conv-op12"],
        support_device=list(range(8)),
        op_debug_mode=0,
    ),
    e2e_dump_settings=dict(
        enable=True,
        trans_flag=False,
    ),
)
|
||||
|
||||
# Dump configuration template that carries only the common settings (no
# "e2e_dump_settings" section). "path" starts out empty and is expected to be
# filled in before the configuration is used.
async_dump_dict_3 = dict(
    common_dump_settings=dict(
        dump_mode=0,
        path="",
        net_name="Net",
        iteration="all",
        input_output=2,
        kernels=["Default/TensorAdd-op3"],
        support_device=list(range(8)),
        op_debug_mode=0,
    ),
)
|
||||
|
||||
def generate_dump_json(dump_path, json_file_name, test_key):
|
||||
"""
|
||||
|
@ -83,6 +112,13 @@ def generate_dump_json(dump_path, json_file_name, test_key):
|
|||
elif test_key == "test_async_dump_net_multi_layer_mode1":
|
||||
data = async_dump_dict_2
|
||||
data["common_dump_settings"]["path"] = dump_path
|
||||
elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
|
||||
data = e2e_dump_dict_2
|
||||
data["common_dump_settings"]["path"] = dump_path
|
||||
elif test_key == "test_Ascend_async_multi_root_graph_dump":
|
||||
data = async_dump_dict_3
|
||||
data["common_dump_settings"]["path"] = dump_path
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")
|
||||
|
|
|
@ -283,7 +283,7 @@ def run_e2e_dump_execution_graph():
|
|||
add = Net()
|
||||
add(Tensor(x), Tensor(y))
|
||||
exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
|
||||
assert len(os.listdir(exe_graph_path)) == 1
|
||||
assert len(os.listdir(exe_graph_path)) == 2
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,158 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
import os
|
||||
from os import path
|
||||
import tempfile
|
||||
import time
|
||||
import shutil
|
||||
import csv
|
||||
import numpy as np
|
||||
import pytest
|
||||
import mindspore.context as context
|
||||
from mindspore import Tensor
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.nn import Cell
|
||||
from dump_test_utils import generate_dump_json
|
||||
from tests.security_utils import security_off_wrap
|
||||
|
||||
|
||||
class AddNet(Cell):
    """Cell that applies the TensorAdd operator to its two inputs."""

    def __init__(self):
        super().__init__()
        self.add = P.TensorAdd()

    def construct(self, input_x, input_y):
        """Return the TensorAdd result of input_x and input_y."""
        return self.add(input_x, input_y)
|
||||
|
||||
|
||||
class NewAddNet(Cell):
    """Cell that applies the AddN operator to the list [x, y, y]."""

    def __init__(self):
        super().__init__()
        self.add = P.AddN()

    def construct(self, x, y):
        """Return the AddN result over x, y and y (y is passed twice)."""
        return self.add([x, y, y])
|
||||
|
||||
|
||||
def train_addnet(epoch):
    """Run AddNet followed by NewAddNet `epoch` times.

    Both inputs start as float32 tensors of ones with shape [2, 1, 2, 1]. In
    every round the AddNet output and the first input are fed to NewAddNet,
    the result is converted to numpy, and the first input is then replaced by
    the sum of both inputs. Nothing is returned.

    Args:
        epoch (int): Number of rounds to run.
    """
    first_net = AddNet()
    second_net = NewAddNet()
    results = []
    lhs = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
    rhs = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
    for _round in range(epoch):
        first_out = first_net(lhs, rhs)
        second_out = second_net(first_out, lhs)
        # NOTE(review): the numpy conversion is kept even though the list is
        # never read; presumably it forces the result to be materialised.
        results.append(second_out.asnumpy())
        lhs = lhs + rhs
|
||||
|
||||
|
||||
def run_multi_root_graph_dump(device, dump_mode, test_name):
    """Run dump for multi root graph script.

    Generates a dump config in a temporary directory, points
    MINDSPORE_DUMP_CONFIG at it, runs the two-network script for three epochs
    and then checks the dump directories and the per-graph run history files.

    Args:
        device (str): Value passed as ``device_target`` to ``context.set_context``.
        dump_mode (str): Used as the dump sub-directory name and as the stem of
            the generated json config file.
        test_name (str): Key forwarded to ``generate_dump_json``.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=device)

    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, dump_mode)
        dump_config_path = os.path.join(tmp_dir, dump_mode + ".json")
        generate_dump_json(dump_path, dump_config_path, test_name)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        try:
            dump_file_path = os.path.join(dump_path, 'rank_0', 'Net')
            if os.path.isdir(dump_path):
                shutil.rmtree(dump_path)
            epoch = 3
            train_addnet(epoch)
            # Give the dump up to three 2-second waits to show up on disk.
            for _ in range(3):
                if os.path.exists(dump_file_path):
                    break
                time.sleep(2)
            # Multi root graph script : we have 2 graphs under rank_0 dir
            # Each graph should have 3 iteration
            # Each graph was executed once per epoch,
            # Graph 0 was executed in even iterations, graph one was executed in odd iterations
            expected_iterations = {'0': ['0', '2', '4'], '1': ['1', '3', '5']}
            assert len(os.listdir(dump_file_path)) == 2
            for graph_id, iterations in expected_iterations.items():
                assert sorted(os.listdir(os.path.join(dump_file_path, graph_id))) == iterations

            execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
            # Four files in execution_order dir.
            # Two files for each graph (ms_execution_order and ms_global_execution_order)
            assert len(os.listdir(execution_order_path)) == 4
            for graph_id, iterations in expected_iterations.items():
                history_file = os.path.join(execution_order_path,
                                            'ms_global_execution_order_graph_' + graph_id + '.csv')
                assert path.exists(history_file)
                with open(history_file) as csvfile:
                    recorded = list(csv.reader(csvfile))
                # The history file holds one iteration per row.
                assert recorded == [[iteration] for iteration in iterations]
        finally:
            # Always drop the dump config so it cannot leak into other tests
            # running in the same process, even when an assertion above fails.
            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_GPU_e2e_multi_root_graph_dump():
    """
    Feature:
        E2e dump of a script with multiple root graphs on GPU.
    Description:
        Runs the multi root graph dump check with the GPU device target and the e2e dump mode.
    Expectation:
        Two graphs are dumped: graph 0 holds the even iterations and graph 1 the odd iterations.
    """
    run_multi_root_graph_dump(device="GPU",
                              dump_mode="e2e_dump",
                              test_name="test_GPU_e2e_multi_root_graph_dump")
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_e2e_multi_root_graph_dump():
    """
    Feature:
        E2e dump of a script with multiple root graphs on Ascend.
    Description:
        Runs the multi root graph dump check with the Ascend device target and the e2e dump mode.
    Expectation:
        Two graphs are dumped: graph 0 holds the even iterations and graph 1 the odd iterations.
    """
    run_multi_root_graph_dump(device="Ascend",
                              dump_mode="e2e_dump",
                              test_name="test_Ascend_e2e_multi_root_graph_dump")
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_async_multi_root_graph_dump():
    """
    Feature:
        Async dump of a script with multiple root graphs on Ascend.
    Description:
        Runs the multi root graph dump check with the Ascend device target and the async dump mode.
    Expectation:
        Two graphs are dumped: graph 0 holds the even iterations and graph 1 the odd iterations.
    """
    run_multi_root_graph_dump(device="Ascend",
                              dump_mode="async_dump",
                              test_name="test_Ascend_async_multi_root_graph_dump")
|
|
@ -18,11 +18,14 @@ Utils for testing offline debugger.
|
|||
|
||||
import os
|
||||
import tempfile
|
||||
import bisect
|
||||
import csv
|
||||
import numpy as np
|
||||
|
||||
|
||||
def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
|
||||
"""Build dump file structure from tensor_list."""
|
||||
ranks_run_history = {}
|
||||
temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
|
||||
for x, _ in enumerate(tensor_info_list):
|
||||
slot = str(tensor_info_list[x].slot)
|
||||
|
@ -30,6 +33,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
|
|||
rank_id = str(tensor_info_list[x].rank_id)
|
||||
root_graph_id = str(tensor_info_list[x].root_graph_id)
|
||||
is_output = str(tensor_info_list[x].is_output)
|
||||
if rank_id not in ranks_run_history:
|
||||
graphs_run_history = {}
|
||||
ranks_run_history[rank_id] = graphs_run_history
|
||||
if root_graph_id not in ranks_run_history[rank_id]:
|
||||
iter_list = []
|
||||
iter_list.append(iteration)
|
||||
graphs_run_history[root_graph_id] = iter_list
|
||||
elif iteration not in graphs_run_history[root_graph_id]:
|
||||
bisect.insort(graphs_run_history[root_graph_id], iteration)
|
||||
|
||||
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
if is_output == "True":
|
||||
|
@ -40,4 +53,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
|
|||
".DefaultFormat.npy", dir=path)
|
||||
full_path = file[1]
|
||||
np.save(full_path, tensor_list[x])
|
||||
build_global_execution_order(temp_dir, ranks_run_history)
|
||||
return temp_dir
|
||||
|
||||
|
||||
def build_global_execution_order(path, ranks_run_history):
    """Write the per-graph run history files for every rank.

    For each rank in ``ranks_run_history`` an ``execution_order`` directory is
    created under ``<path>/rank_<rank_id>``. For each root graph of that rank a
    ``ms_global_execution_order_graph_<graph>.csv`` file is written there with
    one iteration per row.

    Args:
        path (str): Root directory of the dump structure.
        ranks_run_history (dict): Maps a rank id (str) to a dict that maps a
            root graph id (str) to the list of iterations recorded for it.
    """
    for rank_id, graphs_run_history in ranks_run_history.items():
        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
        os.makedirs(exec_order_path, exist_ok=True)
        for graph, iterations in graphs_run_history.items():
            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
            with open(full_path, 'w', newline='') as csv_file:
                # writerows() treats every row as a sequence of fields, so a bare
                # string such as "10" would be split into the fields "1" and "0".
                # Wrap each iteration in a one-element row to keep it intact.
                csv.writer(csv_file).writerows([iteration] for iteration in iterations)
|
||||
|
|
Loading…
Reference in New Issue