!24520 Add graph run history to dump structure

Merge pull request !24520 from parastooashtari/multi_root_graph_dump
i-robot 2021-11-04 12:55:01 +00:00 committed by Gitee
commit 76f4f77cc2
20 changed files with 561 additions and 63 deletions
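The change, in one place: every time a root graph is dumped, the current global dump iteration is appended to a per-graph history file under rank_<id>/execution_order/, which the offline debugger later reads back to learn in which global iterations each graph ran. A minimal Python sketch of that file format, assuming the path layout shown in the DumpRunIter code below (the append_run_iter helper is illustrative, not part of the commit):

```python
import os
import tempfile

def append_run_iter(dump_path, rank_id, graph_id, cur_dump_iter):
    """Append the current global dump iteration to the graph's run-history file.

    Mirrors what E2eDump::DumpRunIter / CPUE2eDump::DumpRunIter do in this
    commit: one iteration number per line, appended on every run of the graph.
    """
    exec_order_dir = os.path.join(dump_path, "rank_%d" % rank_id, "execution_order")
    os.makedirs(exec_order_dir, exist_ok=True)
    history_file = os.path.join(
        exec_order_dir, "ms_global_execution_order_graph_%d.csv" % graph_id)
    with open(history_file, "a") as fout:
        fout.write("%d\n" % cur_dump_iter)

# Demo: root graph 0 of rank 0 ran in global iterations 0, 2 and 4.
demo_dir = tempfile.mkdtemp()
for it in (0, 2, 4):
    append_run_iter(demo_dir, 0, 0, it)
```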

View File

@ -622,6 +622,9 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_
debugger_->PostExecute();
}
#endif
#ifndef ENABLE_SECURITY
DumpSetup(kernel_graph);
#endif
}
void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }
@ -1347,11 +1350,6 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
}
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
#ifndef ENABLE_SECURITY
if (is_task && is_task_sink) {
DumpSetup(kernel_graph);
}
#endif
bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink);
#ifndef ENABLE_SECURITY
if (is_task && is_task_sink) {
@ -1378,6 +1376,7 @@ void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph)
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpRunIter(kernel_graph, rank_id_);
E2eDump::DumpData(kernel_graph.get(), rank_id_);
MS_LOG(DEBUG) << "Finish!";
}

View File

@ -722,6 +722,7 @@ void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) con
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpRunIter(kernel_graph, rank_id_);
E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();

View File

@ -18,6 +18,7 @@
#include <map>
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/anf_ir_utils.h"
#include "debug/common.h"
namespace mindspore {
void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
@ -39,6 +40,31 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
}
}
void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.e2e_dump_enabled())) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_name_to_check =
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
auto real_path = Common::CreatePrefixPath(file_name_to_check);
if (!real_path.has_value()) {
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
return;
}
std::string file_name = real_path.value();
ChangeFileMode(file_name, S_IWUSR);
std::ofstream fout(file_name, std::ofstream::app);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
return;
}
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
fout.close();
ChangeFileMode(file_name, S_IRUSR);
}
void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) {
MS_EXCEPTION_IF_NULL(node);
std::string kernel_name = GetKernelNodeName(node);

View File

@ -35,6 +35,8 @@ class CPUE2eDump {
static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
private:
static void DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path);

View File

@ -290,13 +290,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
return;
}
if (starting_graph_id == INT32_MAX) {
// Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0).
starting_graph_id = graph_id;
} else {
// In multi network scripts, dump iter is equal to the number of networks that have been run so far.
dump_json_parser.UpdateDumpIter();
}
// In multi network scripts, dump iter is equal to the number of networks that have been executed so far.
dump_json_parser.UpdateDumpIter();
}
void E2eDump::DumpSetup(const session::KernelGraph *graph) {
@ -308,10 +304,31 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
}
}
void E2eDump::UpdateIterGPUDump() {
if (starting_graph_id != INT32_MAX) {
DumpJsonParser::GetInstance().UpdateDumpIter();
void E2eDump::UpdateIterGPUDump() { DumpJsonParser::GetInstance().UpdateDumpIter(); }
void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_name_to_check =
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
auto real_path = Common::CreatePrefixPath(file_name_to_check);
if (!real_path.has_value()) {
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
return;
}
std::string file_name = real_path.value();
ChangeFileMode(file_name, S_IWUSR);
std::ofstream fout(file_name, std::ofstream::app);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
return;
}
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
fout.close();
ChangeFileMode(file_name, S_IRUSR);
}
void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
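With the starting_graph_id special case removed, the global dump iteration now advances after every root-graph execution, so in a multi-graph script each graph owns only a subset of the iteration numbers. A worked sketch of that bookkeeping, matching the expectations of the new test added at the end of this change (the simulation and variable names are illustrative, not MindSpore API):

```python
from collections import defaultdict

cur_dump_iter = 0                  # stands in for DumpJsonParser::cur_dump_iter()
run_history = defaultdict(list)    # graph_id -> global iterations it ran in

for _epoch in range(3):            # the new test runs 2 root graphs for 3 epochs
    for graph_id in (0, 1):        # assumed mapping: AddNet -> graph 0, NewAddNet -> graph 1
        run_history[graph_id].append(cur_dump_iter)  # DumpRunIter records this value
        cur_dump_iter += 1         # UpdateDumpIter advances after each graph run

assert run_history[0] == [0, 2, 4]
assert run_history[1] == [1, 3, 5]
```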

View File

@ -39,6 +39,8 @@ class E2eDump {
static void UpdateIterGPUDump();
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);

View File

@ -25,6 +25,7 @@
#include <numeric>
#include <unordered_set>
#include <utility>
#include <regex>
#include "pybind11/embed.h"
#include "pybind11/stl.h"
#ifdef ONLINE_DBG_MODE
@ -33,8 +34,10 @@
#include "debug/anf_ir_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
#endif
#include "nlohmann/json.hpp"
#include "debug/debugger/tensor_summary.h"
#include "utils/file_utils.h"
#include "linux/limits.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
@ -172,23 +175,28 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
}
#ifdef OFFLINE_DBG_MODE
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements) {
uint32_t *prev_num_elements, bool *history_not_found) {
MS_EXCEPTION_IF_NULL(tensor);
const void *previous_tensor_ptr = nullptr;
std::shared_ptr<TensorData> tensor_prev;
if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
*history_not_found = 1;
MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
} else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
// when prev_tensor is not available, the prev iteration is set to UINT32_MAX
// read data in offline mode
std::vector<std::string> file_paths;
if (!is_sync_mode_) {
ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration() - 1},
std::vector<unsigned int>{tensor->GetPrevIteration()},
std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
}
std::vector<std::shared_ptr<TensorData>> result_list_prev;
ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration() - 1},
std::vector<unsigned int>{tensor->GetPrevIteration()},
std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
file_paths, &result_list_prev);
tensor_prev = result_list_prev[0];
@ -303,7 +311,7 @@ void DebugServices::ProcessCheckpointsOutofMemory(
const std::vector<parameter_t> &parameter_list) {
if (no_mem_to_read) {
// bit 3 denotes that loading the tensor failed because the tensor is oversized and there is not enough memory to fit it in
int32_t oversize_error_code = 8;
int32_t oversize_error_code = ITensorSummary::OUT_OF_MEMORY;
for (auto &wp : watchpoints_to_check) {
SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
@ -313,6 +321,18 @@ void DebugServices::ProcessCheckpointsOutofMemory(
}
}
}
void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
// set the tensor into not-in-use status in tensor_loader.
auto tensor_name = tensor->GetName();
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
AppendToCacheEvictQueue(key_name_in_cache);
if (previous_tensor_ptr != nullptr) {
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
}
}
#endif
void DebugServices::CheckWatchpointsForTensor(
@ -373,7 +393,8 @@ void DebugServices::CheckWatchpointsForTensor(
uint32_t prev_num_elements = 0;
const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
bool history_not_found = 0;
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
if (prev_tensor_data) {
@ -400,6 +421,11 @@ void DebugServices::CheckWatchpointsForTensor(
auto item = base_summary_ptr->IsWatchpointHit(wp);
is_hit = std::get<ITensorSummary::eHitPos>(item);
error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
if (history_not_found) {
error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
}
#endif
parameter_list = std::get<ITensorSummary::eParamListPos>(item);
}
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
@ -413,14 +439,7 @@ void DebugServices::CheckWatchpointsForTensor(
}
#ifdef OFFLINE_DBG_MODE
// set the tensor into not-in-use status in tensor_loader.
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
AppendToCacheEvictQueue(key_name_in_cache);
if (previous_tensor_ptr != nullptr) {
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
}
SetTensorToNotInUse(tensor, previous_tensor_ptr);
// in offline mode remove the need for the data
tensor.reset();
#endif
@ -685,7 +704,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
std::string real_dump_iter_dir = RealPath(dump_key);
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
return;
}
struct dirent *dir = nullptr;
@ -865,12 +884,153 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
tensor_data->SetType("");
tensor_data->SetShape(shape);
tensor_data->SetIsOutput(output_flag);
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
tensor_list->push_back(tensor_data);
}
}
}
uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
std::regex re;
if (mode == "rank") {
re = "^rank_([0-9]+)$";
} else if (mode == "graph") {
re = "^([0-9]+)$";
}
std::smatch tokens;
if (regex_match(name, tokens, re)) {
return std::stoi(tokens[1]);
} else {
return UINT32_MAX;
}
}
std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
std::vector<uint32_t> rank_id_list;
std::string dump_dir = GetDumpDir();
DIR *d_handle = opendir(dump_dir.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Dump directory does not exist.";
return rank_id_list;
}
struct dirent *dir = nullptr;
while ((dir = readdir(d_handle)) != nullptr) {
if (dir->d_type == DT_DIR) {
std::string rank_dir_name = dir->d_name;
if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
}
}
}
(void)closedir(d_handle);
return rank_id_list;
}
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
std::string net_name = GetNetName();
std::string dump_dir = GetDumpDir();
for (uint32_t rank_id : rank_id_list) {
std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
std::string abspath = RealPath(path);
DIR *d_handle_rank = opendir(abspath.c_str());
if (d_handle_rank == nullptr) {
MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
continue;
}
struct dirent *direc = nullptr;
while ((direc = readdir(d_handle_rank)) != nullptr) {
if (direc->d_type == DT_DIR) {
std::string graph_dir = direc->d_name;
if (graph_dir == "." || graph_dir == "..") {
continue;
}
if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
ReadGraphsHistory(rank_id, graph_id);
}
}
}
(void)closedir(d_handle_rank);
}
}
void DebugServices::SetGraphsHistory() {
// extract rank_id_list
std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
// for each rank_id extract the graph_id list and set the dump version
// and for each graph read the graph history file
CheckDumpGraphIdList(rank_id_list);
}
void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
// graph history was already stored for this rank_id and graph_id
return;
}
std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
DIR *d_handle = opendir(exec_order_path.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exist.";
return;
}
// read file and store the info
std::string full_path = exec_order_path + "/" + file_to_check;
std::string checked_path = RealPath(full_path);
if (!checked_path.empty()) {
ReadGraphRunIter(checked_path, rank_and_graph);
}
(void)closedir(d_handle);
}
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table_) {
auto wp = std::get<1>(w_table_item);
unsigned int index = 0;
for (auto check_node : wp.check_node_list) {
std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
// graph represents root_graph for Ascend and kernel_graph for GPU
for (auto rank : ranks) {
for (auto graph : graphs) {
std::tuple<uint32_t, uint32_t> key(rank, graph);
(rank_and_graph_to_nodes)[key].push_back(check_node);
}
}
index++;
}
}
return rank_and_graph_to_nodes;
}
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
std::ifstream infile;
std::string line;
infile.open(file_path.c_str());
if (!infile.is_open()) {
MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
const int kMaxFilenameLength = NAME_MAX;
char err_info[kMaxFilenameLength];
if (strerror_r(errno, err_info, sizeof(err_info)) != nullptr) {
MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
}
return;
}
std::vector<uint32_t> run_iters_vec;
while (std::getline(infile, line)) {
uint32_t iter;
std::stringstream ss(line);
ss >> iter;
run_iters_vec.push_back(iter);
}
(void)graphs_run_history_.emplace(
std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
}
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
@ -895,6 +1055,7 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
tensor_data->SetType(type_name);
tensor_data->SetShape(shape);
tensor_data->SetTimeStamp(time_stamp);
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
if (data_size) {
(void)tensor_loader_->LoadNewTensor(tensor_data, false);
}
@ -1089,34 +1250,19 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
unsigned int iteration, std::vector<std::string> *const async_file_pool) {
// get a list of nodes and the devices they are on to monitor
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table_) {
auto wp = std::get<1>(w_table_item);
unsigned int index = 0;
for (auto check_node : wp.check_node_list) {
std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
for (auto device : devices) {
for (auto graph : graphs) {
std::tuple<uint32_t, uint32_t> key(device, graph);
device_and_graph_to_nodes[key].push_back(check_node);
}
}
index++;
}
}
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
GetAllWpNodes();
// scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
// as they are found
for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
uint32_t device_id = std::get<0>(device_and_graph);
uint32_t root_graph_id = std::get<1>(device_and_graph);
std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
uint32_t rank_id = std::get<0>(rank_and_graph);
uint32_t root_graph_id = std::get<1>(rank_and_graph);
std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration);
// convert node names to dump style
@ -1140,12 +1286,11 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
if (is_sync_mode_) {
// search files in dir for the one that meets the filename prefix and read the file into memory
std::string abspath = RealPath(specific_dump_dir);
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
&tensor_list);
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list);
} else {
// convert all files in proto_to_dump to npy and add to pool of async file names
ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
&tensor_list);
}
}
@ -1285,6 +1430,32 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
uint32_t prev_iter;
uint32_t rank_id = tensor->GetDeviceId();
uint32_t root_graph_id = tensor->GetRootGraphId();
std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
return UINT32_MAX;
}
auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
tensor->GetIteration());
if (it == graphs_run_history_[rank_and_graph].end()) {
// The graph was not executed in that iteration
return UINT32_MAX;
} else if (it == graphs_run_history_[rank_and_graph].begin()) {
// current iteration is the first iteration that the graph was run
// no prev iter is available
MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
<< " is the first run iteration for tensor: " << tensor->GetName();
return UINT32_MAX;
}
it--;
prev_iter = *it;
tensor->SetPrevIteration(prev_iter);
return prev_iter;
}
void DebugServices::ResetLoadedTensors() {
wp_id_cache_.clear();
MS_LOG(INFO) << "Resetting loaded tensors";
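GetPrevIteration looks up the tensor's iteration in the stored run history for its (rank, root graph) pair and returns the entry just before it, or UINT32_MAX when the graph did not run in that iteration or ran for the first time. A minimal Python stand-in for that lookup (names are illustrative):

```python
UINT32_MAX = 2**32 - 1

def get_prev_iteration(run_history, iteration):
    """run_history: sorted list of global iterations in which the graph ran."""
    if iteration not in run_history:
        return UINT32_MAX      # the graph was not executed in that iteration
    idx = run_history.index(iteration)
    if idx == 0:
        return UINT32_MAX      # first run of the graph: no previous iteration
    return run_history[idx - 1]

assert get_prev_iteration([1, 3, 5], 5) == 3
assert get_prev_iteration([1, 3, 5], 1) == UINT32_MAX
```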

View File

@ -292,6 +292,9 @@ class DebugServices {
std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
std::vector<unsigned int> *root_graph_id);
#ifdef OFFLINE_DBG_MODE
void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
#endif
void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
@ -350,7 +353,7 @@ class DebugServices {
std::vector<std::string> *const async_file_pool);
const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements);
uint32_t *prev_num_elements, bool *history_not_found);
void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
std::size_t *const size, std::vector<int64_t> *const shape,
@ -380,6 +383,18 @@ class DebugServices {
uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
std::vector<std::shared_ptr<TensorData>> *const tensor_list);
void SetGraphsHistory();
std::vector<uint32_t> GetDumpRankIdList();
void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);
void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();
void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);
std::string GetStrippedFilename(const std::string &file_name);
std::string IterationString(unsigned int iteration);
@ -410,6 +425,8 @@ class DebugServices {
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);
void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
@ -458,6 +475,8 @@ class DebugServices {
std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
std::string net_name_;
std::string dump_dir_;
// store history of graphs that have been run (rank_id, graph_id)
std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
bool is_sync_mode_{false};
std::shared_ptr<TensorLoader> tensor_loader_;

View File

@ -282,7 +282,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
if (device_target_ != kGPUDevice) {
return;
}
E2eDump::UpdateIterGPUDump();
// Store graphs that are run in one step.
graph_ptr_step_vec_ = graphs;
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
@ -290,7 +289,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
if (debugger_) {
debugger_->PreExecute(graph);
}
DumpSetup(graph);
}
}
@ -390,6 +388,7 @@ uint32_t Debugger::GetRankID() {
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
uint32_t rank_id = GetRankID();
E2eDump::DumpRunIter(kernel_graph, rank_id);
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
(void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get());
@ -458,6 +457,7 @@ void Debugger::PostExecuteGraphDebugger() {
if (debugger_) {
debugger_->PostExecute();
}
E2eDump::UpdateIterGPUDump();
}
void Debugger::PostExecute() {

View File

@ -62,6 +62,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du
const uint64_t ratio_inversion = 2;
const uint64_t memlimit = max_mem_usage * kMegabytesToBytes / ratio_inversion;
debug_services_->SetMemLimit(memlimit);
debug_services_->SetGraphsHistory();
return 0;
}

View File

@ -88,6 +88,7 @@ class VarianceAndMeanCalculator {
class ITensorSummary {
public:
enum WatchpointPos { eHitPos = 0, eErrorCodePos = 1, eParamListPos = 2 };
enum ErrorCode { NAN_TENSOR = 0, INF_TENSOR = 2, NULL_PREV_TENSOR = 4, OUT_OF_MEMORY = 8, HISTORY_NOT_FOUND = 16 };
virtual ~ITensorSummary() = default;
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
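The watchpoint error code now uses the named enum values above instead of the magic number 8; HISTORY_NOT_FOUND (16) is the new code set when a graph's run history is missing. A small sketch of how a client might interpret the two values this commit touches, assuming the flag-style reading suggested by the "bit 3 denotes ..." comment in the DebugServices change above (the helper is illustrative):

```python
OUT_OF_MEMORY = 8        # bit 3: tensor too large, not enough memory to load it
HISTORY_NOT_FOUND = 16   # bit 4: no graph run history for this rank/graph

def describe_error_code(error_code):
    reasons = []
    if error_code & OUT_OF_MEMORY:
        reasons.append("tensor oversized, not enough memory to load it")
    if error_code & HISTORY_NOT_FOUND:
        reasons.append("graph run history not available")
    return reasons

assert describe_error_code(HISTORY_NOT_FOUND) == ["graph run history not available"]
```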

View File

@ -229,6 +229,10 @@ class TensorData {
void SetIteration(unsigned int iteration) { this->iteration_ = iteration; }
unsigned int GetPrevIteration() const { return this->prev_iteration_; }
void SetPrevIteration(unsigned int prev_iteration) { this->prev_iteration_ = prev_iteration; }
unsigned int GetDeviceId() const { return this->device_id_; }
void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; }
@ -430,6 +434,7 @@ class TensorData {
std::string name_;
uint64_t slot_;
unsigned int iteration_{0};
unsigned int prev_iteration_{0};
unsigned int device_id_{0};
unsigned int root_graph_id_{0};
bool is_output_{true};

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -90,7 +90,7 @@ class TensorLoader {
key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot()));
if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
tensor->GetIteration() == tensor_list_map_[key_name]->GetIteration() - 1) {
tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
key_name += ":prev";
}
auto iter = tensor_list_map_.find(key_name);
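The ":prev" cache entry is now keyed off the tensor's stored previous iteration rather than iteration - 1, since with multiple root graphs the same graph's consecutive runs are not consecutive global iterations. A sketch of the cache key format used here and in SetTensorToNotInUse, with the field order taken from the C++ above (the Python helper is illustrative):

```python
def tensor_cache_key(name, device_id, root_graph_id, is_output, slot, is_prev=False):
    # name:device_id:root_graph_id:is_output:slot[:prev]
    key = ":".join([name, str(device_id), str(root_graph_id),
                    str(int(is_output)), str(slot)])
    return key + ":prev" if is_prev else key

# With multi-root-graph dumps, the "previous" copy of a tensor is the one from
# GetPrevIteration() (e.g. iteration 3 for a graph that ran in 1, 3, 5),
# not simply iteration - 1, hence the switch to GetPrevIteration() above.
assert tensor_cache_key("Default/Conv-op12", 0, 1, True, 0) == "Default/Conv-op12:0:1:1:0"
```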

View File

@ -33,6 +33,9 @@
#include "runtime/base.h"
#include "runtime/device/ascend/ascend_stream_manager.h"
#include "utils/shape_utils.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
namespace {
constexpr auto kGradients = "Gradients";
@ -1062,7 +1065,12 @@ void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr<session::K
MS_LOG(INFO) << "Load device loop control data";
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurLoopCountName, 0);
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kNextLoopCountName, 0);
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(kernel_graph_ptr->current_epoch()));
#ifndef ENABLE_SECURITY
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName,
SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter()));
#else
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0);
#endif
kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1);
}

View File

@ -23,6 +23,7 @@
#include "utils/log_adapter.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/cpu_e2e_dump.h"
#include "debug/data_dump/e2e_dump.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
@ -52,6 +53,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
MS_EXCEPTION_IF_NULL(kernel_graph);
CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
CPUE2eDump::DumpRunIter(kernel_graph);
}
#endif
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {

View File

@ -18,11 +18,14 @@ Utils for testing offline debugger.
import os
import tempfile
import bisect
import csv
import numpy as np
def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
"""Build dump file structure from tensor_list."""
ranks_run_history = {}
temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
slot = str(tensor_info.slot)
@ -30,6 +33,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
rank_id = str(tensor_info.rank_id)
root_graph_id = str(tensor_info.root_graph_id)
is_output = str(tensor_info.is_output)
if rank_id not in ranks_run_history:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
@ -40,4 +53,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
".DefaultFormat.npy", dir=path)
full_path = file[1]
np.save(full_path, tensor)
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)
for graph in ranks_run_history[rank_id].keys():
full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
with open(full_path, 'w+', newline='') as csv_file:
write = csv.writer(csv_file)
write.writerows(ranks_run_history[rank_id][graph])

View File

@ -69,6 +69,35 @@ async_dump_dict_2 = {
}
}
e2e_dump_dict_2 = {
"common_dump_settings": {
"dump_mode": 0,
"path": "",
"net_name": "Net",
"iteration": "all",
"input_output": 0,
"kernels": ["Default/Conv-op12"],
"support_device": [0, 1, 2, 3, 4, 5, 6, 7],
"op_debug_mode": 0
},
"e2e_dump_settings": {
"enable": True,
"trans_flag": False
}
}
async_dump_dict_3 = {
"common_dump_settings": {
"dump_mode": 0,
"path": "",
"net_name": "Net",
"iteration": "all",
"input_output": 2,
"kernels": ["Default/TensorAdd-op3"],
"support_device": [0, 1, 2, 3, 4, 5, 6, 7],
"op_debug_mode": 0
}
}
def generate_dump_json(dump_path, json_file_name, test_key):
"""
@ -83,6 +112,13 @@ def generate_dump_json(dump_path, json_file_name, test_key):
elif test_key == "test_async_dump_net_multi_layer_mode1":
data = async_dump_dict_2
data["common_dump_settings"]["path"] = dump_path
elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
data = e2e_dump_dict_2
data["common_dump_settings"]["path"] = dump_path
elif test_key == "test_Ascend_async_multi_root_graph_dump":
data = async_dump_dict_3
data["common_dump_settings"]["path"] = dump_path
else:
raise ValueError(
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")

View File

@ -283,7 +283,7 @@ def run_e2e_dump_execution_graph():
add = Net()
add(Tensor(x), Tensor(y))
exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
assert len(os.listdir(exe_graph_path)) == 1
assert len(os.listdir(exe_graph_path)) == 2
del os.environ['MINDSPORE_DUMP_CONFIG']

View File

@ -0,0 +1,158 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
from os import path
import tempfile
import time
import shutil
import csv
import numpy as np
import pytest
import mindspore.context as context
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.nn import Cell
from dump_test_utils import generate_dump_json
from tests.security_utils import security_off_wrap
class AddNet(Cell):
def __init__(self):
super(AddNet, self).__init__()
self.add = P.TensorAdd()
def construct(self, input_x, input_y):
output_z = self.add(input_x, input_y)
return output_z
class NewAddNet(Cell):
def __init__(self):
super(NewAddNet, self).__init__()
self.add = P.AddN()
def construct(self, x, y):
z = self.add([x, y, y])
return z
def train_addnet(epoch):
net = AddNet()
net2 = NewAddNet()
output_list = []
input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
for _ in range(epoch):
out_put = net(input_x, input_y)
out2 = net2(out_put, input_x)
output_list.append(out2.asnumpy())
input_x = input_x + input_y
def run_multi_root_graph_dump(device, dump_mode, test_name):
"""Run dump for multi root graph script."""
context.set_context(mode=context.GRAPH_MODE, device_target=device)
with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
dump_path = os.path.join(tmp_dir, dump_mode)
dump_config_path = os.path.join(tmp_dir, dump_mode + ".json")
generate_dump_json(dump_path, dump_config_path, test_name)
os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
dump_file_path = os.path.join(dump_path, 'rank_0', 'Net')
if os.path.isdir(dump_path):
shutil.rmtree(dump_path)
epoch = 3
train_addnet(epoch)
for _ in range(3):
if not os.path.exists(dump_file_path):
time.sleep(2)
# Multi root graph script: we have 2 graphs under the rank_0 dir.
# Each graph should have 3 iterations.
# Each graph was executed once per epoch;
# graph 0 was executed in even iterations, graph 1 in odd iterations.
assert len(os.listdir(dump_file_path)) == 2
dump_path_graph_0 = os.path.join(dump_file_path, '0')
dump_path_graph_1 = os.path.join(dump_file_path, '1')
assert sorted(os.listdir(dump_path_graph_0)) == ['0', '2', '4']
assert sorted(os.listdir(dump_path_graph_1)) == ['1', '3', '5']
execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
# Four files in execution_order dir.
# Two files for each graph (ms_execution_order and ms_global_execution_order)
assert len(os.listdir(execution_order_path)) == 4
global_exec_order_graph_0 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_0.csv')
assert path.exists(global_exec_order_graph_0)
with open(global_exec_order_graph_0) as csvfile:
history_graph_0 = csv.reader(csvfile)
iter_list_graph_0 = list(history_graph_0)
assert iter_list_graph_0 == [['0'], ['2'], ['4']]
global_exec_order_graph_1 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_1.csv')
assert path.exists(global_exec_order_graph_1)
with open(global_exec_order_graph_1) as csvfile:
history_graph_1 = csv.reader(csvfile)
iter_list_graph_1 = list(history_graph_1)
assert iter_list_graph_1 == [['1'], ['3'], ['5']]
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_GPU_e2e_multi_root_graph_dump():
"""
Feature:
Multi root graph e2e dump for GPU.
Description:
Test multi root graph e2e dump GPU.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("GPU", "e2e_dump", "test_GPU_e2e_multi_root_graph_dump")
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_e2e_multi_root_graph_dump():
"""
Feature:
Multi root graph e2e dump for Ascend.
Description:
Test multi root graph e2e dump Ascend.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("Ascend", "e2e_dump", "test_Ascend_e2e_multi_root_graph_dump")
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_async_multi_root_graph_dump():
"""
Feature:
Multi root graph async dump for Ascend.
Description:
Test multi root graph async dump Ascend.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump")

View File

@ -18,11 +18,14 @@ Utils for testing offline debugger.
import os
import tempfile
import bisect
import csv
import numpy as np
def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
"""Build dump file structure from tensor_list."""
ranks_run_history = {}
temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
for x, _ in enumerate(tensor_info_list):
slot = str(tensor_info_list[x].slot)
@ -30,6 +33,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
rank_id = str(tensor_info_list[x].rank_id)
root_graph_id = str(tensor_info_list[x].root_graph_id)
is_output = str(tensor_info_list[x].is_output)
if rank_id not in ranks_run_history:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
@ -40,4 +53,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
".DefaultFormat.npy", dir=path)
full_path = file[1]
np.save(full_path, tensor_list[x])
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)
for graph in ranks_run_history[rank_id].keys():
full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
with open(full_path, 'w+', newline='') as csv_file:
write = csv.writer(csv_file)
write.writerows(ranks_run_history[rank_id][graph])