!24520 Add graph run history to dump structure

Merge pull request !24520 from parastooashtari/multi_root_graph_dump
i-robot 2021-11-04 12:55:01 +00:00 committed by Gitee
commit 76f4f77cc2
20 changed files with 561 additions and 63 deletions
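The change, in one place: every time a root graph is dumped, the current global dump iteration is appended to a per-graph history file under rank_<id>/execution_order/, which the offline debugger later reads back to learn in which global iterations each graph ran. A minimal Python sketch of that file format, assuming the path layout shown in the DumpRunIter code below (the append_run_iter helper is illustrative, not part of the commit):

```python
import os
import tempfile

def append_run_iter(dump_path, rank_id, graph_id, cur_dump_iter):
    """Append the current global dump iteration to the graph's run-history file.

    Mirrors what E2eDump::DumpRunIter / CPUE2eDump::DumpRunIter do in this
    commit: one iteration number per line, appended on every run of the graph.
    """
    exec_order_dir = os.path.join(dump_path, "rank_%d" % rank_id, "execution_order")
    os.makedirs(exec_order_dir, exist_ok=True)
    history_file = os.path.join(
        exec_order_dir, "ms_global_execution_order_graph_%d.csv" % graph_id)
    with open(history_file, "a") as fout:
        fout.write("%d\n" % cur_dump_iter)

# Demo: root graph 0 of rank 0 ran in global iterations 0, 2 and 4.
demo_dir = tempfile.mkdtemp()
for it in (0, 2, 4):
    append_run_iter(demo_dir, 0, 0, it)
```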

View File

@ -622,6 +622,9 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_
debugger_->PostExecute();
}
#endif
#ifndef ENABLE_SECURITY
DumpSetup(kernel_graph);
#endif
}
void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }
@ -1347,11 +1350,6 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
}
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
#ifndef ENABLE_SECURITY
if (is_task && is_task_sink) {
DumpSetup(kernel_graph);
}
#endif
bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink);
#ifndef ENABLE_SECURITY
if (is_task && is_task_sink) {
@ -1378,6 +1376,7 @@ void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph)
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpRunIter(kernel_graph, rank_id_);
E2eDump::DumpData(kernel_graph.get(), rank_id_);
MS_LOG(DEBUG) << "Finish!";
}

View File

@ -722,6 +722,7 @@ void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) con
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpRunIter(kernel_graph, rank_id_);
E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get());
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();

View File

@ -18,6 +18,7 @@
#include <map>
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/anf_ir_utils.h"
#include "debug/common.h"
namespace mindspore {
void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
@ -39,6 +40,31 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
}
}
void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.e2e_dump_enabled())) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_name_to_check =
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
auto real_path = Common::CreatePrefixPath(file_name_to_check);
if (!real_path.has_value()) {
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
return;
}
std::string file_name = real_path.value();
ChangeFileMode(file_name, S_IWUSR);
std::ofstream fout(file_name, std::ofstream::app);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
return;
}
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
fout.close();
ChangeFileMode(file_name, S_IRUSR);
}
void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) {
MS_EXCEPTION_IF_NULL(node);
std::string kernel_name = GetKernelNodeName(node);

View File

@ -35,6 +35,8 @@ class CPUE2eDump {
static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
private:
static void DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path);

View File

@ -290,13 +290,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
return;
}
if (starting_graph_id == INT32_MAX) {
// Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0).
starting_graph_id = graph_id;
} else {
// In multi network scripts, dump iter is equal to the number of networks that have been run so far.
dump_json_parser.UpdateDumpIter();
}
// In multi network scripts, dump iter is equal to the number of networks that have been executed so far.
dump_json_parser.UpdateDumpIter();
}
void E2eDump::DumpSetup(const session::KernelGraph *graph) {
@ -308,10 +304,31 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
}
}
void E2eDump::UpdateIterGPUDump() {
if (starting_graph_id != INT32_MAX) {
DumpJsonParser::GetInstance().UpdateDumpIter();
void E2eDump::UpdateIterGPUDump() { DumpJsonParser::GetInstance().UpdateDumpIter(); }
void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_name_to_check =
execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
auto real_path = Common::CreatePrefixPath(file_name_to_check);
if (!real_path.has_value()) {
MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
return;
}
std::string file_name = real_path.value();
ChangeFileMode(file_name, S_IWUSR);
std::ofstream fout(file_name, std::ofstream::app);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
return;
}
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
fout.close();
ChangeFileMode(file_name, S_IRUSR);
}
void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
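With the starting_graph_id special case removed, the global dump iteration now advances after every root-graph execution, so in a multi-graph script each graph owns only a subset of the iteration numbers. A worked sketch of that bookkeeping, matching the expectations of the new test added at the end of this change (the simulation and variable names are illustrative, not MindSpore API):

```python
from collections import defaultdict

cur_dump_iter = 0                  # stands in for DumpJsonParser::cur_dump_iter()
run_history = defaultdict(list)    # graph_id -> global iterations it ran in

for _epoch in range(3):            # the new test runs 2 root graphs for 3 epochs
    for graph_id in (0, 1):        # assumed mapping: AddNet -> graph 0, NewAddNet -> graph 1
        run_history[graph_id].append(cur_dump_iter)  # DumpRunIter records this value
        cur_dump_iter += 1         # UpdateDumpIter advances after each graph run

assert run_history[0] == [0, 2, 4]
assert run_history[1] == [1, 3, 5]
```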

View File

@ -39,6 +39,8 @@ class E2eDump {
static void UpdateIterGPUDump();
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);

View File

@ -25,6 +25,7 @@
#include <numeric>
#include <unordered_set>
#include <utility>
#include <regex>
#include "pybind11/embed.h"
#include "pybind11/stl.h"
#ifdef ONLINE_DBG_MODE
@ -33,8 +34,10 @@
#include "debug/anf_ir_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
#endif
#include "nlohmann/json.hpp"
#include "debug/debugger/tensor_summary.h"
#include "utils/file_utils.h"
#include "linux/limits.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
@ -172,23 +175,28 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
}
#ifdef OFFLINE_DBG_MODE
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements) {
uint32_t *prev_num_elements, bool *history_not_found) {
MS_EXCEPTION_IF_NULL(tensor);
const void *previous_tensor_ptr = nullptr;
std::shared_ptr<TensorData> tensor_prev;
if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
*history_not_found = 1;
MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
} else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
// when prev_tensor is not available, the prev iteration is set to UINT32_MAX
// read data in offline mode
std::vector<std::string> file_paths;
if (!is_sync_mode_) {
ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration() - 1},
std::vector<unsigned int>{tensor->GetPrevIteration()},
std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
}
std::vector<std::shared_ptr<TensorData>> result_list_prev;
ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration() - 1},
std::vector<unsigned int>{tensor->GetPrevIteration()},
std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
file_paths, &result_list_prev);
tensor_prev = result_list_prev[0];
@ -303,7 +311,7 @@ void DebugServices::ProcessCheckpointsOutofMemory(
const std::vector<parameter_t> &parameter_list) {
if (no_mem_to_read) {
// bit 3 denotes that loading the tensor failed because the tensor is oversized and there is not enough memory to fit it in
int32_t oversize_error_code = 8;
int32_t oversize_error_code = ITensorSummary::OUT_OF_MEMORY;
for (auto &wp : watchpoints_to_check) {
SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
@ -313,6 +321,18 @@ void DebugServices::ProcessCheckpointsOutofMemory(
}
}
}
void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
// set the tensor into not-in-use status in tensor_loader.
auto tensor_name = tensor->GetName();
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
AppendToCacheEvictQueue(key_name_in_cache);
if (previous_tensor_ptr != nullptr) {
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
}
}
#endif
void DebugServices::CheckWatchpointsForTensor(
@ -373,7 +393,8 @@ void DebugServices::CheckWatchpointsForTensor(
uint32_t prev_num_elements = 0;
const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
bool history_not_found = 0;
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
if (prev_tensor_data) {
@ -400,6 +421,11 @@ void DebugServices::CheckWatchpointsForTensor(
auto item = base_summary_ptr->IsWatchpointHit(wp);
is_hit = std::get<ITensorSummary::eHitPos>(item);
error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
if (history_not_found) {
error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
}
#endif
parameter_list = std::get<ITensorSummary::eParamListPos>(item);
}
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
@ -413,14 +439,7 @@ void DebugServices::CheckWatchpointsForTensor(
}
#ifdef OFFLINE_DBG_MODE
// set the tensor into not-in-use status in tensor_loader.
std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
AppendToCacheEvictQueue(key_name_in_cache);
if (previous_tensor_ptr != nullptr) {
AppendToCacheEvictQueue(key_name_in_cache + ":prev");
}
SetTensorToNotInUse(tensor, previous_tensor_ptr);
// in offline mode remove the need for the data
tensor.reset();
#endif
@ -685,7 +704,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
std::string real_dump_iter_dir = RealPath(dump_key);
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
return;
}
struct dirent *dir = nullptr;
@ -865,12 +884,153 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
tensor_data->SetType("");
tensor_data->SetShape(shape);
tensor_data->SetIsOutput(output_flag);
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
tensor_list->push_back(tensor_data);
}
}
}
uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
std::regex re;
if (mode == "rank") {
re = "^rank_([0-9]+)$";
} else if (mode == "graph") {
re = "^([0-9]+)$";
}
std::smatch tokens;
if (regex_match(name, tokens, re)) {
return std::stoi(tokens[1]);
} else {
return UINT32_MAX;
}
}
std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
std::vector<uint32_t> rank_id_list;
std::string dump_dir = GetDumpDir();
DIR *d_handle = opendir(dump_dir.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Dump directory does not exist.";
return rank_id_list;
}
struct dirent *dir = nullptr;
while ((dir = readdir(d_handle)) != nullptr) {
if (dir->d_type == DT_DIR) {
std::string rank_dir_name = dir->d_name;
if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
}
}
}
(void)closedir(d_handle);
return rank_id_list;
}
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
std::string net_name = GetNetName();
std::string dump_dir = GetDumpDir();
for (uint32_t rank_id : rank_id_list) {
std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
std::string abspath = RealPath(path);
DIR *d_handle_rank = opendir(abspath.c_str());
if (d_handle_rank == nullptr) {
MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
continue;
}
struct dirent *direc = nullptr;
while ((direc = readdir(d_handle_rank)) != nullptr) {
if (direc->d_type == DT_DIR) {
std::string graph_dir = direc->d_name;
if (graph_dir == "." || graph_dir == "..") {
continue;
}
if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
ReadGraphsHistory(rank_id, graph_id);
}
}
}
(void)closedir(d_handle_rank);
}
}
void DebugServices::SetGraphsHistory() {
// extract rank_id_list
std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
// for each rank_id extract the graph_id list and set the dump version
// and for each graph read the graph history file
CheckDumpGraphIdList(rank_id_list);
}
void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
// graph history was already stored for this rank_id and graph_id
return;
}
std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
DIR *d_handle = opendir(exec_order_path.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exist.";
return;
}
// read file and store the info
std::string full_path = exec_order_path + "/" + file_to_check;
std::string checked_path = RealPath(full_path);
if (!checked_path.empty()) {
ReadGraphRunIter(checked_path, rank_and_graph);
}
(void)closedir(d_handle);
}
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table_) {
auto wp = std::get<1>(w_table_item);
unsigned int index = 0;
for (auto check_node : wp.check_node_list) {
std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
// graph represents root_graph for Ascend and kernel_graph for GPU
for (auto rank : ranks) {
for (auto graph : graphs) {
std::tuple<uint32_t, uint32_t> key(rank, graph);
(rank_and_graph_to_nodes)[key].push_back(check_node);
}
}
index++;
}
}
return rank_and_graph_to_nodes;
}
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
std::ifstream infile;
std::string line;
infile.open(file_path.c_str());
if (!infile.is_open()) {
MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
const int kMaxFilenameLength = NAME_MAX;
char err_info[kMaxFilenameLength];
if (strerror_r(errno, err_info, sizeof(err_info)) != nullptr) {
MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
}
return;
}
std::vector<uint32_t> run_iters_vec;
while (std::getline(infile, line)) {
uint32_t iter;
std::stringstream ss(line);
ss >> iter;
run_iters_vec.push_back(iter);
}
(void)graphs_run_history_.emplace(
std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
}
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
@ -895,6 +1055,7 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
tensor_data->SetType(type_name);
tensor_data->SetShape(shape);
tensor_data->SetTimeStamp(time_stamp);
tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
if (data_size) {
(void)tensor_loader_->LoadNewTensor(tensor_data, false);
}
@ -1089,34 +1250,19 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
unsigned int iteration, std::vector<std::string> *const async_file_pool) {
// get a list of nodes and the devices they are on to monitor
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table_) {
auto wp = std::get<1>(w_table_item);
unsigned int index = 0;
for (auto check_node : wp.check_node_list) {
std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
for (auto device : devices) {
for (auto graph : graphs) {
std::tuple<uint32_t, uint32_t> key(device, graph);
device_and_graph_to_nodes[key].push_back(check_node);
}
}
index++;
}
}
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
GetAllWpNodes();
// scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
// as they are found
for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
uint32_t device_id = std::get<0>(device_and_graph);
uint32_t root_graph_id = std::get<1>(device_and_graph);
std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
uint32_t rank_id = std::get<0>(rank_and_graph);
uint32_t root_graph_id = std::get<1>(rank_and_graph);
std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration);
// convert node names to dump style
@ -1140,12 +1286,11 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
if (is_sync_mode_) {
// search files in dir for the one that meets the filename prefix and read the file into memory
std::string abspath = RealPath(specific_dump_dir);
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
&tensor_list);
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list);
} else {
// convert all files in proto_to_dump to npy and add to pool of async file names
ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
&tensor_list);
}
}
@ -1285,6 +1430,32 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
uint32_t prev_iter;
uint32_t rank_id = tensor->GetDeviceId();
uint32_t root_graph_id = tensor->GetRootGraphId();
std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
return UINT32_MAX;
}
auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
tensor->GetIteration());
if (it == graphs_run_history_[rank_and_graph].end()) {
// The graph was not executed in that iteration
return UINT32_MAX;
} else if (it == graphs_run_history_[rank_and_graph].begin()) {
// current iteration is the first iteration that the graph was run
// no prev iter is available
MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
<< " is the first run iteration for tensor: " << tensor->GetName();
return UINT32_MAX;
}
it--;
prev_iter = *it;
tensor->SetPrevIteration(prev_iter);
return prev_iter;
}
void DebugServices::ResetLoadedTensors() {
wp_id_cache_.clear();
MS_LOG(INFO) << "Resetting loaded tensors";
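GetPrevIteration looks up the tensor's iteration in the stored run history for its (rank, root graph) pair and returns the entry just before it, or UINT32_MAX when the graph did not run in that iteration or ran for the first time. A minimal Python stand-in for that lookup (names are illustrative):

```python
UINT32_MAX = 2**32 - 1

def get_prev_iteration(run_history, iteration):
    """run_history: sorted list of global iterations in which the graph ran."""
    if iteration not in run_history:
        return UINT32_MAX      # the graph was not executed in that iteration
    idx = run_history.index(iteration)
    if idx == 0:
        return UINT32_MAX      # first run of the graph: no previous iteration
    return run_history[idx - 1]

assert get_prev_iteration([1, 3, 5], 5) == 3
assert get_prev_iteration([1, 3, 5], 1) == UINT32_MAX
```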

View File

@ -292,6 +292,9 @@ class DebugServices {
std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
std::vector<unsigned int> *root_graph_id);
#ifdef OFFLINE_DBG_MODE
void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
#endif
void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
@ -350,7 +353,7 @@ class DebugServices {
std::vector<std::string> *const async_file_pool);
const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements);
uint32_t *prev_num_elements, bool *history_not_found);
void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
std::size_t *const size, std::vector<int64_t> *const shape,
@ -380,6 +383,18 @@ class DebugServices {
uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
std::vector<std::shared_ptr<TensorData>> *const tensor_list);
void SetGraphsHistory();
std::vector<uint32_t> GetDumpRankIdList();
void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);
void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();
void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);
std::string GetStrippedFilename(const std::string &file_name);
std::string IterationString(unsigned int iteration);
@ -410,6 +425,8 @@ class DebugServices {
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);
void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
@ -458,6 +475,8 @@ class DebugServices {
std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
std::string net_name_;
std::string dump_dir_;
// store history of graphs that have been run (rank_id, graph_id)
std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
bool is_sync_mode_{false};
std::shared_ptr<TensorLoader> tensor_loader_;

View File

@ -282,7 +282,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
if (device_target_ != kGPUDevice) {
return;
}
E2eDump::UpdateIterGPUDump();
// Store graphs that are run in one step.
graph_ptr_step_vec_ = graphs;
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
@ -290,7 +289,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
if (debugger_) {
debugger_->PreExecute(graph);
}
DumpSetup(graph);
}
}
@ -390,6 +388,7 @@ uint32_t Debugger::GetRankID() {
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
uint32_t rank_id = GetRankID();
E2eDump::DumpRunIter(kernel_graph, rank_id);
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
(void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get());
@ -458,6 +457,7 @@ void Debugger::PostExecuteGraphDebugger() {
if (debugger_) {
debugger_->PostExecute();
}
E2eDump::UpdateIterGPUDump();
}
void Debugger::PostExecute() {

View File

@ -62,6 +62,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du
const uint64_t ratio_inversion = 2;
const uint64_t memlimit = max_mem_usage * kMegabytesToBytes / ratio_inversion;
debug_services_->SetMemLimit(memlimit);
debug_services_->SetGraphsHistory();
return 0;
}

View File

@ -88,6 +88,7 @@ class VarianceAndMeanCalculator {
class ITensorSummary {
public:
enum WatchpointPos { eHitPos = 0, eErrorCodePos = 1, eParamListPos = 2 };
enum ErrorCode { NAN_TENSOR = 0, INF_TENSOR = 2, NULL_PREV_TENSOR = 4, OUT_OF_MEMORY = 8, HISTORY_NOT_FOUND = 16 };
virtual ~ITensorSummary() = default;
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
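The watchpoint error code now uses the named enum values above instead of the magic number 8; HISTORY_NOT_FOUND (16) is the new code set when a graph's run history is missing. A small sketch of how a client might interpret the two values this commit touches, assuming the flag-style reading suggested by the "bit 3 denotes ..." comment in the DebugServices change above (the helper is illustrative):

```python
OUT_OF_MEMORY = 8        # bit 3: tensor too large, not enough memory to load it
HISTORY_NOT_FOUND = 16   # bit 4: no graph run history for this rank/graph

def describe_error_code(error_code):
    reasons = []
    if error_code & OUT_OF_MEMORY:
        reasons.append("tensor oversized, not enough memory to load it")
    if error_code & HISTORY_NOT_FOUND:
        reasons.append("graph run history not available")
    return reasons

assert describe_error_code(HISTORY_NOT_FOUND) == ["graph run history not available"]
```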

View File

@ -229,6 +229,10 @@ class TensorData {
void SetIteration(unsigned int iteration) { this->iteration_ = iteration; }
unsigned int GetPrevIteration() const { return this->prev_iteration_; }
void SetPrevIteration(unsigned int prev_iteration) { this->prev_iteration_ = prev_iteration; }
unsigned int GetDeviceId() const { return this->device_id_; }
void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; }
@ -430,6 +434,7 @@ class TensorData {
std::string name_;
uint64_t slot_;
unsigned int iteration_{0};
unsigned int prev_iteration_{0};
unsigned int device_id_{0};
unsigned int root_graph_id_{0};
bool is_output_{true};

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -90,7 +90,7 @@ class TensorLoader {
key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) + ":" +
std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot()));
if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
tensor->GetIteration() == tensor_list_map_[key_name]->GetIteration() - 1) {
tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
key_name += ":prev";
}
auto iter = tensor_list_map_.find(key_name);
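The ":prev" cache entry is now keyed off the tensor's stored previous iteration rather than iteration - 1, since with multiple root graphs the same graph's consecutive runs are not consecutive global iterations. A sketch of the cache key format used here and in SetTensorToNotInUse, with the field order taken from the C++ above (the Python helper is illustrative):

```python
def tensor_cache_key(name, device_id, root_graph_id, is_output, slot, is_prev=False):
    # name:device_id:root_graph_id:is_output:slot[:prev]
    key = ":".join([name, str(device_id), str(root_graph_id),
                    str(int(is_output)), str(slot)])
    return key + ":prev" if is_prev else key

# With multi-root-graph dumps, the "previous" copy of a tensor is the one from
# GetPrevIteration() (e.g. iteration 3 for a graph that ran in 1, 3, 5),
# not simply iteration - 1, hence the switch to GetPrevIteration() above.
assert tensor_cache_key("Default/Conv-op12", 0, 1, True, 0) == "Default/Conv-op12:0:1:1:0"
```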

View File

@ -33,6 +33,9 @@
#include "runtime/base.h"
#include "runtime/device/ascend/ascend_stream_manager.h"
#include "utils/shape_utils.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
namespace {
constexpr auto kGradients = "Gradients";
@ -1062,7 +1065,12 @@ void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr<session::K
MS_LOG(INFO) << "Load device loop control data";
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurLoopCountName, 0);
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kNextLoopCountName, 0);
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(kernel_graph_ptr->current_epoch()));
#ifndef ENABLE_SECURITY
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName,
SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter()));
#else
SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0);
#endif
kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1);
}

View File

@ -23,6 +23,7 @@
#include "utils/log_adapter.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/cpu_e2e_dump.h"
#include "debug/data_dump/e2e_dump.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
@ -52,6 +53,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
MS_EXCEPTION_IF_NULL(kernel_graph);
CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
CPUE2eDump::DumpRunIter(kernel_graph);
}
#endif
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {

View File

@ -18,11 +18,14 @@ Utils for testing offline debugger.
import os
import tempfile
import bisect
import csv
import numpy as np
def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
"""Build dump file structure from tensor_list."""
ranks_run_history = {}
temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
slot = str(tensor_info.slot)
@ -30,6 +33,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
rank_id = str(tensor_info.rank_id)
root_graph_id = str(tensor_info.root_graph_id)
is_output = str(tensor_info.is_output)
if rank_id not in ranks_run_history:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
@ -40,4 +53,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
".DefaultFormat.npy", dir=path)
full_path = file[1]
np.save(full_path, tensor)
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)
for graph in ranks_run_history[rank_id].keys():
full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
with open(full_path, 'w+', newline='') as csv_file:
write = csv.writer(csv_file)
write.writerows(ranks_run_history[rank_id][graph])

View File

@ -69,6 +69,35 @@ async_dump_dict_2 = {
}
}
e2e_dump_dict_2 = {
"common_dump_settings": {
"dump_mode": 0,
"path": "",
"net_name": "Net",
"iteration": "all",
"input_output": 0,
"kernels": ["Default/Conv-op12"],
"support_device": [0, 1, 2, 3, 4, 5, 6, 7],
"op_debug_mode": 0
},
"e2e_dump_settings": {
"enable": True,
"trans_flag": False
}
}
async_dump_dict_3 = {
"common_dump_settings": {
"dump_mode": 0,
"path": "",
"net_name": "Net",
"iteration": "all",
"input_output": 2,
"kernels": ["Default/TensorAdd-op3"],
"support_device": [0, 1, 2, 3, 4, 5, 6, 7],
"op_debug_mode": 0
}
}
def generate_dump_json(dump_path, json_file_name, test_key):
"""
@ -83,6 +112,13 @@ def generate_dump_json(dump_path, json_file_name, test_key):
elif test_key == "test_async_dump_net_multi_layer_mode1":
data = async_dump_dict_2
data["common_dump_settings"]["path"] = dump_path
elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
data = e2e_dump_dict_2
data["common_dump_settings"]["path"] = dump_path
elif test_key == "test_Ascend_async_multi_root_graph_dump":
data = async_dump_dict_3
data["common_dump_settings"]["path"] = dump_path
else:
raise ValueError(
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")

View File

@ -283,7 +283,7 @@ def run_e2e_dump_execution_graph():
add = Net()
add(Tensor(x), Tensor(y))
exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
assert len(os.listdir(exe_graph_path)) == 1
assert len(os.listdir(exe_graph_path)) == 2
del os.environ['MINDSPORE_DUMP_CONFIG']

View File

@ -0,0 +1,158 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
from os import path
import tempfile
import time
import shutil
import csv
import numpy as np
import pytest
import mindspore.context as context
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.nn import Cell
from dump_test_utils import generate_dump_json
from tests.security_utils import security_off_wrap
class AddNet(Cell):
def __init__(self):
super(AddNet, self).__init__()
self.add = P.TensorAdd()
def construct(self, input_x, input_y):
output_z = self.add(input_x, input_y)
return output_z
class NewAddNet(Cell):
def __init__(self):
super(NewAddNet, self).__init__()
self.add = P.AddN()
def construct(self, x, y):
z = self.add([x, y, y])
return z
def train_addnet(epoch):
net = AddNet()
net2 = NewAddNet()
output_list = []
input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
for _ in range(epoch):
out_put = net(input_x, input_y)
out2 = net2(out_put, input_x)
output_list.append(out2.asnumpy())
input_x = input_x + input_y
def run_multi_root_graph_dump(device, dump_mode, test_name):
"""Run dump for multi root graph script."""
context.set_context(mode=context.GRAPH_MODE, device_target=device)
with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
dump_path = os.path.join(tmp_dir, dump_mode)
dump_config_path = os.path.join(tmp_dir, dump_mode + ".json")
generate_dump_json(dump_path, dump_config_path, test_name)
os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
dump_file_path = os.path.join(dump_path, 'rank_0', 'Net')
if os.path.isdir(dump_path):
shutil.rmtree(dump_path)
epoch = 3
train_addnet(epoch)
for _ in range(3):
if not os.path.exists(dump_file_path):
time.sleep(2)
# Multi root graph script: we have 2 graphs under the rank_0 dir.
# Each graph should have 3 iterations.
# Each graph was executed once per epoch;
# graph 0 was executed in even iterations, graph 1 in odd iterations.
assert len(os.listdir(dump_file_path)) == 2
dump_path_graph_0 = os.path.join(dump_file_path, '0')
dump_path_graph_1 = os.path.join(dump_file_path, '1')
assert sorted(os.listdir(dump_path_graph_0)) == ['0', '2', '4']
assert sorted(os.listdir(dump_path_graph_1)) == ['1', '3', '5']
execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
# Four files in execution_order dir.
# Two files for each graph (ms_execution_order and ms_global_execution_order)
assert len(os.listdir(execution_order_path)) == 4
global_exec_order_graph_0 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_0.csv')
assert path.exists(global_exec_order_graph_0)
with open(global_exec_order_graph_0) as csvfile:
history_graph_0 = csv.reader(csvfile)
iter_list_graph_0 = list(history_graph_0)
assert iter_list_graph_0 == [['0'], ['2'], ['4']]
global_exec_order_graph_1 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_1.csv')
assert path.exists(global_exec_order_graph_1)
with open(global_exec_order_graph_1) as csvfile:
history_graph_1 = csv.reader(csvfile)
iter_list_graph_1 = list(history_graph_1)
assert iter_list_graph_1 == [['1'], ['3'], ['5']]
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_GPU_e2e_multi_root_graph_dump():
"""
Feature:
Multi root graph e2e dump for GPU.
Description:
Test multi root graph e2e dump GPU.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("GPU", "e2e_dump", "test_GPU_e2e_multi_root_graph_dump")
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_e2e_multi_root_graph_dump():
"""
Feature:
Multi root graph e2e dump for Ascend.
Description:
Test multi root graph e2e dump Ascend.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("Ascend", "e2e_dump", "test_Ascend_e2e_multi_root_graph_dump")
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_Ascend_async_multi_root_graph_dump():
"""
Feature:
Multi root graph async dump for Ascend.
Description:
Test multi root graph async dump Ascend.
Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
"""
run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump")

View File

@ -18,11 +18,14 @@ Utils for testing offline debugger.
import os
import tempfile
import bisect
import csv
import numpy as np
def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
"""Build dump file structure from tensor_list."""
ranks_run_history = {}
temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
for x, _ in enumerate(tensor_info_list):
slot = str(tensor_info_list[x].slot)
@ -30,6 +33,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
rank_id = str(tensor_info_list[x].rank_id)
root_graph_id = str(tensor_info_list[x].root_graph_id)
is_output = str(tensor_info_list[x].is_output)
if rank_id not in ranks_run_history:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
@ -40,4 +53,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
".DefaultFormat.npy", dir=path)
full_path = file[1]
np.save(full_path, tensor_list[x])
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)
for graph in ranks_run_history[rank_id].keys():
full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
with open(full_path, 'w+', newline='') as csv_file:
write = csv.writer(csv_file)
write.writerows(ranks_run_history[rank_id][graph])