From 1a59dc37bf8095035b3529836f5bf88ae1c9ad6f Mon Sep 17 00:00:00 2001 From: parastooashtari Date: Tue, 2 Nov 2021 11:00:38 -0400 Subject: [PATCH] add graph execution order history to dump --- .../ccsrc/backend/session/ascend_session.cc | 9 +- .../ccsrc/backend/session/gpu_session.cc | 1 + .../ccsrc/debug/data_dump/cpu_e2e_dump.cc | 26 ++ .../ccsrc/debug/data_dump/cpu_e2e_dump.h | 2 + mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 37 ++- mindspore/ccsrc/debug/data_dump/e2e_dump.h | 2 + mindspore/ccsrc/debug/debug_services.cc | 253 +++++++++++++++--- mindspore/ccsrc/debug/debug_services.h | 21 +- mindspore/ccsrc/debug/debugger/debugger.cc | 4 +- .../debugger/offline_debug/dbg_services.cc | 1 + .../ccsrc/debug/debugger/tensor_summary.h | 1 + mindspore/ccsrc/debug/tensor_data.h | 5 + mindspore/ccsrc/debug/tensor_load.h | 4 +- .../ccsrc/runtime/device/kernel_adjust.cc | 10 +- .../runtime/framework/actor/debug_actor.cc | 2 + tests/st/debugger/dump_test_utils.py | 25 ++ tests/st/dump/dump_test_utils.py | 36 +++ tests/st/dump/test_data_dump.py | 2 +- tests/st/dump/test_multi_root_graph_dump.py | 158 +++++++++++ .../debugger/gpu_tests/dump_test_utils.py | 25 ++ 20 files changed, 561 insertions(+), 63 deletions(-) create mode 100644 tests/st/dump/test_multi_root_graph_dump.py diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index b74e050d189..737c385867a 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -621,6 +621,9 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr &kernel_ debugger_->PostExecute(); } #endif +#ifndef ENABLE_SECURITY + DumpSetup(kernel_graph); +#endif } void AscendSession::ExecuteGraph(const std::shared_ptr &kernel_graph) { Execute(kernel_graph, true); } @@ -1342,11 +1345,6 @@ void AscendSession::Execute(const std::shared_ptr &kernel_graph, bo } auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); -#ifndef ENABLE_SECURITY - if (is_task && is_task_sink) { - DumpSetup(kernel_graph); - } -#endif bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink); #ifndef ENABLE_SECURITY if (is_task && is_task_sink) { @@ -1373,6 +1371,7 @@ void AscendSession::DumpSetup(const std::shared_ptr &kernel_graph) void AscendSession::Dump(const std::shared_ptr &kernel_graph) const { MS_LOG(DEBUG) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); + E2eDump::DumpRunIter(kernel_graph, rank_id_); E2eDump::DumpData(kernel_graph.get(), rank_id_); MS_LOG(DEBUG) << "Finish!"; } diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 96cd2269392..1eec34b960d 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -722,6 +722,7 @@ void GPUSession::DumpSetup(const std::shared_ptr &kernel_graph) con void GPUSession::Dump(const std::shared_ptr &kernel_graph) const { if (debugger_->DebuggerBackendEnabled()) { MS_EXCEPTION_IF_NULL(kernel_graph); + E2eDump::DumpRunIter(kernel_graph, rank_id_); E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get()); } else { DumpJsonParser::GetInstance().UpdateDumpIter(); diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc index 17d56601023..c116a0150c8 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc +++ 
b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc @@ -18,6 +18,7 @@ #include #include "backend/session/anf_runtime_algorithm.h" #include "debug/anf_ir_utils.h" +#include "debug/common.h" namespace mindspore { void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) { @@ -39,6 +40,31 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) { } } +void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) { + auto &json_parser = DumpJsonParser::GetInstance(); + if (!(json_parser.e2e_dump_enabled())) { + return; + } + std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/"; + std::string file_name_to_check = + execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv"; + auto real_path = Common::CreatePrefixPath(file_name_to_check); + if (!real_path.has_value()) { + MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed."; + return; + } + std::string file_name = real_path.value(); + ChangeFileMode(file_name, S_IWUSR); + std::ofstream fout(file_name, std::ofstream::app); + if (!fout.is_open()) { + MS_LOG(WARNING) << "Open file for saving graph global execution order failed."; + return; + } + fout << std::to_string(json_parser.cur_dump_iter()) + "\n"; + fout.close(); + ChangeFileMode(file_name, S_IRUSR); +} + void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) { MS_EXCEPTION_IF_NULL(node); std::string kernel_name = GetKernelNodeName(node); diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h index 17d0aad2c2a..ee297ce9a1c 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h @@ -35,6 +35,8 @@ class CPUE2eDump { static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id); + static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0); + private: static void DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path); diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index 68d5af02bc0..4159ae2b1ac 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -290,13 +290,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m MS_LOG(INFO) << "No need to update iteration for dataset graph."; return; } - if (starting_graph_id == INT32_MAX) { - // Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0). - starting_graph_id = graph_id; - } else { - // In multi network scripts, dump iter is equal to the number of networks that have been run so far. - dump_json_parser.UpdateDumpIter(); - } + + // In multi network scripts, dump iter is equal to the number of networks that have been executed so far. 
+  dump_json_parser.UpdateDumpIter();
 }
 
 void E2eDump::DumpSetup(const session::KernelGraph *graph) {
@@ -308,10 +304,31 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
   }
 }
 
-void E2eDump::UpdateIterGPUDump() {
-  if (starting_graph_id != INT32_MAX) {
-    DumpJsonParser::GetInstance().UpdateDumpIter();
+void E2eDump::UpdateIterGPUDump() { DumpJsonParser::GetInstance().UpdateDumpIter(); }
+
+void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
+  auto &json_parser = DumpJsonParser::GetInstance();
+  if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
+    return;
   }
+  std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
+  std::string file_name_to_check =
+      execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
+  auto real_path = Common::CreatePrefixPath(file_name_to_check);
+  if (!real_path.has_value()) {
+    MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
+    return;
+  }
+  std::string file_name = real_path.value();
+  ChangeFileMode(file_name, S_IWUSR);
+  std::ofstream fout(file_name, std::ofstream::app);
+  if (!fout.is_open()) {
+    MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
+    return;
+  }
+  fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
+  fout.close();
+  ChangeFileMode(file_name, S_IRUSR);
 }
 
 void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
index b99a6b2e284..89153718176 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
@@ -39,6 +39,8 @@ class E2eDump {
 
   static void UpdateIterGPUDump();
 
+  static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
+
   static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
 
   static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index 6e2d6bafa1b..64f5cd36cf8 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <regex>
 #include "pybind11/embed.h"
 #include "pybind11/stl.h"
 #ifdef ONLINE_DBG_MODE
@@ -33,8 +34,10 @@
 #include "debug/anf_ir_utils.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #endif
+#include "nlohmann/json.hpp"
 #include "debug/debugger/tensor_summary.h"
 #include "utils/file_utils.h"
+#include "linux/limits.h"
 #ifdef ONLINE_DBG_MODE
 namespace mindspore {
 #endif
@@ -172,23 +175,28 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
 }
 #ifdef OFFLINE_DBG_MODE
 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
-                                         uint32_t *prev_num_elements) {
+                                         uint32_t *prev_num_elements, bool *history_not_found) {
   MS_EXCEPTION_IF_NULL(tensor);
   const void *previous_tensor_ptr = nullptr;
   std::shared_ptr<TensorData> tensor_prev;
-  if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
+  std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
+  if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
+    *history_not_found = 1;
+    MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
+  } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
+    // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
     // read data in offline mode
     std::vector<std::string> file_paths;
     if (!is_sync_mode_) {
       ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                          std::vector<unsigned int>{tensor->GetDeviceId()},
-                         std::vector<unsigned int>{tensor->GetIteration() - 1},
+                         std::vector<unsigned int>{tensor->GetPrevIteration()},
                          std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
     }
     std::vector<std::shared_ptr<TensorData>> result_list_prev;
     ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                      std::vector<unsigned int>{tensor->GetDeviceId()},
-                     std::vector<unsigned int>{tensor->GetIteration() - 1},
+                     std::vector<unsigned int>{tensor->GetPrevIteration()},
                      std::vector<unsigned int>{tensor->GetRootGraphId()},
                      std::vector<bool>{tensor->GetIsOutput()}, file_paths, &result_list_prev);
     tensor_prev = result_list_prev[0];
@@ -303,7 +311,7 @@ void DebugServices::ProcessCheckpointsOutofMemory(
     const std::vector<parameter_t> &parameter_list) {
   if (no_mem_to_read) {
     // bit 3 denotes failed to load tensor because tensor is oversized and there is not enough memory to fit it in
-    int32_t oversize_error_code = 8;
+    int32_t oversize_error_code = ITensorSummary::OUT_OF_MEMORY;
     for (auto &wp : watchpoints_to_check) {
       SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
                                 chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
@@ -313,6 +321,18 @@
     }
   }
 }
+
+void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
+  // set the tensor into not-in-use status in tensor_loader.
+  auto tensor_name = tensor->GetName();
+  std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
+                                  std::to_string(tensor->GetRootGraphId()) + ":" +
+                                  std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
+  AppendToCacheEvictQueue(key_name_in_cache);
+  if (previous_tensor_ptr != nullptr) {
+    AppendToCacheEvictQueue(key_name_in_cache + ":prev");
+  }
+}
 #endif
 
 void DebugServices::CheckWatchpointsForTensor(
@@ -373,7 +393,8 @@ void DebugServices::CheckWatchpointsForTensor(
     uint32_t prev_num_elements = 0;
     const void *previous_tensor_ptr = nullptr;
 #ifdef OFFLINE_DBG_MODE
-    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
+    bool history_not_found = 0;
+    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
 #else
     std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
     if (prev_tensor_data) {
@@ -400,6 +421,11 @@
       auto item = base_summary_ptr->IsWatchpointHit(wp);
       is_hit = std::get<ITensorSummary::eHitPos>(item);
       error_code = std::get<ITensorSummary::eErrorCodePos>(item);
+#ifdef OFFLINE_DBG_MODE
+      if (history_not_found) {
+        error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
+      }
+#endif
       parameter_list = std::get<ITensorSummary::eParamListPos>(item);
     }
     AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
@@ -413,14 +439,7 @@
   }
 
 #ifdef OFFLINE_DBG_MODE
-  // set the tensor into not-in-use status in tensor_loader.
-  std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
-                                  std::to_string(tensor->GetRootGraphId()) + ":" +
-                                  std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
-  AppendToCacheEvictQueue(key_name_in_cache);
-  if (previous_tensor_ptr != nullptr) {
-    AppendToCacheEvictQueue(key_name_in_cache + ":prev");
-  }
+  SetTensorToNotInUse(tensor, previous_tensor_ptr);
   // in offline mode remove the need for the data
   tensor.reset();
 #endif
@@ -679,7 +698,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
   std::string real_dump_iter_dir = RealPath(dump_key);
   DIR *d_handle = opendir(real_dump_iter_dir.c_str());
   if (d_handle == nullptr) {
-    MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
+    MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
     return;
   }
   struct dirent *dir = nullptr;
@@ -859,12 +878,153 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>>
     tensor_data->SetType("");
     tensor_data->SetShape(shape);
     tensor_data->SetIsOutput(output_flag);
+    tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
     tensor_list->push_back(tensor_data);
   }
 }
 
+uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
+  std::regex re;
+  if (mode == "rank") {
+    re = "^rank_([0-9]+)$";
+  } else if (mode == "graph") {
+    re = "^([0-9]+)$";
+  }
+  std::smatch tokens;
+  if (regex_match(name, tokens, re)) {
+    return std::stoi(tokens[1]);
+  } else {
+    return UINT32_MAX;
+  }
+}
+
+std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
+  std::vector<uint32_t> rank_id_list;
+  std::string dump_dir = GetDumpDir();
+  DIR *d_handle = opendir(dump_dir.c_str());
+  if (d_handle == nullptr) {
+    MS_LOG(ERROR) << "Dump directory does not exist.";
+    return rank_id_list;
+  }
+  struct dirent *dir = nullptr;
+  while ((dir = readdir(d_handle)) != nullptr) {
+    if (dir->d_type == DT_DIR) {
+      std::string rank_dir_name = dir->d_name;
+      if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
+        rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
+      }
+    }
+  }
+  (void)closedir(d_handle);
+  return rank_id_list;
+}
+
+void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
+  std::string net_name = GetNetName();
+  std::string dump_dir = GetDumpDir();
+  for (uint32_t rank_id : rank_id_list) {
+    std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
+    std::string abspath = RealPath(path);
+    DIR *d_handle_rank = opendir(abspath.c_str());
+    if (d_handle_rank == nullptr) {
+      MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
+      continue;
+    }
+    struct dirent *direc = nullptr;
+    while ((direc = readdir(d_handle_rank)) != nullptr) {
+      if (direc->d_type == DT_DIR) {
+        std::string graph_dir = direc->d_name;
+        if (graph_dir == "." || graph_dir == "..") {
+          continue;
+        }
+        if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
+          uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
+          ReadGraphsHistory(rank_id, graph_id);
+        }
+      }
+    }
+    (void)closedir(d_handle_rank);
+  }
+}
+
+void DebugServices::SetGraphsHistory() {
+  // extract rank_id_list
+  std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
+  // for each rank_id extract the graph_id list and set the dump version
+  // and for each graph read the graph history file
+  CheckDumpGraphIdList(rank_id_list);
+}
+
+void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
+  std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
+  if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
+    // graph history was already stored for this rank_id and graph_id
+    return;
+  }
+  std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
+  std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
+  DIR *d_handle = opendir(exec_order_path.c_str());
+  if (d_handle == nullptr) {
+    MS_LOG(ERROR) << "Directory does not exist.";
+    return;
+  }
+  // read file and store the info
+  std::string full_path = exec_order_path + "/" + file_to_check;
+  std::string checked_path = RealPath(full_path);
+  if (!checked_path.empty()) {
+    ReadGraphRunIter(checked_path, rank_and_graph);
+  }
+  (void)closedir(d_handle);
+}
+
+std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
+  for (auto w_table_item : watchpoint_table_) {
+    auto wp = std::get<1>(w_table_item);
+    unsigned int index = 0;
+    for (auto check_node : wp.check_node_list) {
+      std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
+      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
+      // graph represents root_graph for Ascend and kernel_graph for GPU
+      for (auto rank : ranks) {
+        for (auto graph : graphs) {
+          std::tuple<uint32_t, uint32_t> key(rank, graph);
+          (rank_and_graph_to_nodes)[key].push_back(check_node);
+        }
+      }
+      index++;
+    }
+  }
+  return rank_and_graph_to_nodes;
+}
+
+void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
+  std::ifstream infile;
+  std::string line;
+  infile.open(file_path.c_str());
+  if (!infile.is_open()) {
+    MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
+    const int kMaxFilenameLength = NAME_MAX;
+    char err_info[kMaxFilenameLength];
+    if (strerror_r(errno, err_info, sizeof(err_info)) != nullptr) {
+      MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
+    }
+    return;
+  }
+  std::vector<uint32_t> run_iters_vec;
+  while (std::getline(infile, line)) {
+    uint32_t iter;
+    std::stringstream ss(line);
+    ss >> iter;
+    run_iters_vec.push_back(iter);
+  }
+  (void)graphs_run_history_.emplace(
+      std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
+}
+
 void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                     const std::size_t slot, const unsigned int iteration,
                                     const unsigned int device_id, const unsigned int root_graph_id,
                                     const bool is_output, const std::size_t data_size,
@@ -889,6 +1049,7 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
   tensor_data->SetType(type_name);
   tensor_data->SetShape(shape);
   tensor_data->SetTimeStamp(time_stamp);
+  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
   if (data_size) {
     (void)tensor_loader_->LoadNewTensor(tensor_data, false);
   }
@@ -1080,34 +1241,19 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
     unsigned int iteration, std::vector<std::string> *const async_file_pool) {
   // get a list of nodes and the devices they are on to monitor
   std::vector<std::shared_ptr<TensorData>> tensor_list;
-  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
-  for (auto w_table_item : watchpoint_table_) {
-    auto wp = std::get<1>(w_table_item);
-    unsigned int index = 0;
-    for (auto check_node : wp.check_node_list) {
-      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
-      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
-      for (auto device : devices) {
-        for (auto graph : graphs) {
-          std::tuple<uint32_t, uint32_t> key(device, graph);
-          device_and_graph_to_nodes[key].push_back(check_node);
-        }
-      }
-
-      index++;
-    }
-  }
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
+      GetAllWpNodes();
   // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
   // as they are found
-  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
-    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
-    uint32_t device_id = std::get<0>(device_and_graph);
-    uint32_t root_graph_id = std::get<1>(device_and_graph);
-    std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
+  for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
+    std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
+    uint32_t rank_id = std::get<0>(rank_and_graph);
+    uint32_t root_graph_id = std::get<1>(rank_and_graph);
+    std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
     std::vector<std::tuple<std::string, std::string>> proto_to_dump;
-    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
+    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
                                     std::to_string(root_graph_id) + "/" + IterationString(iteration);
 
     // convert node names to dump style
@@ -1131,12 +1277,11 @@
     if (is_sync_mode_) {
       // search files in dir for the one that meets the filename prefix and read the file into memory
       std::string abspath = RealPath(specific_dump_dir);
-      ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
-                            &tensor_list);
+      ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list);
     } else {
       // convert all files in proto_to_dump to npy and add to pool of async file names
       ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
-      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
+      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
                              &tensor_list);
     }
   }
@@ -1276,6 +1421,32 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
   return tensor_loader_->LoadNewTensor(tensor, keep_prev);
 }
 
+uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
+  uint32_t prev_iter;
+  uint32_t rank_id = tensor->GetDeviceId();
+  uint32_t root_graph_id = tensor->GetRootGraphId();
+  std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
+  if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
+    return UINT32_MAX;
+  }
+  auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
+                      tensor->GetIteration());
+  if (it == graphs_run_history_[rank_and_graph].end()) {
+    // The graph was not executed in that iteration
+    return UINT32_MAX;
+  } else if (it == graphs_run_history_[rank_and_graph].begin()) {
+    // current iteration is the first iteration that the graph was run
+    // no prev iter is available
+    MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
+                  << " is the first run iteration for tensor: " << tensor->GetName();
+    return UINT32_MAX;
+  }
+  it--;
+  prev_iter = *it;
+  tensor->SetPrevIteration(prev_iter);
+  return prev_iter;
+}
+
 void DebugServices::ResetLoadedTensors() {
   wp_id_cache_.clear();
   MS_LOG(INFO) << "Resetting loaded tensors";
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index 3a49da33c22..daa2e0955ea 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -292,6 +292,9 @@ class DebugServices {
                                  std::vector<unsigned int> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
                                  partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
                                  std::vector<unsigned int> *root_graph_id);
+#ifdef OFFLINE_DBG_MODE
+  void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
+#endif
 
   void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                              const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
@@ -350,7 +353,7 @@
                            std::vector<std::string> *const async_file_pool);
 
   const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
-                            uint32_t *prev_num_elements);
+                            uint32_t *prev_num_elements, bool *history_not_found);
 
   void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
                          std::size_t *const size, std::vector<int64_t> *const shape,
@@ -380,6 +383,18 @@
                               uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                               std::vector<std::shared_ptr<TensorData>> *const tensor_list);
 
+  void SetGraphsHistory();
+
+  std::vector<uint32_t> GetDumpRankIdList();
+
+  void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);
+
+  void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);
+
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();
+
+  void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);
+
   std::string GetStrippedFilename(const std::string &file_name);
 
   std::string IterationString(unsigned int iteration);
@@ -410,6 +425,8 @@
 
   bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
 
+  uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);
+
   void ResetLoadedTensors();
 #ifdef ONLINE_DBG_MODE
   std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
@@ -458,6 +475,8 @@
   std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
   std::string net_name_;
   std::string dump_dir_;
+  // store history of graphs that have been run (rank_id, graph_id)
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
 
   bool is_sync_mode_{false};
 
   std::shared_ptr<TensorLoader> tensor_loader_;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 4481632de76..7e5207ee25e 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -282,7 +282,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
   if (device_target_ != kGPUDevice) {
     return;
   }
-  E2eDump::UpdateIterGPUDump();
   // Store graphs that are run in one step.
graph_ptr_step_vec_ = graphs; for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { @@ -290,7 +289,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector &graphs if (debugger_) { debugger_->PreExecute(graph); } - DumpSetup(graph); } } @@ -390,6 +388,7 @@ uint32_t Debugger::GetRankID() { void Debugger::Dump(const KernelGraphPtr &kernel_graph) const { uint32_t rank_id = GetRankID(); + E2eDump::DumpRunIter(kernel_graph, rank_id); if (debugger_ && debugger_->DebuggerBackendEnabled()) { MS_EXCEPTION_IF_NULL(kernel_graph); (void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get()); @@ -458,6 +457,7 @@ void Debugger::PostExecuteGraphDebugger() { if (debugger_) { debugger_->PostExecute(); } + E2eDump::UpdateIterGPUDump(); } void Debugger::PostExecute() { diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc index 77407511944..76d3643fc1d 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc @@ -62,6 +62,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du auto cache_mem_ratio = 0.5; const uint64_t memlimit = max_mem_usage * kMegabytesToBytes * cache_mem_ratio; debug_services_->SetMemLimit(memlimit); + debug_services_->SetGraphsHistory(); return 0; } diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h index 6b9794d434b..e6117bde62b 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.h +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h @@ -88,6 +88,7 @@ class VarianceAndMeanCalculator { class ITensorSummary { public: enum WatchpointPos { eHitPos = 0, eErrorCodePos = 1, eParamListPos = 2 }; + enum ErrorCode { NAN_TENSOR = 0, INF_TENSOR = 2, NULL_PREV_TENSOR = 4, OUT_OF_MEMORY = 8, HISTORY_NOT_FOUND = 16 }; virtual ~ITensorSummary() = default; virtual void SummarizeTensor(const std::vector &) = 0; virtual std::tuple> IsWatchpointHit( diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h index e8cfe33503c..46e4a1190e7 100644 --- a/mindspore/ccsrc/debug/tensor_data.h +++ b/mindspore/ccsrc/debug/tensor_data.h @@ -229,6 +229,10 @@ class TensorData { void SetIteration(unsigned int iteration) { this->iteration_ = iteration; } + unsigned int GetPrevIteration() const { return this->prev_iteration_; } + + void SetPrevIteration(unsigned int prev_iteration) { this->prev_iteration_ = prev_iteration; } + unsigned int GetDeviceId() const { return this->device_id_; } void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; } @@ -430,6 +434,7 @@ class TensorData { std::string name_; uint64_t slot_; unsigned int iteration_{0}; + unsigned int prev_iteration_{0}; unsigned int device_id_{0}; unsigned int root_graph_id_{0}; bool is_output_{true}; diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 82816c8c0ed..7ce1e5bc2ed 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -90,7 +90,7 @@ class TensorLoader
       key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) +
                    ":" + std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot()));
       if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
-          tensor->GetIteration() == tensor_list_map_[key_name]->GetIteration() - 1) {
+          tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
         key_name += ":prev";
       }
       auto iter = tensor_list_map_.find(key_name);
diff --git a/mindspore/ccsrc/runtime/device/kernel_adjust.cc b/mindspore/ccsrc/runtime/device/kernel_adjust.cc
index e0440eb0a02..d672268c1ba 100644
--- a/mindspore/ccsrc/runtime/device/kernel_adjust.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_adjust.cc
@@ -33,6 +33,9 @@
 #include "runtime/base.h"
 #include "runtime/device/ascend/ascend_stream_manager.h"
 #include "utils/shape_utils.h"
+#ifndef ENABLE_SECURITY
+#include "debug/data_dump/dump_json_parser.h"
+#endif
 
 namespace {
 constexpr auto kGradients = "Gradients";
@@ -1049,7 +1052,12 @@ void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
-  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(kernel_graph_ptr->current_epoch()));
+#ifndef ENABLE_SECURITY
+  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName,
+                          SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter()));
+#else
+  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0);
+#endif
   kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1);
 }
diff --git a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
index e8b2b75b01f..6ec4f51c298 100644
--- a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
@@ -23,6 +23,7 @@
 #include "utils/log_adapter.h"
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/cpu_e2e_dump.h"
+#include "debug/data_dump/e2e_dump.h"
 #endif
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
@@ -52,6 +53,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
     auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
     MS_EXCEPTION_IF_NULL(kernel_graph);
     CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
+    CPUE2eDump::DumpRunIter(kernel_graph);
   }
 #endif
   } else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
diff --git a/tests/st/debugger/dump_test_utils.py b/tests/st/debugger/dump_test_utils.py
index 77a41113fde..e5e9828f6f5 100644
--- a/tests/st/debugger/dump_test_utils.py
+++ b/tests/st/debugger/dump_test_utils.py
@@ -18,11 +18,14 @@ Utils for testing offline debugger.
 import os
 import tempfile
+import bisect
+import csv
 import numpy as np
 
 
 def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
     """Build dump file structure from tensor_list."""
+    ranks_run_history = {}
     temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
     for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
         slot = str(tensor_info.slot)
@@ -30,6 +33,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
         rank_id = str(tensor_info.rank_id)
         root_graph_id = str(tensor_info.root_graph_id)
         is_output = str(tensor_info.is_output)
+        if rank_id not in ranks_run_history:
+            ranks_run_history[rank_id] = {}
+        # Always index through ranks_run_history so the right rank's history is updated.
+        graphs_run_history = ranks_run_history[rank_id]
+        if root_graph_id not in graphs_run_history:
+            graphs_run_history[root_graph_id] = [iteration]
+        elif iteration not in graphs_run_history[root_graph_id]:
+            bisect.insort(graphs_run_history[root_graph_id], iteration)
+
         path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
         os.makedirs(path, exist_ok=True)
         if is_output == "True":
@@ -40,4 +53,16 @@
                                 ".DefaultFormat.npy", dir=path)
         full_path = file[1]
         np.save(full_path, tensor)
+    build_global_execution_order(temp_dir, ranks_run_history)
     return temp_dir
+
+
+def build_global_execution_order(path, ranks_run_history):
+    for rank_id in ranks_run_history.keys():
+        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
+        os.makedirs(exec_order_path, exist_ok=True)
+        for graph in ranks_run_history[rank_id].keys():
+            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
+            with open(full_path, 'w+', newline='') as csv_file:
+                writer = csv.writer(csv_file)
+                # Wrap each iteration in its own row; writerows() on bare strings
+                # would split a multi-digit iteration into separate columns.
+                writer.writerows([[iteration] for iteration in ranks_run_history[rank_id][graph]])
diff --git a/tests/st/dump/dump_test_utils.py b/tests/st/dump/dump_test_utils.py
index 3b3a0d64078..550adad8828 100644
--- a/tests/st/dump/dump_test_utils.py
+++ b/tests/st/dump/dump_test_utils.py
@@ -69,6 +69,35 @@ async_dump_dict_2 = {
     }
 }
 
+e2e_dump_dict_2 = {
+    "common_dump_settings": {
+        "dump_mode": 0,
+        "path": "",
+        "net_name": "Net",
+        "iteration": "all",
+        "input_output": 0,
+        "kernels": ["Default/Conv-op12"],
+        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+        "op_debug_mode": 0
+    },
+    "e2e_dump_settings": {
+        "enable": True,
+        "trans_flag": False
+    }
+}
+
+async_dump_dict_3 = {
+    "common_dump_settings": {
+        "dump_mode": 0,
+        "path": "",
+        "net_name": "Net",
+        "iteration": "all",
+        "input_output": 2,
+        "kernels": ["Default/TensorAdd-op3"],
+        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+        "op_debug_mode": 0
+    }
+}
+
 
 def generate_dump_json(dump_path, json_file_name, test_key):
     """
@@ -83,6 +112,13 @@ def generate_dump_json(dump_path, json_file_name, test_key):
     elif test_key == "test_async_dump_net_multi_layer_mode1":
         data = async_dump_dict_2
         data["common_dump_settings"]["path"] = dump_path
+    elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
+        data = e2e_dump_dict_2
+        data["common_dump_settings"]["path"] = dump_path
+    elif test_key == "test_Ascend_async_multi_root_graph_dump":
+        data = async_dump_dict_3
+        data["common_dump_settings"]["path"] = dump_path
     else:
         raise ValueError(
             "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index 791078e8eac..2d676baa34f 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -283,7 +283,7 @@ def run_e2e_dump_execution_graph():
     add = Net()
     add(Tensor(x), Tensor(y))
     exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
-    assert len(os.listdir(exe_graph_path)) == 1
+    assert len(os.listdir(exe_graph_path)) == 2
     del os.environ['MINDSPORE_DUMP_CONFIG']
diff --git a/tests/st/dump/test_multi_root_graph_dump.py b/tests/st/dump/test_multi_root_graph_dump.py
new file mode 100644
index 00000000000..2f483c3b945
--- /dev/null
+++ b/tests/st/dump/test_multi_root_graph_dump.py
@@ -0,0 +1,158 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import os
+from os import path
+import tempfile
+import time
+import shutil
+import csv
+import numpy as np
+import pytest
+import mindspore.context as context
+from mindspore import Tensor
+from mindspore.ops import operations as P
+from mindspore.nn import Cell
+from dump_test_utils import generate_dump_json
+from tests.security_utils import security_off_wrap
+
+
+class AddNet(Cell):
+    def __init__(self):
+        super(AddNet, self).__init__()
+        self.add = P.TensorAdd()
+
+    def construct(self, input_x, input_y):
+        output_z = self.add(input_x, input_y)
+        return output_z
+
+
+class NewAddNet(Cell):
+    def __init__(self):
+        super(NewAddNet, self).__init__()
+        self.add = P.AddN()
+
+    def construct(self, x, y):
+        z = self.add([x, y, y])
+        return z
+
+
+def train_addnet(epoch):
+    net = AddNet()
+    net2 = NewAddNet()
+    output_list = []
+    input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
+    input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
+    for _ in range(epoch):
+        out_put = net(input_x, input_y)
+        out2 = net2(out_put, input_x)
+        output_list.append(out2.asnumpy())
+        input_x = input_x + input_y
+
+
+def run_multi_root_graph_dump(device, dump_mode, test_name):
+    """Run dump for a multi root graph script."""
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=device)
+
+    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
+        dump_path = os.path.join(tmp_dir, dump_mode)
+        dump_config_path = os.path.join(tmp_dir, dump_mode + ".json")
+        generate_dump_json(dump_path, dump_config_path, test_name)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net')
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        epoch = 3
+        train_addnet(epoch)
+        for _ in range(3):
+            if not os.path.exists(dump_file_path):
+                time.sleep(2)
+        # Multi root graph script: we have 2 graphs under the rank_0 dir.
+        # Each graph should have 3 iterations, since each graph was executed once per epoch.
+        # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.
+        assert len(os.listdir(dump_file_path)) == 2
+        dump_path_graph_0 = os.path.join(dump_file_path, '0')
+        dump_path_graph_1 = os.path.join(dump_file_path, '1')
+        assert sorted(os.listdir(dump_path_graph_0)) == ['0', '2', '4']
+        assert sorted(os.listdir(dump_path_graph_1)) == ['1', '3', '5']
+        execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
+        # Four files in the execution_order dir:
+        # two files per graph (ms_execution_order and ms_global_execution_order).
+        assert len(os.listdir(execution_order_path)) == 4
+        global_exec_order_graph_0 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_0.csv')
+        assert path.exists(global_exec_order_graph_0)
+        with open(global_exec_order_graph_0) as csvfile:
+            history_graph_0 = csv.reader(csvfile)
+            iter_list_graph_0 = list(history_graph_0)
+        assert iter_list_graph_0 == [['0'], ['2'], ['4']]
+        global_exec_order_graph_1 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_1.csv')
+        assert path.exists(global_exec_order_graph_1)
+        with open(global_exec_order_graph_1) as csvfile:
+            history_graph_1 = csv.reader(csvfile)
+            iter_list_graph_1 = list(history_graph_1)
+        assert iter_list_graph_1 == [['1'], ['3'], ['5']]
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_GPU_e2e_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph e2e dump for GPU.
+    Description:
+        Test multi root graph e2e dump on GPU.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+    run_multi_root_graph_dump("GPU", "e2e_dump", "test_GPU_e2e_multi_root_graph_dump")
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_Ascend_e2e_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph e2e dump for Ascend.
+    Description:
+        Test multi root graph e2e dump on Ascend.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+
+    run_multi_root_graph_dump("Ascend", "e2e_dump", "test_Ascend_e2e_multi_root_graph_dump")
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_Ascend_async_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph async dump for Ascend.
+    Description:
+        Test multi root graph async dump on Ascend.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+
+    run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump")
diff --git a/tests/ut/python/debugger/gpu_tests/dump_test_utils.py b/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
index a6fb6c1582f..d27b8929a04 100644
--- a/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
+++ b/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
@@ -18,11 +18,14 @@ Utils for testing offline debugger.
 import os
 import tempfile
+import bisect
+import csv
 import numpy as np
 
 
 def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
     """Build dump file structure from tensor_list."""
+    ranks_run_history = {}
     temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
     for x, _ in enumerate(tensor_info_list):
         slot = str(tensor_info_list[x].slot)
@@ -30,6 +33,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
         rank_id = str(tensor_info_list[x].rank_id)
         root_graph_id = str(tensor_info_list[x].root_graph_id)
         is_output = str(tensor_info_list[x].is_output)
+        if rank_id not in ranks_run_history:
+            ranks_run_history[rank_id] = {}
+        # Always index through ranks_run_history so the right rank's history is updated.
+        graphs_run_history = ranks_run_history[rank_id]
+        if root_graph_id not in graphs_run_history:
+            graphs_run_history[root_graph_id] = [iteration]
+        elif iteration not in graphs_run_history[root_graph_id]:
+            bisect.insort(graphs_run_history[root_graph_id], iteration)
+
         path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
         os.makedirs(path, exist_ok=True)
         if is_output == "True":
@@ -40,4 +53,16 @@
                                 ".DefaultFormat.npy", dir=path)
         full_path = file[1]
         np.save(full_path, tensor_list[x])
+    build_global_execution_order(temp_dir, ranks_run_history)
     return temp_dir
+
+
+def build_global_execution_order(path, ranks_run_history):
+    for rank_id in ranks_run_history.keys():
+        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
+        os.makedirs(exec_order_path, exist_ok=True)
+        for graph in ranks_run_history[rank_id].keys():
+            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
+            with open(full_path, 'w+', newline='') as csv_file:
+                writer = csv.writer(csv_file)
+                # Wrap each iteration in its own row; writerows() on bare strings
+                # would split a multi-digit iteration into separate columns.
+                writer.writerows([[iteration] for iteration in ranks_run_history[rank_id][graph]])
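
Note for readers of this patch: the snippet below is a minimal Python sketch, not part of the commit, illustrating the file format the patch introduces. E2eDump::DumpRunIter (and CPUE2eDump::DumpRunIter) append one global dump iteration per row to rank_<rank_id>/execution_order/ms_global_execution_order_graph_<graph_id>.csv, and DebugServices::GetPrevIteration finds a tensor's previous iteration by taking the row just before the current one in that history. The helper names (read_graph_run_history, get_prev_iteration) are illustrative only and do not exist in the codebase.

import csv
import os


def read_graph_run_history(dump_dir, rank_id, graph_id):
    """Return the list of global dump iterations in which the graph ran, or None.

    None mirrors the C++ side's HISTORY_NOT_FOUND case, where prev-tensor
    comparisons are skipped because no history file exists.
    """
    history_file = os.path.join(dump_dir, "rank_" + str(rank_id), "execution_order",
                                "ms_global_execution_order_graph_" + str(graph_id) + ".csv")
    if not os.path.isfile(history_file):
        return None
    with open(history_file, newline='') as f:
        return [int(row[0]) for row in csv.reader(f) if row]


def get_prev_iteration(dump_dir, rank_id, graph_id, iteration):
    """Python analogue of DebugServices::GetPrevIteration.

    Returns the iteration in which the graph ran immediately before
    `iteration`, or None when the history is missing, the graph did not
    run in `iteration`, or `iteration` was the graph's first run
    (the C++ code returns UINT32_MAX in those cases).
    """
    history = read_graph_run_history(dump_dir, rank_id, graph_id)
    if not history or iteration not in history:
        return None
    idx = history.index(iteration)
    return history[idx - 1] if idx > 0 else None

For the multi-root-graph test above, graph 1 runs in iterations 1, 3 and 5, so get_prev_iteration(dump_path, 0, 1, 3) would return 1, while get_prev_iteration(dump_path, 0, 1, 1) would return None because iteration 1 is that graph's first run.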