Improve performance, keep tensor state consistent, fix recheck, and check weights at step end

This commit is contained in:
Harshvardhan Gupta 2020-12-08 12:00:31 -05:00
parent 5a35e9c56e
commit dd0084c52b
12 changed files with 158 additions and 116 deletions

View File

@ -1003,18 +1003,9 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
#ifdef ENABLE_DEBUGGER
if (debugger_->DebuggerBackendEnabled()) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
// TensorData will be freed up here
debugger_->EmptyTensor();
uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
debugger_->SetTensorLoaderIterNum(++iter_num);
(void)runtime_instance->LoadData(kernel_graph.get());
debugger_->EmptyPrevTensor();
}
#endif
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->LoadData(kernel_graph.get());
MS_LOG(INFO) << "Finish!";
}

View File

@ -360,7 +360,9 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
SyncValueNodeDeviceAddr(kernel_graph);
// Load input data from user input
LoadInputData(kernel_graph, inputs);
PreIterationDbg(kernel_graph);
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
}
#if ENABLE_CPU && ENABLE_GPU
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);
@ -372,7 +374,6 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
for (int64_t i = 0; i < loopsize; i++) {
Execute(kernel_graph);
}
PostLoadTensor(kernel_graph);
// In pynative mode, device addresses of tensors in value nodes need be clean.
CleanValueNodeDeviceAddr(kernel_graph);
// Summary
@ -443,13 +444,6 @@ bool GPUSession::DumpDataEnabledIteration() const {
return runtime_instance->DumpDataEnabledIteration();
}
void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
}
PreLoadTensor(kernel_graph);
}
void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
// debug used for dump
@ -463,30 +457,6 @@ void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_gra
}
}
void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
debugger_->EmptyTensor();
uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
debugger_->SetTensorLoaderIterNum(++iter_num);
}
void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
debugger_->EmptyPrevTensor();
}
void GPUSession::SyncValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);

View File

@ -75,14 +75,8 @@ class GPUSession : public SessionBasic {
bool DumpDataEnabledIteration() const;
void PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void SyncValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void CleanValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;

View File

@ -66,7 +66,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
std::vector<std::vector<parameter_t>> *parameters,
std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list,
const bool init_dbg_suspend) {
const bool init_dbg_suspend, const bool step_end, const bool recheck) {
std::lock_guard<std::mutex> lg(lock_);
if (watchpoint_table.empty()) return;
@ -75,13 +75,26 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
const auto tensor_slot = std::to_string(tensor->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
// no elements to analyze
if (tensor_ptr->DataSize() == 0) continue;
int tensor_dtype = tensor_ptr->data_type_c();
std::vector<watchpoint_t> watchpoints_to_check;
std::string qualified_tensor_name;
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
if (wp.condition.type == INIT && !init_dbg_suspend) continue;
// check ONLY init conditions on initial suspended state.
// skip other conditions on initial suspended state
// skip init condition on all the other states
if ((wp.condition.type == INIT) ^ init_dbg_suspend) continue;
if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
// check change conditions only on step end.
if (wp.change_condition() && !step_end) continue;
// if recheck, ignore the cache results and reanalyze everything.
// if not a recheck, check only unanalyzed tensors
if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
if (!found.empty()) {
qualified_tensor_name = found;
@ -174,6 +187,10 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
error_code = std::get<1>(item);
parameter_list = std::get<2>(item);
}
// add analyzed tensor to cache
if (!recheck) {
wp_id_cache[tensor_name].insert(wp.id);
}
if (is_hit || error_code) {
name->push_back(qualified_tensor_name);
@ -238,28 +255,6 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
}
}
void DebugServices::AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list,
const CNodePtr &kernel) {
if (kernel) {
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = kernel->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
auto found_dot = input_kernel_name.find_last_of('.');
if (found_dot != std::string::npos &&
(input_kernel_name.substr(found_dot + 1) == "weight" || input_kernel_name.substr(found_dot + 1) == "bias")) {
std::string locate_tensor = input_kernel_name + ":0";
std::map<std::string, std::shared_ptr<TensorData>> tensor_map = tensor_loader_->GetTensorMap();
std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
iter = tensor_map.find(locate_tensor);
if (iter != tensor_map.end()) {
tensor_list->push_back(iter->second);
}
}
}
}
}
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
@ -292,4 +287,32 @@ std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::Get
return watchpoint_table;
}
void DebugServices::ResetLoadedTensors() {
wp_id_cache.clear();
MS_LOG(INFO) << "Resetting loaded tensors";
tensor_loader_->MoveParametersCurrentToPrev();
tensor_loader_->EmptyCurrentTensor();
// will move parameters from previous to current map
tensor_loader_->SwapCurrentPrev();
}
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
MS_EXCEPTION_IF_NULL(kernel);
std::vector<std::shared_ptr<TensorData>> result;
auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
auto kernel_name = kernel->fullname_with_scope();
for (size_t j = 0; j < output_size; ++j) {
auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
if (tensor) result.push_back(tensor);
}
return result;
}
bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
} // namespace mindspore

View File

@ -22,6 +22,7 @@
#include <memory>
#include <tuple>
#include <unordered_map>
#include <set>
#include <mutex>
#include <map>
#include <limits>
@ -160,6 +161,10 @@ class DebugServices {
bool range_enabled() const {
return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
}
bool change_condition() const {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED;
}
} watchpoint_t;
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
@ -171,7 +176,8 @@ class DebugServices {
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend,
const bool step_end, const bool recheck);
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
@ -181,8 +187,6 @@ class DebugServices {
bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
void AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list, const CNodePtr &kernel);
void EmptyTensor();
std::vector<std::shared_ptr<TensorData>> GetTensor() const;
@ -205,9 +209,19 @@ class DebugServices {
std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
void ResetLoadedTensors();
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
bool TensorExistsInCurrent(std::string tensor_name);
void MoveTensorCurrentToPrev(std::string tensor_name);
private:
std::mutex lock_;
// to keep track of watchpoints that have been checked already for a tensor in current step
std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
TensorLoader *tensor_loader_;

View File

@ -313,20 +313,16 @@ void Debugger::PostExecute() {
}
if (debugger_->DebuggerBackendEnabled()) {
// analyze tensor data and send the watchpoints been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpoints(CheckWatchpoints());
CommandLoop();
} else {
CommandLoop();
}
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpoints(CheckWatchpoints());
CommandLoop();
}
// Only keep parameters in the current map
debug_services_->ResetLoadedTensors();
}
}
@ -596,7 +592,7 @@ void Debugger::CommandLoop() {
MS_LOG(INFO) << "RunCMD";
if (GetRunLevel(reply) == "recheck") {
MS_LOG(INFO) << "rechecking all watchpoints";
SendWatchpoints(CheckWatchpoints());
SendWatchpoints(CheckWatchpoints("", nullptr, true));
} else {
// no longer the initial suspension.
initial_suspend_ = false;
@ -705,9 +701,6 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
});
debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
if (initial_suspend_ &&
static_cast<DebugServices::CONDITION_TYPE>(condition.condition()) == DebugServices::CONDITION_TYPE::INIT)
SendWatchpoints(CheckWatchpoints());
}
void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
@ -780,7 +773,8 @@ void Debugger::Exit() {
}
}
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) {
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
bool recheck) {
std::vector<std::string> name;
std::vector<std::string> slot;
std::vector<int> condition;
@ -795,11 +789,10 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
if (watchnode.empty()) {
tensor_list = debug_services_->GetTensor();
} else {
tensor_list = debug_services_->GetNodeTensorMap(watchnode);
debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
tensor_list = debug_services_->GetNodeTensor(kernel);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
tensor_list, initial_suspend_);
tensor_list, initial_suspend_, watchnode.empty(), recheck);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;
@ -1045,7 +1038,7 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
}
closedir(d);
if (op_names.size()) {
if (!op_names.empty()) {
MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
}
@ -1091,12 +1084,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
return;
}
bool keep_prev;
if (anf_node->isa<Parameter>()) {
keep_prev = true;
} else {
keep_prev = false;
}
// for parameters and value nodes, set its execution order to be 0;
int exec_order = 0;
std::string node_name = anf_node->fullname_with_scope();
@ -1114,6 +1101,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, output_index);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
bool keep_prev;
if (anf_node->isa<Parameter>()) {
keep_prev = true;
debug_services_->MoveTensorCurrentToPrev(tensor_name);
} else {
keep_prev = false;
}
bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
@ -1123,9 +1117,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
void Debugger::LoadParametersAndConst() {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
(device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
return;
MS_EXCEPTION_IF_NULL(graph_ptr_);
// load parameters
MS_LOG(INFO) << "Start to load Parameters!";
@ -1199,5 +1190,8 @@ void Debugger::ClearCurrentData() {
if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
debug_services_->EmptyCurrentTensor();
}
bool Debugger::TensorExistsInCurrent(std::string tensor_name) {
return debug_services_->TensorExistsInCurrent(tensor_name);
}
} // namespace mindspore

View File

@ -145,6 +145,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::list<KernelGraphPtr> GetGraphPtrList() { return graph_ptr_list_; }
bool TensorExistsInCurrent(std::string tensor_name);
private:
// private constructor for singleton
Debugger();
@ -197,7 +199,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string(),
const CNodePtr &kernel = NULL);
const CNodePtr &kernel = nullptr, bool recheck = false);
// send watchpoints that hit
void SendWatchpoints(const std::list<WatchpointHit> &points);

View File

@ -33,6 +33,44 @@ class TensorLoader {
~TensorLoader() { EmptyTensor(); }
void MoveTensorCurrentToPrev(std::string tensor_name) {
auto handle = tensor_list_map.extract(tensor_name);
if (!handle.empty()) {
MS_LOG(INFO) << "Moving " << tensor_name << " from current map to previous map";
prev_tensor_list_map.insert(std::move(handle));
}
}
void SwapCurrentPrev() { tensor_list_map.swap(prev_tensor_list_map); }
bool TensorExistsInCurrent(std::string tensor_name) {
return tensor_list_map.find(tensor_name) != tensor_list_map.end();
}
// only parameters will return true
bool PrevTensorExistsInCurrent(std::string tensor_name) { return TensorExistsInCurrent(tensor_name + ":prev"); }
void MoveParametersCurrentToPrev() {
MS_LOG(INFO) << "Moving parameters from current map to previous map";
auto iter = tensor_list_map.begin();
while (iter != tensor_list_map.end()) {
auto key = iter->first;
if (PrevTensorExistsInCurrent(key)) {
// :prev tensor only exists for parameter. Move it to prev
++iter;
MoveTensorCurrentToPrev(key);
} else {
++iter;
}
}
}
bool IsPrevTensor(std::string tensor_name) {
const std::string suffix = ":prev";
if (tensor_name.length() <= suffix.length()) return false;
return std::equal(suffix.rbegin(), suffix.rend(), tensor_name.rbegin());
}
bool LoadNewTensor(std::shared_ptr<TensorData> tensor, bool keep_prev) {
std::lock_guard<std::mutex> lg(lock_);
if (keep_prev) {
@ -43,20 +81,32 @@ class TensorLoader {
tensor_list_map.insert(std::move(handle));
}
}
tensor_list.push_back(tensor);
tensor_list_map[tensor->GetName()] = tensor; // use [] instead of insert to ensure latest value
auto node_name = tensor->GetName();
node_name = node_name.substr(0, node_name.find_first_of(":"));
node_tensor_map.insert({node_name, tensor});
return true;
}
std::vector<std::shared_ptr<TensorData>> GetTensor() { return tensor_list; }
std::vector<std::shared_ptr<TensorData>> GetTensor() {
std::vector<std::shared_ptr<TensorData>> tensor_list;
for (auto &it : tensor_list_map) {
if (!IsPrevTensor(it.first)) tensor_list.push_back(it.second);
}
return tensor_list;
}
std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) {
auto iter = tensor_list_map.find(tensor_name);
if (iter != tensor_list_map.end()) return iter->second;
return nullptr;
}
uint32_t GetIterNum() { return iter_num; }
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
return tensor_list_map[tensor_name + ":prev"];
}
@ -91,14 +141,13 @@ class TensorLoader {
prev_tensor_list_map.clear();
node_tensor_map.clear();
tensor_list_map.swap(prev_tensor_list_map);
tensor_list.clear();
}
void EmptyPrevTensor() { prev_tensor_list_map.clear(); }
void EmptyCurrentTensor() {
tensor_list_map.clear();
tensor_list.clear();
node_tensor_map.clear();
}
void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
@ -142,7 +191,6 @@ class TensorLoader {
}
private:
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map;
std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map;

View File

@ -674,6 +674,10 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
size_t slot, bool keep_prev) const {
bool ret = false;
if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
MS_LOG(INFO) << tensor_name << " already loaded for this step so not loading it again.";
return true;
}
// TensorData is freed up in AscendSession class
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);

View File

@ -296,8 +296,6 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DEBUGGER
MS_LOG(INFO) << "Start load step";
uint32_t cur_iter = 0;
MS_LOG(INFO) << "Cur iter is " << cur_iter;
for (auto graph_ptr : debugger_->GetGraphPtrList()) {
debugger_->SetGraphPtr(graph_ptr);
// load output

View File

@ -87,6 +87,11 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
return true;
}
if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
MS_LOG(INFO) << tensor_name << " already loaded for this step so not loading it again.";
return true;
}
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());

View File

@ -154,8 +154,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
std::vector<int> real_outputs;
real_outputs = CheckRealOutput(node_name, output_size);
for (std::vector<int>::iterator it = real_outputs.begin(); it != real_outputs.end(); ++it) {
auto j = *it;
for (int j : real_outputs) {
auto addr = kernel_outputs[j];
auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
auto format = kOpFormat_DEFAULT;