diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index b680b6b782b..02673880397 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -1003,18 +1003,9 @@ void AscendSession::DumpAllGraphs(const std::vector &all_graphs)
 void AscendSession::LoadTensor(const std::shared_ptr &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-#ifdef ENABLE_DEBUGGER
-  if (debugger_->DebuggerBackendEnabled()) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    // TensorData will be freed up here
-    debugger_->EmptyTensor();
-    uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
-    debugger_->SetTensorLoaderIterNum(++iter_num);
-    (void)runtime_instance->LoadData(kernel_graph.get());
-    debugger_->EmptyPrevTensor();
-  }
-#endif
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  (void)runtime_instance->LoadData(kernel_graph.get());
   MS_LOG(INFO) << "Finish!";
 }
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 994c5c5f6d5..8e20e858376 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -360,7 +360,9 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector
-  PreIterationDbg(kernel_graph);
+  if (debugger_) {
+    debugger_->PreExecute(kernel_graph, graph_sum_);
+  }
 #if ENABLE_CPU && ENABLE_GPU
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
@@ -372,7 +374,6 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector
 DumpDataEnabledIteration();
 }

-void GPUSession::PreIterationDbg(const std::shared_ptr &kernel_graph) const {
-  if (debugger_) {
-    debugger_->PreExecute(kernel_graph, graph_sum_);
-  }
-  PreLoadTensor(kernel_graph);
-}
-
 void GPUSession::PostIterationDbg(const std::shared_ptr &kernel_graph) const {
   bool dump_enabled = DumpDataEnabledIteration();
   // debug used for dump
@@ -463,30 +457,6 @@ void GPUSession::PostIterationDbg(const std::shared_ptr &kernel_gra
   }
 }

-void GPUSession::PreLoadTensor(const std::shared_ptr &kernel_graph) const {
-  bool dump_enabled = DumpDataEnabledIteration();
-  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
-    return;
-  }
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  debugger_->EmptyTensor();
-  uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
-  debugger_->SetTensorLoaderIterNum(++iter_num);
-}
-
-void GPUSession::PostLoadTensor(const std::shared_ptr &kernel_graph) const {
-  bool dump_enabled = DumpDataEnabledIteration();
-  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
-    return;
-  }
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  debugger_->EmptyPrevTensor();
-}
-
 void GPUSession::SyncValueNodeDeviceAddr(const std::shared_ptr &kernel_graph) const {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h
index 024a9b4fb7d..3ff72d58523 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.h
+++ b/mindspore/ccsrc/backend/session/gpu_session.h
@@ -75,14 +75,8 @@ class GPUSession : public SessionBasic {
   bool DumpDataEnabledIteration() const;

-  void PreIterationDbg(const std::shared_ptr &kernel_graph) const;
-
   void PostIterationDbg(const std::shared_ptr &kernel_graph) const;

-  void PreLoadTensor(const std::shared_ptr &kernel_graph) const;
-
-  void PostLoadTensor(const std::shared_ptr &kernel_graph) const;
-
   void SyncValueNodeDeviceAddr(const std::shared_ptr &kernel_graph) const;

   void CleanValueNodeDeviceAddr(const std::shared_ptr &kernel_graph) const;
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index 530afe20a13..2505b48552c 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -66,7 +66,7 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector
                                      std::vector> *parameters, std::vector *error_codes,
                                      const std::vector &op_overflows,
                                      const std::vector> &tensor_list,
-                                     const bool init_dbg_suspend) {
+                                     const bool init_dbg_suspend, const bool step_end, const bool recheck) {
   std::lock_guard lg(lock_);
   if (watchpoint_table.empty()) return;
@@ -75,13 +75,26 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector
     const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
     const auto tensor_slot = std::to_string(tensor->GetSlot());
     mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
+    // no elements to analyze
+    if (tensor_ptr->DataSize() == 0) continue;
     int tensor_dtype = tensor_ptr->data_type_c();
     std::vector watchpoints_to_check;
     std::string qualified_tensor_name;
     for (auto w_table_item : watchpoint_table) {
       auto wp = std::get<1>(w_table_item);
-      if (wp.condition.type == INIT && !init_dbg_suspend) continue;
+      // check ONLY init conditions on initial suspended state.
+      // skip other conditions on initial suspended state
+      // skip init condition on all the other states
+      if ((wp.condition.type == INIT) ^ init_dbg_suspend) continue;
+      if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
+
+      // check change conditions only on step end.
+      if (wp.change_condition() && !step_end) continue;
+
+      // if recheck, ignore the cache results and reanalyze everything.
+      // if not a recheck, check only unanalyzed tensors
+      if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
       std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
       if (!found.empty()) {
         qualified_tensor_name = found;
@@ -174,6 +187,10 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector
         error_code = std::get<1>(item);
         parameter_list = std::get<2>(item);
       }
+      // add analyzed tensor to cache
+      if (!recheck) {
+        wp_id_cache[tensor_name].insert(wp.id);
+      }

       if (is_hit || error_code) {
         name->push_back(qualified_tensor_name);
@@ -238,28 +255,6 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
   }
 }

-void DebugServices::AddWeightsBiasInputs(std::vector> *tensor_list,
-                                         const CNodePtr &kernel) {
-  if (kernel) {
-    auto input_size = AnfAlgo::GetInputTensorNum(kernel);
-    for (size_t j = 0; j < input_size; ++j) {
-      auto input_kernel = kernel->input(j + 1);
-      std::string input_kernel_name = input_kernel->fullname_with_scope();
-      auto found_dot = input_kernel_name.find_last_of('.');
-      if (found_dot != std::string::npos &&
-          (input_kernel_name.substr(found_dot + 1) == "weight" || input_kernel_name.substr(found_dot + 1) == "bias")) {
-        std::string locate_tensor = input_kernel_name + ":0";
-        std::map> tensor_map = tensor_loader_->GetTensorMap();
-        std::map>::iterator iter;
-        iter = tensor_map.find(locate_tensor);
-        if (iter != tensor_map.end()) {
-          tensor_list->push_back(iter->second);
-        }
-      }
-    }
-  }
-}
-
 void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }

 std::vector> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
@@ -292,4 +287,32 @@ std::unordered_map DebugServices::Get
   return watchpoint_table;
 }

+void DebugServices::ResetLoadedTensors() {
+  wp_id_cache.clear();
+  MS_LOG(INFO) << "Resetting loaded tensors";
+  tensor_loader_->MoveParametersCurrentToPrev();
+  tensor_loader_->EmptyCurrentTensor();
+  // will move parameters from previous to current map
+  tensor_loader_->SwapCurrentPrev();
+}
+
+std::vector> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  std::vector> result;
+  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
+  auto kernel_name = kernel->fullname_with_scope();
+  for (size_t j = 0; j < output_size; ++j) {
+    auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
+    auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
+    if (tensor) result.push_back(tensor);
+  }
+  return result;
+}
+bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
+  return tensor_loader_->TensorExistsInCurrent(tensor_name);
+}
+void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
+  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
+}
+
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index 67bb38a5df7..890a3569685 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -160,6 +161,10 @@ class DebugServices {
     bool range_enabled() const {
       return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
     }
+
+    bool change_condition() const {
+      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED;
+    }
   } watchpoint_t;

   void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
@@ -171,7 +176,8 @@ class DebugServices {
   void CheckWatchpoints(std::vector *name, std::vector *slot, std::vector *condition,
                         std::vector *watchpoint_id, std::vector> *parameters,
                         std::vector *error_code, const std::vector &op_overflows,
-                        const std::vector> &tensor_list, bool init_dbg_suspend);
+                        const std::vector> &tensor_list, bool init_dbg_suspend,
+                        const bool step_end, const bool recheck);

   void ReadNodesTensors(std::vector name, std::vector *ret_name, std::vector *data_ptr,
                         std::vector *data_size,
@@ -181,8 +187,6 @@ class DebugServices {

   bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;

-  void AddWeightsBiasInputs(std::vector> *tensor_list, const CNodePtr &kernel);
-
   void EmptyTensor();

   std::vector> GetTensor() const;
@@ -205,9 +209,19 @@ class DebugServices {

   std::unordered_map GetWatchpointTable();

+  void ResetLoadedTensors();
+
+  std::vector> GetNodeTensor(const CNodePtr &kernel);
+
+  bool TensorExistsInCurrent(std::string tensor_name);
+
+  void MoveTensorCurrentToPrev(std::string tensor_name);
+
 private:
   std::mutex lock_;

+  // to keep track of watchpoints that have been checked already for a tensor in current step
+  std::unordered_map> wp_id_cache;
   std::unordered_map watchpoint_table;

   TensorLoader *tensor_loader_;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index af2c9064d37..e23fce7ebe6 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -313,20 +313,16 @@ void Debugger::PostExecute() {
   }
   if (debugger_->DebuggerBackendEnabled()) {
     // analyze tensor data and send the watchpoints been hit
-    if (run_level_ == "node") {
-      MS_LOG(INFO) << "Debugger is in node level mode ";
-      return;
-    }
     if (debugger_enabled_ && !is_dataset_graph_) {
       if (device_target_ != kGPUDevice) {
         num_step_++;
-        MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
-        SendWatchpoints(CheckWatchpoints());
-        CommandLoop();
-      } else {
-        CommandLoop();
       }
+      MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
+      SendWatchpoints(CheckWatchpoints());
+      CommandLoop();
     }
+    // Only keep parameters in the current map
+    debug_services_->ResetLoadedTensors();
   }
 }
@@ -596,7 +592,7 @@ void Debugger::CommandLoop() {
         MS_LOG(INFO) << "RunCMD";
         if (GetRunLevel(reply) == "recheck") {
           MS_LOG(INFO) << "rechecking all watchpoints";
-          SendWatchpoints(CheckWatchpoints());
+          SendWatchpoints(CheckWatchpoints("", nullptr, true));
         } else {
          // no longer the initial suspension.
          initial_suspend_ = false;
@@ -705,9 +701,6 @@ void Debugger::SetWatchpoint(const ProtoVector &nodes, const WatchCon
     return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
   });
   debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
-  if (initial_suspend_ &&
-      static_cast(condition.condition()) == DebugServices::CONDITION_TYPE::INIT)
-    SendWatchpoints(CheckWatchpoints());
 }

 void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
@@ -780,7 +773,8 @@ void Debugger::Exit() {
   }
 }

-std::list Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) {
+std::list Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
+                                     bool recheck) {
   std::vector name;
   std::vector slot;
   std::vector condition;
@@ -795,11 +789,10 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode
   if (watchnode.empty()) {
     tensor_list = debug_services_->GetTensor();
   } else {
-    tensor_list = debug_services_->GetNodeTensorMap(watchnode);
-    debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
+    tensor_list = debug_services_->GetNodeTensor(kernel);
   }
   debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
-                                    tensor_list, initial_suspend_);
+                                    tensor_list, initial_suspend_, watchnode.empty(), recheck);
   std::list hits;
   for (unsigned int i = 0; i < name.size(); i++) {
     WatchpointHit hit;
@@ -1045,7 +1038,7 @@ std::vector Debugger::CheckOpOverflow() {
   }
   closedir(d);

-  if (op_names.size()) {
+  if (!op_names.empty()) {
     MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
   }
@@ -1091,12 +1084,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
   if (!anf_node->isa() && !anf_node->isa()) {
     return;
   }
-  bool keep_prev;
-  if (anf_node->isa()) {
-    keep_prev = true;
-  } else {
-    keep_prev = false;
-  }
   // for parameters and value nodes, set its execution order to be 0;
   int exec_order = 0;
   std::string node_name = anf_node->fullname_with_scope();
@@ -1114,6 +1101,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
     auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, output_index);
     (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                          [](size_t inner_item) { return SizeToInt(inner_item); });
+    bool keep_prev;
+    if (anf_node->isa()) {
+      keep_prev = true;
+      debug_services_->MoveTensorCurrentToPrev(tensor_name);
+    } else {
+      keep_prev = false;
+    }
     bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev);
     if (!ret) {
       MS_LOG(ERROR) << "LoadMemToHost:"
@@ -1123,9 +1117,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
 void Debugger::LoadParametersAndConst() {
   if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
-  if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
-        (device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
-    return;
   MS_EXCEPTION_IF_NULL(graph_ptr_);
   // load parameters
   MS_LOG(INFO) << "Start to load Parameters!";
@@ -1199,5 +1190,8 @@ void Debugger::ClearCurrentData() {
   if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
     debug_services_->EmptyCurrentTensor();
 }
+bool Debugger::TensorExistsInCurrent(std::string tensor_name) {
+  return debug_services_->TensorExistsInCurrent(tensor_name);
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index 5e79d18b8e1..ee9196e5df0 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -145,6 +145,8 @@ class Debugger : public std::enable_shared_from_this {
   std::list GetGraphPtrList() { return graph_ptr_list_; }

+  bool TensorExistsInCurrent(std::string tensor_name);
+
 private:
   // private constructor for singleton
   Debugger();
@@ -197,7 +199,7 @@ class Debugger : public std::enable_shared_from_this {
   // analyze tensors and check watchpoint conditions
   // return names of tensors and what condition they hit
   std::list CheckWatchpoints(const std::string &watchnode = std::string(),
-                             const CNodePtr &kernel = NULL);
+                             const CNodePtr &kernel = nullptr, bool recheck = false);

   // send watchpoints that hit
   void SendWatchpoints(const std::list &points);
diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h
index 16a58976eb6..af69519b80b 100644
--- a/mindspore/ccsrc/debug/tensor_load.h
+++ b/mindspore/ccsrc/debug/tensor_load.h
@@ -33,6 +33,44 @@ class TensorLoader {

   ~TensorLoader() { EmptyTensor(); }

+  void MoveTensorCurrentToPrev(std::string tensor_name) {
+    auto handle = tensor_list_map.extract(tensor_name);
+    if (!handle.empty()) {
+      MS_LOG(INFO) << "Moving " << tensor_name << " from current map to previous map";
+      prev_tensor_list_map.insert(std::move(handle));
+    }
+  }
+
+  void SwapCurrentPrev() { tensor_list_map.swap(prev_tensor_list_map); }
+
+  bool TensorExistsInCurrent(std::string tensor_name) {
+    return tensor_list_map.find(tensor_name) != tensor_list_map.end();
+  }
+
+  // only parameters will return true
+  bool PrevTensorExistsInCurrent(std::string tensor_name) { return TensorExistsInCurrent(tensor_name + ":prev"); }
+
+  void MoveParametersCurrentToPrev() {
+    MS_LOG(INFO) << "Moving parameters from current map to previous map";
+    auto iter = tensor_list_map.begin();
+    while (iter != tensor_list_map.end()) {
+      auto key = iter->first;
+      if (PrevTensorExistsInCurrent(key)) {
+        // :prev tensor only exists for parameter. Move it to prev
+        ++iter;
+        MoveTensorCurrentToPrev(key);
+      } else {
+        ++iter;
+      }
+    }
+  }
+
+  bool IsPrevTensor(std::string tensor_name) {
+    const std::string suffix = ":prev";
+    if (tensor_name.length() <= suffix.length()) return false;
+    return std::equal(suffix.rbegin(), suffix.rend(), tensor_name.rbegin());
+  }
+
   bool LoadNewTensor(std::shared_ptr tensor, bool keep_prev) {
     std::lock_guard lg(lock_);
     if (keep_prev) {
@@ -43,20 +81,32 @@ class TensorLoader {
         tensor_list_map.insert(std::move(handle));
       }
     }
-    tensor_list.push_back(tensor);
     tensor_list_map[tensor->GetName()] = tensor;  // use [] instead of insert to ensure latest value
     auto node_name = tensor->GetName();
     node_name = node_name.substr(0, node_name.find_first_of(":"));
     node_tensor_map.insert({node_name, tensor});
     return true;
   }
-  std::vector> GetTensor() { return tensor_list; }
+
+  std::vector> GetTensor() {
+    std::vector> tensor_list;
+    for (auto &it : tensor_list_map) {
+      if (!IsPrevTensor(it.first)) tensor_list.push_back(it.second);
+    }
+    return tensor_list;
+  }
+
+  std::shared_ptr GetTensor(const std::string &tensor_name) {
+    auto iter = tensor_list_map.find(tensor_name);
+    if (iter != tensor_list_map.end()) return iter->second;
+    return nullptr;
+  }

   uint32_t GetIterNum() { return iter_num; }

   std::map> GetTensorMap() { return tensor_list_map; }

-  std::shared_ptr GetPrevTensor(std::string tensor_name) {
+  std::shared_ptr GetPrevTensor(const std::string &tensor_name) {
     if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
       return tensor_list_map[tensor_name + ":prev"];
     }
@@ -91,14 +141,13 @@ class TensorLoader {
     prev_tensor_list_map.clear();
     node_tensor_map.clear();
     tensor_list_map.swap(prev_tensor_list_map);
-    tensor_list.clear();
   }

   void EmptyPrevTensor() { prev_tensor_list_map.clear(); }

   void EmptyCurrentTensor() {
     tensor_list_map.clear();
-    tensor_list.clear();
+    node_tensor_map.clear();
   }

   void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
@@ -142,7 +191,6 @@ class TensorLoader {
   }

 private:
-  std::vector> tensor_list;
   std::map> tensor_list_map;
   std::multimap> node_tensor_map;
   std::map> prev_tensor_list_map;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index 2525d78aabe..336fa3395e6 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -674,6 +674,10 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
                                         const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
                                         size_t slot, bool keep_prev) const {
   bool ret = false;
+  if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
+    MS_LOG(INFO) << tensor_name << " already loaded for this step so not loading it again.";
+    return true;
+  }
   // TensorData is freed up in AscendSession class
   auto tensor_data = std::make_shared();
   tensor_data->SetName(tensor_name);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index c1a80a78e7f..b70b7dbb41d 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -296,8 +296,6 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
   MS_LOG(INFO) << "Start load step";
-  uint32_t cur_iter = 0;
-  MS_LOG(INFO) << "Cur iter is " << cur_iter;
   for (auto graph_ptr : debugger_->GetGraphPtrList()) {
     debugger_->SetGraphPtr(graph_ptr);
     // load output
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 75bb1d5262f..af2570598bd 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -87,6 +87,11 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
     return true;
   }

+  if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
+    MS_LOG(INFO) << tensor_name << " already loaded for this step so not loading it again.";
+    return true;
+  }
+
   mindspore::tensor::TensorPtr out_tensor = std::make_shared(type_id_, host_shape);
   size_t host_size = out_tensor->data().nbytes();
   auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 9ee99d06485..cfb20a39379 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -154,8 +154,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
     std::vector real_outputs;
     real_outputs = CheckRealOutput(node_name, output_size);

-    for (std::vector::iterator it = real_outputs.begin(); it != real_outputs.end(); ++it) {
-      auto j = *it;
+    for (int j : real_outputs) {
       auto addr = kernel_outputs[j];
       auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
       auto format = kOpFormat_DEFAULT;