From 2c9758addae1df0094bd5f2cf51adcb2e1065f74 Mon Sep 17 00:00:00 2001 From: lizhenyu Date: Thu, 15 Jul 2021 14:55:31 +0800 Subject: [PATCH] [bugfix]GPU occur oom when cache all output tensor of graph --- mindspore/ccsrc/pybind_api/ir/tensor_py.cc | 5 +++++ mindspore/ccsrc/runtime/framework/actor/output_actor.cc | 5 ++++- mindspore/core/ir/tensor.cc | 3 +++ mindspore/core/ir/tensor.h | 6 ++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc index 872bf819f25..60498d99ab3 100644 --- a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc +++ b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc @@ -342,6 +342,11 @@ py::array TensorPy::SyncAsNumpy(const Tensor &tensor) { tensor.Wait(); } tensor.data_sync(); + + // Release device address of graph output tensor. + if (tensor.need_release_device_mem()) { + const_cast<Tensor &>(tensor).set_device_address(nullptr); + } } return AsNumpy(tensor); } diff --git a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc index 865469bd7cc..38836e2ce61 100644 --- a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc @@ -117,7 +117,10 @@ void OutputActor::CollectOutput(const AnfNodePtr &output_node, size_t output_ind if (output_position >= outputs_.size()) { SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The input index is of range."); } - outputs_[output_position] = CreateOutputTensor(output_node, output_index, output_position); + + auto tensor = CreateOutputTensor(output_node, output_index, output_position); + tensor->set_need_release_device_mem(true); + outputs_[output_position] = tensor; current_outputs_num_++; // Save the output nodes to clear the device tensor in the running end. 
diff --git a/mindspore/core/ir/tensor.cc b/mindspore/core/ir/tensor.cc index 129f1c3cf93..d1df95ea02b 100644 --- a/mindspore/core/ir/tensor.cc +++ b/mindspore/core/ir/tensor.cc @@ -473,6 +473,7 @@ Tensor::Tensor(const Tensor &tensor) event_(tensor.event_), sync_status_(tensor.sync_status_), device_sync_(tensor.device_sync_), + need_release_device_mem_(tensor.need_release_device_mem_), cache_enable_(tensor.cache_enable_), cache_tensor_ptr_(tensor.cache_tensor_ptr_), hashmap_tensor_ptr_(tensor.hashmap_tensor_ptr_), @@ -487,6 +488,7 @@ Tensor::Tensor(const Tensor &tensor, TypeId data_type) event_(tensor.event_), sync_status_(tensor.sync_status_), device_sync_(tensor.device_sync_), + need_release_device_mem_(tensor.need_release_device_mem_), cache_enable_(tensor.cache_enable_), cache_tensor_ptr_(tensor.cache_tensor_ptr_), hashmap_tensor_ptr_(tensor.hashmap_tensor_ptr_), @@ -548,6 +550,7 @@ Tensor &Tensor::AssignValue(const Tensor &tensor) { if (this != &tensor) { MetaTensor::operator=(tensor); device_sync_ = tensor.device_sync_; + need_release_device_mem_ = tensor.need_release_device_mem_; data_ = tensor.data_; id_ = tensor.id_; event_ = tensor.event_; diff --git a/mindspore/core/ir/tensor.h b/mindspore/core/ir/tensor.h index f6115567e2f..b94757ea403 100644 --- a/mindspore/core/ir/tensor.h +++ b/mindspore/core/ir/tensor.h @@ -294,6 +294,10 @@ class Tensor : public MetaTensor { device_sync_->ResetRefCount(); } } + + bool need_release_device_mem() const { return need_release_device_mem_; } + void set_need_release_device_mem(bool release_device_mem) { need_release_device_mem_ = release_device_mem; } + void set_padding_type(const std::string padding_type) { padding_type_ = padding_type; } std::string padding_type() const { return padding_type_; } @@ -375,6 +379,8 @@ class Tensor : public MetaTensor { bool graph_output_{false}; bool updated_by_device_{false}; DeviceSyncPtr device_sync_{nullptr}; + // Release device address of graph output tensor or not. 
+ bool need_release_device_mem_{false}; bool cache_enable_{false}; std::shared_ptr<Tensor> cache_tensor_ptr_{nullptr}; std::shared_ptr<Tensor> hashmap_tensor_ptr_{nullptr};