From 2c9758addae1df0094bd5f2cf51adcb2e1065f74 Mon Sep 17 00:00:00 2001 From: lizhenyu Date: Thu, 15 Jul 2021 14:55:31 +0800 Subject: [PATCH] [bugfix]GPU occur oom when cache all output tensor of graph --- mindspore/ccsrc/pybind_api/ir/tensor_py.cc | 5 +++++ mindspore/ccsrc/runtime/framework/actor/output_actor.cc | 5 ++++- mindspore/core/ir/tensor.cc | 3 +++ mindspore/core/ir/tensor.h | 6 ++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc index 872bf819f25..60498d99ab3 100644 --- a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc +++ b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc @@ -342,6 +342,11 @@ py::array TensorPy::SyncAsNumpy(const Tensor &tensor) { tensor.Wait(); } tensor.data_sync(); + + // Release device address of graph output tensor. + if (tensor.need_release_device_mem()) { + const_cast<Tensor &>(tensor).set_device_address(nullptr); + } } return AsNumpy(tensor); } diff --git a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc index 865469bd7cc..38836e2ce61 100644 --- a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc @@ -117,7 +117,10 @@ void OutputActor::CollectOutput(const AnfNodePtr &output_node, size_t output_ind if (output_position >= outputs_.size()) { SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The input index is of range."); } - outputs_[output_position] = CreateOutputTensor(output_node, output_index, output_position); + + auto tensor = CreateOutputTensor(output_node, output_index, output_position); + tensor->set_need_release_device_mem(true); + outputs_[output_position] = tensor; current_outputs_num_++; // Save the output nodes to clear the device tensor in the running end. 
diff --git a/mindspore/core/ir/tensor.cc b/mindspore/core/ir/tensor.cc index 129f1c3cf93..d1df95ea02b 100644 --- a/mindspore/core/ir/tensor.cc +++ b/mindspore/core/ir/tensor.cc @@ -473,6 +473,7 @@ Tensor::Tensor(const Tensor &tensor) event_(tensor.event_), sync_status_(tensor.sync_status_), device_sync_(tensor.device_sync_), + need_release_device_mem_(tensor.need_release_device_mem_), cache_enable_(tensor.cache_enable_), cache_tensor_ptr_(tensor.cache_tensor_ptr_), hashmap_tensor_ptr_(tensor.hashmap_tensor_ptr_), @@ -487,6 +488,7 @@ Tensor::Tensor(const Tensor &tensor, TypeId data_type) event_(tensor.event_), sync_status_(tensor.sync_status_), device_sync_(tensor.device_sync_), + need_release_device_mem_(tensor.need_release_device_mem_), cache_enable_(tensor.cache_enable_), cache_tensor_ptr_(tensor.cache_tensor_ptr_), hashmap_tensor_ptr_(tensor.hashmap_tensor_ptr_), @@ -548,6 +550,7 @@ Tensor &Tensor::AssignValue(const Tensor &tensor) { if (this != &tensor) { MetaTensor::operator=(tensor); device_sync_ = tensor.device_sync_; + need_release_device_mem_ = tensor.need_release_device_mem_; data_ = tensor.data_; id_ = tensor.id_; event_ = tensor.event_; diff --git a/mindspore/core/ir/tensor.h b/mindspore/core/ir/tensor.h index f6115567e2f..b94757ea403 100644 --- a/mindspore/core/ir/tensor.h +++ b/mindspore/core/ir/tensor.h @@ -294,6 +294,10 @@ class Tensor : public MetaTensor { device_sync_->ResetRefCount(); } } + + bool need_release_device_mem() const { return need_release_device_mem_; } + void set_need_release_device_mem(bool release_device_mem) { need_release_device_mem_ = release_device_mem; } + void set_padding_type(const std::string padding_type) { padding_type_ = padding_type; } std::string padding_type() const { return padding_type_; } @@ -375,6 +379,8 @@ class Tensor : public MetaTensor { bool graph_output_{false}; bool updated_by_device_{false}; DeviceSyncPtr device_sync_{nullptr}; + // Release device address of graph output tensor or not. 
+ bool need_release_device_mem_{false}; bool cache_enable_{false}; std::shared_ptr<Tensor> cache_tensor_ptr_{nullptr}; std::shared_ptr<Tensor> hashmap_tensor_ptr_{nullptr};