From 55b3557c0ddd38cbb90c79a288fdb9e576a1f277 Mon Sep 17 00:00:00 2001
From: limingqi107
Date: Wed, 17 Jun 2020 11:52:41 +0800
Subject: [PATCH] gpu optimize the max device memory config

---
 .../ccsrc/device/gpu/gpu_kernel_runtime.cc    |  1 +
 .../ccsrc/device/gpu/gpu_memory_allocator.cc  | 28 +++++++++++++------
 .../ccsrc/device/gpu/gpu_memory_allocator.h   |  4 ++-
 .../mem_reuse/mem_dynamic_allocator.cc        |  2 +-
 mindspore/context.py                          |  5 +++-
 tests/st/ops/gpu/test_conv2d_op.py            |  2 +-
 6 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
index 0e90b90f2d2..9e7fb884008 100644
--- a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
@@ -39,6 +39,7 @@ bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().Syn
 
 bool GPUKernelRuntime::Init() {
   if (device_init_ == true) {
+    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
     return true;
   }
   auto ret = InitDevice();
diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
index fceb5e78b4c..91379456619 100644
--- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
@@ -29,18 +29,30 @@ bool GPUMemoryAllocator::Init() {
   size_t free_size = CudaDriver::free_mem_size();
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
-  float max_device_memory = context_ptr->max_device_memory();
-  max_available_device_memory_ = FloatToSize(max_device_memory * 1024 * 1024 * 1024);
-  if (total_size > 0 && free_size > 0 && max_available_device_memory_ > 0) {
+  limited_device_memory_ = context_ptr->max_device_memory();
+  available_device_memory_ = FloatToSize(limited_device_memory_ * 1024 * 1024 * 1024);
+  if (total_size > 0 && free_size > 0 && available_device_memory_ > 0) {
     MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size
-                 << ", set max available memory size " << max_available_device_memory_;
+                 << ", set max available memory size " << available_device_memory_ << ".";
   } else {
     MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size "
-                      << free_size << ", set max available memory size " << max_available_device_memory_;
+                      << free_size << ", set max available memory size " << available_device_memory_ << ".";
   }
   return true;
 }
 
+void GPUMemoryAllocator::CheckMaxDeviceMemory() const {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  auto max_device_memory = context_ptr->max_device_memory();
+  // Modifying the max device memory at runtime is currently not supported.
+  if (limited_device_memory_ != max_device_memory) {
+    MS_LOG(EXCEPTION)
+      << "Can't change context param max_device_memory at runtime, currently effective max_device_memory("
+      << limited_device_memory_ << "GB), set new max_device_memory(" << max_device_memory << "GB) failed.";
+  }
+}
+
 bool GPUMemoryAllocator::Finalize() {
   if (buffer_q_addr_ != nullptr) {
     if (!CudaDriver::FreeDeviceMem(buffer_q_addr_)) {
@@ -73,7 +85,7 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
     MS_LOG(EXCEPTION) << "Alloc device memory[" << size << "] failed.";
   }
   total_used_device_memory_ += alloc_size;
-  max_available_device_memory_ -= alloc_size;
+  available_device_memory_ -= alloc_size;
   MS_LOG(INFO) << "Current free memory size[" << free_size - alloc_size << "], current alloc size[" << alloc_size
                << "], total used size[" << total_used_device_memory_ << "].";
   return alloc_size;
@@ -81,9 +93,7 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
 
 bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); }
 
-size_t GPUMemoryAllocator::free_mem_size() {
-  return std::min(CudaDriver::free_mem_size(), max_available_device_memory_);
-}
+size_t GPUMemoryAllocator::free_mem_size() { return std::min(CudaDriver::free_mem_size(), available_device_memory_); }
 
 size_t GPUMemoryAllocator::total_mem_size() { return CudaDriver::total_mem_size(); }
 }  // namespace gpu
diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
index baaf50b641b..90d77910574 100644
--- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
@@ -28,6 +28,7 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
  public:
   ~GPUMemoryAllocator() override = default;
   bool Init();
+  void CheckMaxDeviceMemory() const;
   bool Finalize();
 
   bool AllocBufferQueueMem(size_t size, DeviceMemPtr *addr);
@@ -49,8 +50,9 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
   // Used to track address of data buffer queue.
   DeviceMemPtr buffer_q_addr_{nullptr};
 
+  float limited_device_memory_{0.0};
   size_t total_used_device_memory_{0};
-  size_t max_available_device_memory_{0};
+  size_t available_device_memory_{0};
 };
 }  // namespace gpu
 }  // namespace device
diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc
index a2dfce22414..095f8f6495a 100644
--- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc
+++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc
@@ -150,7 +150,7 @@ size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) {
     alloc_mem_size = alloc_mem_size * 2;
   }
   alloc_mem_size = std::min(alloc_mem_size, device_free_mem_size);
-  return AlignMemorySize(alloc_mem_size);
+  return alloc_mem_size;
 }
 
 bool DynamicMemPoolBestFit::IsDivide(size_t tensor_size, size_t mem_buf_size) const {
diff --git a/mindspore/context.py b/mindspore/context.py
index ef2be246369..1e7ba3b28b7 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -342,6 +342,8 @@ class _Context:
         if not check_input_format(max_device_memory):
             raise ValueError("Context param max_device_memory should be in correct format! Such as \"3.5GB\"")
         max_device_memory_value = float(max_device_memory[:-2])
+        if max_device_memory_value == 0:
+            raise ValueError("Context param max_device_memory should be in correct format! Such as \"3.5GB\"")
         self._context_handle.set_max_device_memory(max_device_memory_value)
 
 def check_input_format(x):
@@ -523,7 +525,8 @@ def set_context(**kwargs):
         separated by colons; single operator can choose op_trace, op_trace cannot be combined with
         training_trace and task_trace. Default: "training_trace".
         check_bprop (bool): Whether to check bprop. Default: False.
-        max_device_memory (str): Sets the maximum memory available for device. Default: "1024GB".
+        max_device_memory (str): Sets the maximum memory available for the device; currently only supported on GPU.
+            The format is "xxGB". Default: "1024GB".
 
     Raises:
         ValueError: If input key is not an attribute in context.
diff --git a/tests/st/ops/gpu/test_conv2d_op.py b/tests/st/ops/gpu/test_conv2d_op.py
index a42114a106e..6af5fc39656 100644
--- a/tests/st/ops/gpu/test_conv2d_op.py
+++ b/tests/st/ops/gpu/test_conv2d_op.py
@@ -53,7 +53,7 @@ def test_conv2d():
                               [162, 174, 186],
                               [198, 210, 222]]]]).astype(np.float32)
 
-    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
+    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU", max_device_memory="0.2GB")
     conv2d = NetConv2d()
    conv2d = NetConv2d()
     output = conv2d(x, w)
     assert (output.asnumpy() == expect).all()