!9958 fix opencl subgraph memory reuse

From: @ddwsky Reviewed-by: Signed-off-by:
2020-12-17 20:24:45 +08:00 · 2020-12-17 20:24:45 +08:00 · 18db983d74
parent 5e94f71ba1 5712363365
commit 18db983d74
11 changed files with 46 additions and 35 deletions
--- a/build.sh
+++ b/build.sh
@ -542,15 +542,10 @@ gene_ocl_program() {
    echo "Compile SPIRV done"
 }

-build_opencl() {
+# Fetch the OpenCL header submodules (OpenCL-Headers and OpenCL-CLHPP) under third_party.
+get_opencl() {
-    cd ${BASEPATH}
+    # Quote the path so it survives whitespace, and stop if the cd fails rather
+    # than running the submodule updates from whatever directory we are in.
+    cd "${BASEPATH}" || exit 1
    git submodule update --init third_party/OpenCL-Headers
    git submodule update --init third_party/OpenCL-CLHPP
-    if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then
-        gene_ocl_program
-    else
-        gene_clhpp
-    fi
 }


@ -572,7 +567,8 @@ build_lite()
    fi

    if [ "${ENABLE_GPU}" == "on" ] && [ "${LITE_PLATFORM}" == "arm64" ] || [ $1 == "arm64" ]; then
-      echo "start build opencl"
+      echo "start get opencl"
+      get_opencl
    fi
    if [ "${ENABLE_NPU}" == "on" ]; then
      checkddk
--- a/mindspore/lite/nnacl/CMakeLists.txt
+++ b/mindspore/lite/nnacl/CMakeLists.txt
@ -47,6 +47,8 @@ endif()
 string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 add_library(nnacl STATIC ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
 add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
+add_dependencies(nnacl fbs_src)
+add_dependencies(nnacl_mid fbs_src)

 ########################### arm64 build optimize library ########################
 if (PLATFORM_ARM64)
--- a/mindspore/lite/src/ops/CMakeLists.txt
+++ b/mindspore/lite/src/ops/CMakeLists.txt
@ -6,3 +6,4 @@ file(GLOB OPS_SRC
    )

 add_library(cpu_ops_mid OBJECT ${OPS_SRC})
+add_dependencies(cpu_ops_mid fbs_src)
--- a/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt
+++ b/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt
@ -14,6 +14,7 @@ set(KERNEL_SRC ${KERNEL_SRC} ${TRAIN_KERNEL_SRC})
 endif()

 add_library(cpu_kernel_mid OBJECT ${KERNEL_SRC})
+add_dependencies(cpu_kernel_mid fbs_src)

 if (PLATFORM_ARM64)
  file(GLOB FP16_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fp16/*.cc)
--- a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt
+++ b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt
@ -1,2 +1,4 @@
 file(GLOB_RECURSE OPENCL_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc)
 add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC})
+add_dependencies(opencl_kernel_mid fbs_src)
+
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc
@ -210,6 +210,7 @@ int OpenCLKernel::DequantWeight() {
    void *dequant_weight{nullptr};
    bool set_flag{true};
    if (is_fp16) {
+#ifdef ENABLE_ARM64
      if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
        dequant_weight = kernel::DequantUtil::DequantData<int8_t, float16_t>(weight_tensor);
      } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
@ -217,6 +218,9 @@ int OpenCLKernel::DequantWeight() {
      } else {
        set_flag = false;
      }
+#else
+      set_flag = false;
+#endif
    } else {
      if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
        dequant_weight = kernel::DequantUtil::DequantData<int8_t, float>(weight_tensor);
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@ -147,6 +147,11 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
    UnLock();
    return host_ptr;
  }
+  total_size_ += size;
+  const uint64_t max_size = ocl_runtime_->GetGlobalMemSize();
+  if (total_size_ >= max_size) {
+    MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size;
+  }
  cl::Buffer *buffer = nullptr;
  cl::Image2D *image = nullptr;
  cl_mem_flags flags = CL_MEM_READ_WRITE;
@ -188,7 +193,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
  UnLock();
  std::string type_name = img_size.empty() ? "buffer" : "Image2D";
  MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
-                << ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image;
+                << ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image
+                << ", total size: " << total_size_;
  return host_ptr;
 }

@ -250,10 +256,10 @@ void *OpenCLAllocator::GetBuffer(void *buffer) {
  return nullptr;
 }

-void OpenCLAllocator::Clear() {
-  Lock();
+template <typename T>
+void OpenCLAllocator::ClearMemList(T *list) {
  auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
-  for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
+  for (auto it = list->begin(); it != list->end(); it++) {
    if (it->second->map_flags) {
      int ret = UnmapBuffer(it->second->host_ptr_);
      if (ret != RET_OK) {
@ -278,29 +284,13 @@ void OpenCLAllocator::Clear() {
    }
    delete it->second;
  }
-  allocated_list_.clear();
+  list->clear();
+}

-  for (auto it = free_list_.begin(); it != free_list_.end(); it++) {
-    if (svm_capabilities) {
-      clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);
-      MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_;
-    } else {
-      cl::Buffer *buffer = static_cast<cl::Buffer *>(it->second->device_ptr_);
-      if (buffer != nullptr) {
-        MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer;
-        delete buffer;
-        it->second->device_ptr_ = nullptr;
-      }
-      cl::Image *image = static_cast<cl::Image *>(it->second->image_ptr_);
-      if (image != nullptr) {
-        MS_LOG(DEBUG) << "OpenCL free image : " << image;
-        delete image;
-        it->second->image_ptr_ = nullptr;
-      }
-    }
-    delete it->second;
-  }
-  free_list_.clear();
+// Releases everything the allocator tracks. Between Lock() and UnLock(), both
+// allocated_list_ and free_list_ are handed to ClearMemList, which deletes each
+// MemBuf entry and empties the container; the allocation counter is then reset.
+void OpenCLAllocator::Clear() {
+  Lock();
+  ClearMemList<std::unordered_map<void *, MemBuf *>>(&allocated_list_);
+  ClearMemList<std::multimap<size_t, MemBuf *>>(&free_list_);
+  // Malloc() adds to total_size_ for each new allocation. With both lists now
+  // empty nothing remains allocated, so restart the count; otherwise a reused
+  // allocator would compare stale totals against the device memory size.
+  total_size_ = 0;
  UnLock();
 }

--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h
@ -63,6 +63,12 @@ class OpenCLAllocator : public Allocator {
  void *CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer);
  void *CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags, bool is_map,
                      cl::Buffer **buffer, cl::Image2D **image);
+  template <typename T>
+  void ClearMemList(T *list);
+
+ private:
+  OpenCLRuntime *ocl_runtime_{nullptr};
+  std::mutex lock;
  struct MemBuf {
    size_t size_;
    void *device_ptr_;
@ -72,14 +78,13 @@ class OpenCLAllocator : public Allocator {
    bool map_flags{false};
  };

-  std::mutex lock;
  // <membuf->buf, membuf>
  std::unordered_map<void *, MemBuf *> allocated_list_;
  std::multimap<size_t, MemBuf *> free_list_;
+  uint64_t total_size_{0};
  // 6 is empirical value
  int shift_factor_ = 6;
  bool lock_flag_ = false;
-  OpenCLRuntime *ocl_runtime_{nullptr};
 };

 }  // namespace mindspore::lite::opencl
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
@ -59,6 +59,10 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
          return ret;
        }
        auto data_ptr = allocator_->Malloc(output->Size(), img_size);
+        if (data_ptr == nullptr) {
+          MS_LOG(ERROR) << "Malloc data failed";
+          return RET_ERROR;
+        }
        output->set_data(data_ptr);
      } else {
        ret = output->MallocData(allocator_);
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@ -225,8 +225,12 @@ int OpenCLRuntime::Init() {
                   << "SVM_ATOMICS";
    }
  }
+  global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();

+  MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
+  MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
  MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
+  MS_LOG(INFO) << "Max Alloc Size: " << device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
  MS_LOG(INFO) << "Compute Unit: " << compute_units_;
  MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";

--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@ -61,6 +61,7 @@ class OpenCLRuntime {
  uint32_t DeviceMaxFreq() const;
  uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel);
  uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange);
+  // Device global memory size as read from CL_DEVICE_GLOBAL_MEM_SIZE in Init(); 0 until Init() has run.
+  uint64_t GetGlobalMemSize() const { return global_memery_size_; }
  GpuInfo GetGpuInfo();
  bool GetFp16Enable() const;
  bool SetFp16Enable(bool enable);
@ -168,6 +169,7 @@ class OpenCLRuntime {
  std::map<std::string, cl::Program> program_map_;
  cl::Program binary_program_{0};
  uint64_t global_memery_cachesize_{0};
+  uint64_t global_memery_size_{0};
  int max_work_group_size_{1};
  uint32_t compute_units_{0};
  uint32_t max_freq_{0};