diff --git a/build.sh b/build.sh index ac6b0d47c12..c207d46f9ca 100755 --- a/build.sh +++ b/build.sh @@ -542,15 +542,10 @@ gene_ocl_program() { echo "Compile SPIRV done" } -build_opencl() { +get_opencl() { cd ${BASEPATH} git submodule update --init third_party/OpenCL-Headers git submodule update --init third_party/OpenCL-CLHPP - if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then - gene_ocl_program - else - gene_clhpp - fi } @@ -572,7 +567,8 @@ build_lite() fi if [ "${ENABLE_GPU}" == "on" ] && [ "${LITE_PLATFORM}" == "arm64" ] || [ $1 == "arm64" ]; then - echo "start build opencl" + echo "start get opencl" + get_opencl fi if [ "${ENABLE_NPU}" == "on" ]; then checkddk diff --git a/mindspore/lite/nnacl/CMakeLists.txt b/mindspore/lite/nnacl/CMakeLists.txt index ec9045de7d3..c854813bd59 100644 --- a/mindspore/lite/nnacl/CMakeLists.txt +++ b/mindspore/lite/nnacl/CMakeLists.txt @@ -47,6 +47,8 @@ endif() string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") add_library(nnacl STATIC ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC}) add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC}) +add_dependencies(nnacl fbs_src) +add_dependencies(nnacl_mid fbs_src) ########################### arm64 build optimize library ######################## if (PLATFORM_ARM64) diff --git a/mindspore/lite/src/ops/CMakeLists.txt b/mindspore/lite/src/ops/CMakeLists.txt index d35d072be8c..c90b0b22f9b 100644 --- a/mindspore/lite/src/ops/CMakeLists.txt +++ b/mindspore/lite/src/ops/CMakeLists.txt @@ -6,3 +6,4 @@ file(GLOB OPS_SRC ) add_library(cpu_ops_mid OBJECT ${OPS_SRC}) +add_dependencies(cpu_ops_mid fbs_src) diff --git a/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt b/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt index fceac9912be..feaad6ed5c2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt +++ b/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt @@ -14,6 +14,7 @@ set(KERNEL_SRC ${KERNEL_SRC} ${TRAIN_KERNEL_SRC}) endif() add_library(cpu_kernel_mid OBJECT ${KERNEL_SRC}) +add_dependencies(cpu_kernel_mid fbs_src) if (PLATFORM_ARM64) file(GLOB FP16_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fp16/*.cc) diff --git a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt index 0a7911b7146..ddae211135e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt +++ b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt @@ -1,2 +1,4 @@ file(GLOB_RECURSE OPENCL_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc) add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC}) +add_dependencies(opencl_kernel_mid fbs_src) + diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc index d8346112ea5..f18f1e2b16d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc @@ -210,6 +210,7 @@ int OpenCLKernel::DequantWeight() { void *dequant_weight{nullptr}; bool set_flag{true}; if (is_fp16) { +#ifdef ENABLE_ARM64 if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) { @@ -217,6 +218,9 @@ int OpenCLKernel::DequantWeight() { } else { set_flag = false; } +#else + set_flag = false; +#endif } else { if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index ccbdc1c23bf..84ffd87c0de 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -147,6 +147,11 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size, UnLock(); return host_ptr; } + total_size_ += size; + const uint64_t max_size = ocl_runtime_->GetGlobalMemSize(); + if (total_size_ >= max_size) { + MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size; + } cl::Buffer *buffer = nullptr; cl::Image2D *image = nullptr; cl_mem_flags flags = CL_MEM_READ_WRITE; @@ -188,7 +193,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size, UnLock(); std::string type_name = img_size.empty() ? "buffer" : "Image2D"; MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ - << ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image; + << ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image + << ", total size: " << total_size_; return host_ptr; } @@ -250,10 +256,10 @@ void *OpenCLAllocator::GetBuffer(void *buffer) { return nullptr; } -void OpenCLAllocator::Clear() { - Lock(); +template +void OpenCLAllocator::ClearMemList(T *list) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities(); - for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) { + for (auto it = list->begin(); it != list->end(); it++) { if (it->second->map_flags) { int ret = UnmapBuffer(it->second->host_ptr_); if (ret != RET_OK) { @@ -278,29 +284,13 @@ void OpenCLAllocator::Clear() { } delete it->second; } - allocated_list_.clear(); + list->clear(); +} - for (auto it = free_list_.begin(); it != free_list_.end(); it++) { - if (svm_capabilities) { - clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_); - MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_; - } else { - cl::Buffer *buffer = static_cast(it->second->device_ptr_); - if (buffer != nullptr) { - MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer; - delete buffer; - it->second->device_ptr_ = nullptr; - } - cl::Image *image = static_cast(it->second->image_ptr_); - if (image != nullptr) { - MS_LOG(DEBUG) << "OpenCL free image : " << image; - delete image; - it->second->image_ptr_ = nullptr; - } - } - delete it->second; - } - free_list_.clear(); +void OpenCLAllocator::Clear() { + Lock(); + ClearMemList>(&allocated_list_); + ClearMemList>(&free_list_); UnLock(); } diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index f8a5f6e59a6..ab73a310732 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -63,6 +63,12 @@ class OpenCLAllocator : public Allocator { void *CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer); void *CreateImage2D(size_t size, const std::vector &img_size, void *data, size_t flags, bool is_map, cl::Buffer **buffer, cl::Image2D **image); + template + void ClearMemList(T *list); + + private: + OpenCLRuntime *ocl_runtime_{nullptr}; + std::mutex lock; struct MemBuf { size_t size_; void *device_ptr_; @@ -72,14 +78,13 @@ class OpenCLAllocator : public Allocator { bool map_flags{false}; }; - std::mutex lock; // buf, membuf> std::unordered_map allocated_list_; std::multimap free_list_; + uint64_t total_size_{0}; // 6 is empirical value int shift_factor_ = 6; bool lock_flag_ = false; - OpenCLRuntime *ocl_runtime_{nullptr}; }; } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 941a25623ef..0c6f9c76f8c 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -59,6 +59,10 @@ int OpenCLExecutor::RunOrTune(std::vector &inputs, std::vectorMalloc(output->Size(), img_size); + if (data_ptr == nullptr) { + MS_LOG(ERROR) << "Malloc data failed"; + return RET_ERROR; + } output->set_data(data_ptr); } else { ret = output->MallocData(allocator_); diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 73422bad331..3e324180c6c 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -225,8 +225,12 @@ int OpenCLRuntime::Init() { << "SVM_ATOMICS"; } } + global_memery_size_ = device_->getInfo(); + MS_LOG(INFO) << "Address space bits: " << device_->getInfo(); + MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_; MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_; + MS_LOG(INFO) << "Max Alloc Size: " << device_->getInfo(); MS_LOG(INFO) << "Compute Unit: " << compute_units_; MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index f923ee7e67c..92e1e8de637 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -61,6 +61,7 @@ class OpenCLRuntime { uint32_t DeviceMaxFreq() const; uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel); uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange); + uint64_t GetGlobalMemSize() { return global_memery_size_; } GpuInfo GetGpuInfo(); bool GetFp16Enable() const; bool SetFp16Enable(bool enable); @@ -168,6 +169,7 @@ class OpenCLRuntime { std::map program_map_; cl::Program binary_program_{0}; uint64_t global_memery_cachesize_{0}; + uint64_t global_memery_size_{0}; int max_work_group_size_{1}; uint32_t compute_units_{0}; uint32_t max_freq_{0};