!9958 fix opencl subgraph memory reuse

From: @ddwsky
Reviewed-by: 
Signed-off-by:
This commit is contained in:
mindspore-ci-bot 2020-12-17 20:24:45 +08:00 committed by Gitee
commit 18db983d74
11 changed files with 46 additions and 35 deletions

View File

@ -542,15 +542,10 @@ gene_ocl_program() {
echo "Compile SPIRV done"
}
build_opencl() {
get_opencl() {
cd ${BASEPATH}
git submodule update --init third_party/OpenCL-Headers
git submodule update --init third_party/OpenCL-CLHPP
if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then
gene_ocl_program
else
gene_clhpp
fi
}
@ -572,7 +567,8 @@ build_lite()
fi
if [ "${ENABLE_GPU}" == "on" ] && [ "${LITE_PLATFORM}" == "arm64" ] || [ $1 == "arm64" ]; then
echo "start build opencl"
echo "start get opencl"
get_opencl
fi
if [ "${ENABLE_NPU}" == "on" ]; then
checkddk

View File

@ -47,6 +47,8 @@ endif()
string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
add_library(nnacl STATIC ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
add_dependencies(nnacl fbs_src)
add_dependencies(nnacl_mid fbs_src)
########################### arm64 build optimize library ########################
if (PLATFORM_ARM64)

View File

@ -6,3 +6,4 @@ file(GLOB OPS_SRC
)
add_library(cpu_ops_mid OBJECT ${OPS_SRC})
add_dependencies(cpu_ops_mid fbs_src)

View File

@ -14,6 +14,7 @@ set(KERNEL_SRC ${KERNEL_SRC} ${TRAIN_KERNEL_SRC})
endif()
add_library(cpu_kernel_mid OBJECT ${KERNEL_SRC})
add_dependencies(cpu_kernel_mid fbs_src)
if (PLATFORM_ARM64)
file(GLOB FP16_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fp16/*.cc)

View File

@ -1,2 +1,4 @@
file(GLOB_RECURSE OPENCL_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc)
add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC})
add_dependencies(opencl_kernel_mid fbs_src)

View File

@ -210,6 +210,7 @@ int OpenCLKernel::DequantWeight() {
void *dequant_weight{nullptr};
bool set_flag{true};
if (is_fp16) {
#ifdef ENABLE_ARM64
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float16_t>(weight_tensor);
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
@ -217,6 +218,9 @@ int OpenCLKernel::DequantWeight() {
} else {
set_flag = false;
}
#else
set_flag = false;
#endif
} else {
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float>(weight_tensor);

View File

@ -147,6 +147,11 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
UnLock();
return host_ptr;
}
total_size_ += size;
const uint64_t max_size = ocl_runtime_->GetGlobalMemSize();
if (total_size_ >= max_size) {
MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size;
}
cl::Buffer *buffer = nullptr;
cl::Image2D *image = nullptr;
cl_mem_flags flags = CL_MEM_READ_WRITE;
@ -188,7 +193,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
UnLock();
std::string type_name = img_size.empty() ? "buffer" : "Image2D";
MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
<< ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image;
<< ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image
<< ", total size: " << total_size_;
return host_ptr;
}
@ -250,10 +256,10 @@ void *OpenCLAllocator::GetBuffer(void *buffer) {
return nullptr;
}
void OpenCLAllocator::Clear() {
Lock();
template <typename T>
void OpenCLAllocator::ClearMemList(T *list) {
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
for (auto it = list->begin(); it != list->end(); it++) {
if (it->second->map_flags) {
int ret = UnmapBuffer(it->second->host_ptr_);
if (ret != RET_OK) {
@ -278,29 +284,13 @@ void OpenCLAllocator::Clear() {
}
delete it->second;
}
allocated_list_.clear();
list->clear();
}
for (auto it = free_list_.begin(); it != free_list_.end(); it++) {
if (svm_capabilities) {
clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);
MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_;
} else {
cl::Buffer *buffer = static_cast<cl::Buffer *>(it->second->device_ptr_);
if (buffer != nullptr) {
MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer;
delete buffer;
it->second->device_ptr_ = nullptr;
}
cl::Image *image = static_cast<cl::Image *>(it->second->image_ptr_);
if (image != nullptr) {
MS_LOG(DEBUG) << "OpenCL free image : " << image;
delete image;
it->second->image_ptr_ = nullptr;
}
}
delete it->second;
}
free_list_.clear();
void OpenCLAllocator::Clear() {
Lock();
ClearMemList<std::unordered_map<void *, MemBuf *>>(&allocated_list_);
ClearMemList<std::multimap<size_t, MemBuf *>>(&free_list_);
UnLock();
}

View File

@ -63,6 +63,12 @@ class OpenCLAllocator : public Allocator {
void *CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer);
void *CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags, bool is_map,
cl::Buffer **buffer, cl::Image2D **image);
template <typename T>
void ClearMemList(T *list);
private:
OpenCLRuntime *ocl_runtime_{nullptr};
std::mutex lock;
struct MemBuf {
size_t size_;
void *device_ptr_;
@ -72,14 +78,13 @@ class OpenCLAllocator : public Allocator {
bool map_flags{false};
};
std::mutex lock;
// <membuf->buf, membuf>
std::unordered_map<void *, MemBuf *> allocated_list_;
std::multimap<size_t, MemBuf *> free_list_;
uint64_t total_size_{0};
// 6 is empirical value
int shift_factor_ = 6;
bool lock_flag_ = false;
OpenCLRuntime *ocl_runtime_{nullptr};
};
} // namespace mindspore::lite::opencl

View File

@ -59,6 +59,10 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
return ret;
}
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
if (data_ptr == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
output->set_data(data_ptr);
} else {
ret = output->MallocData(allocator_);

View File

@ -225,8 +225,12 @@ int OpenCLRuntime::Init() {
<< "SVM_ATOMICS";
}
}
global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
MS_LOG(INFO) << "Max Alloc Size: " << device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
MS_LOG(INFO) << "Compute Unit: " << compute_units_;
MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";

View File

@ -61,6 +61,7 @@ class OpenCLRuntime {
uint32_t DeviceMaxFreq() const;
uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel);
uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange);
uint64_t GetGlobalMemSize() { return global_memery_size_; }
GpuInfo GetGpuInfo();
bool GetFp16Enable() const;
bool SetFp16Enable(bool enable);
@ -168,6 +169,7 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> program_map_;
cl::Program binary_program_{0};
uint64_t global_memery_cachesize_{0};
uint64_t global_memery_size_{0};
int max_work_group_size_{1};
uint32_t compute_units_{0};
uint32_t max_freq_{0};