forked from mindspore-Ecosystem/mindspore
!9958 fix opencl subgraph memory reuse
From: @ddwsky Reviewed-by: Signed-off-by:
This commit is contained in:
commit
18db983d74
10
build.sh
10
build.sh
|
@ -542,15 +542,10 @@ gene_ocl_program() {
|
|||
echo "Compile SPIRV done"
|
||||
}
|
||||
|
||||
build_opencl() {
|
||||
get_opencl() {
|
||||
cd ${BASEPATH}
|
||||
git submodule update --init third_party/OpenCL-Headers
|
||||
git submodule update --init third_party/OpenCL-CLHPP
|
||||
if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then
|
||||
gene_ocl_program
|
||||
else
|
||||
gene_clhpp
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
|
@ -572,7 +567,8 @@ build_lite()
|
|||
fi
|
||||
|
||||
if [ "${ENABLE_GPU}" == "on" ] && [ "${LITE_PLATFORM}" == "arm64" ] || [ $1 == "arm64" ]; then
|
||||
echo "start build opencl"
|
||||
echo "start get opencl"
|
||||
get_opencl
|
||||
fi
|
||||
if [ "${ENABLE_NPU}" == "on" ]; then
|
||||
checkddk
|
||||
|
|
|
@ -47,6 +47,8 @@ endif()
|
|||
string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
add_library(nnacl STATIC ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
|
||||
add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
|
||||
add_dependencies(nnacl fbs_src)
|
||||
add_dependencies(nnacl_mid fbs_src)
|
||||
|
||||
########################### arm64 build optimize library ########################
|
||||
if (PLATFORM_ARM64)
|
||||
|
|
|
@ -6,3 +6,4 @@ file(GLOB OPS_SRC
|
|||
)
|
||||
|
||||
add_library(cpu_ops_mid OBJECT ${OPS_SRC})
|
||||
add_dependencies(cpu_ops_mid fbs_src)
|
||||
|
|
|
@ -14,6 +14,7 @@ set(KERNEL_SRC ${KERNEL_SRC} ${TRAIN_KERNEL_SRC})
|
|||
endif()
|
||||
|
||||
add_library(cpu_kernel_mid OBJECT ${KERNEL_SRC})
|
||||
add_dependencies(cpu_kernel_mid fbs_src)
|
||||
|
||||
if (PLATFORM_ARM64)
|
||||
file(GLOB FP16_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fp16/*.cc)
|
||||
|
|
|
@ -1,2 +1,4 @@
|
|||
file(GLOB_RECURSE OPENCL_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc)
|
||||
add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC})
|
||||
add_dependencies(opencl_kernel_mid fbs_src)
|
||||
|
||||
|
|
|
@ -210,6 +210,7 @@ int OpenCLKernel::DequantWeight() {
|
|||
void *dequant_weight{nullptr};
|
||||
bool set_flag{true};
|
||||
if (is_fp16) {
|
||||
#ifdef ENABLE_ARM64
|
||||
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float16_t>(weight_tensor);
|
||||
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
|
||||
|
@ -217,6 +218,9 @@ int OpenCLKernel::DequantWeight() {
|
|||
} else {
|
||||
set_flag = false;
|
||||
}
|
||||
#else
|
||||
set_flag = false;
|
||||
#endif
|
||||
} else {
|
||||
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float>(weight_tensor);
|
||||
|
|
|
@ -147,6 +147,11 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
|
|||
UnLock();
|
||||
return host_ptr;
|
||||
}
|
||||
total_size_ += size;
|
||||
const uint64_t max_size = ocl_runtime_->GetGlobalMemSize();
|
||||
if (total_size_ >= max_size) {
|
||||
MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size;
|
||||
}
|
||||
cl::Buffer *buffer = nullptr;
|
||||
cl::Image2D *image = nullptr;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE;
|
||||
|
@ -188,7 +193,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
|
|||
UnLock();
|
||||
std::string type_name = img_size.empty() ? "buffer" : "Image2D";
|
||||
MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
|
||||
<< ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image;
|
||||
<< ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image
|
||||
<< ", total size: " << total_size_;
|
||||
return host_ptr;
|
||||
}
|
||||
|
||||
|
@ -250,10 +256,10 @@ void *OpenCLAllocator::GetBuffer(void *buffer) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
void OpenCLAllocator::Clear() {
|
||||
Lock();
|
||||
template <typename T>
|
||||
void OpenCLAllocator::ClearMemList(T *list) {
|
||||
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
|
||||
for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
|
||||
for (auto it = list->begin(); it != list->end(); it++) {
|
||||
if (it->second->map_flags) {
|
||||
int ret = UnmapBuffer(it->second->host_ptr_);
|
||||
if (ret != RET_OK) {
|
||||
|
@ -278,29 +284,13 @@ void OpenCLAllocator::Clear() {
|
|||
}
|
||||
delete it->second;
|
||||
}
|
||||
allocated_list_.clear();
|
||||
list->clear();
|
||||
}
|
||||
|
||||
for (auto it = free_list_.begin(); it != free_list_.end(); it++) {
|
||||
if (svm_capabilities) {
|
||||
clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);
|
||||
MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_;
|
||||
} else {
|
||||
cl::Buffer *buffer = static_cast<cl::Buffer *>(it->second->device_ptr_);
|
||||
if (buffer != nullptr) {
|
||||
MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer;
|
||||
delete buffer;
|
||||
it->second->device_ptr_ = nullptr;
|
||||
}
|
||||
cl::Image *image = static_cast<cl::Image *>(it->second->image_ptr_);
|
||||
if (image != nullptr) {
|
||||
MS_LOG(DEBUG) << "OpenCL free image : " << image;
|
||||
delete image;
|
||||
it->second->image_ptr_ = nullptr;
|
||||
}
|
||||
}
|
||||
delete it->second;
|
||||
}
|
||||
free_list_.clear();
|
||||
// Empties both bookkeeping containers of the allocator: ClearMemList is run over allocated_list_ and
// then over free_list_, with the two calls bracketed by Lock() / UnLock().
void OpenCLAllocator::Clear() {
|
||||
Lock();
|
||||
ClearMemList<std::unordered_map<void *, MemBuf *>>(&allocated_list_);
|
||||
ClearMemList<std::multimap<size_t, MemBuf *>>(&free_list_);
|
||||
UnLock();
|
||||
}
|
||||
|
||||
|
|
|
@ -63,6 +63,12 @@ class OpenCLAllocator : public Allocator {
|
|||
void *CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer);
|
||||
void *CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags, bool is_map,
|
||||
cl::Buffer **buffer, cl::Image2D **image);
|
||||
// Walks every entry of *list, treating it->second as the MemBuf* to release (entries with map_flags set
// are unmapped first, and each MemBuf is deleted), then calls list->clear().
// Clear() instantiates it for the container types of allocated_list_ and free_list_.
template <typename T>
|
||||
void ClearMemList(T *list);
|
||||
|
||||
private:
|
||||
OpenCLRuntime *ocl_runtime_{nullptr};
|
||||
std::mutex lock;
|
||||
struct MemBuf {
|
||||
size_t size_;
|
||||
void *device_ptr_;
|
||||
|
@ -72,14 +78,13 @@ class OpenCLAllocator : public Allocator {
|
|||
bool map_flags{false};
|
||||
};
|
||||
|
||||
std::mutex lock;
|
||||
// <membuf->buf, membuf>
|
||||
std::unordered_map<void *, MemBuf *> allocated_list_;
|
||||
std::multimap<size_t, MemBuf *> free_list_;
|
||||
uint64_t total_size_{0};
|
||||
// 6 is an empirical value
|
||||
int shift_factor_ = 6;
|
||||
bool lock_flag_ = false;
|
||||
OpenCLRuntime *ocl_runtime_{nullptr};
|
||||
};
|
||||
|
||||
} // namespace mindspore::lite::opencl
|
||||
|
|
|
@ -59,6 +59,10 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
|
|||
return ret;
|
||||
}
|
||||
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
|
||||
if (data_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc data failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
output->set_data(data_ptr);
|
||||
} else {
|
||||
ret = output->MallocData(allocator_);
|
||||
|
|
|
@ -225,8 +225,12 @@ int OpenCLRuntime::Init() {
|
|||
<< "SVM_ATOMICS";
|
||||
}
|
||||
}
|
||||
global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
|
||||
|
||||
MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
|
||||
MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
|
||||
MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
|
||||
MS_LOG(INFO) << "Max Alloc Size: " << device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
|
||||
MS_LOG(INFO) << "Compute Unit: " << compute_units_;
|
||||
MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";
|
||||
|
||||
|
|
|
@ -61,6 +61,7 @@ class OpenCLRuntime {
|
|||
uint32_t DeviceMaxFreq() const;
|
||||
uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel);
|
||||
uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange);
|
||||
// Returns the cached global_memery_size_, which Init() assigns from the device's
// CL_DEVICE_GLOBAL_MEM_SIZE query.
uint64_t GetGlobalMemSize() { return global_memery_size_; }
|
||||
GpuInfo GetGpuInfo();
|
||||
bool GetFp16Enable() const;
|
||||
bool SetFp16Enable(bool enable);
|
||||
|
@ -168,6 +169,7 @@ class OpenCLRuntime {
|
|||
std::map<std::string, cl::Program> program_map_;
|
||||
cl::Program binary_program_{0};
|
||||
uint64_t global_memery_cachesize_{0};
|
||||
uint64_t global_memery_size_{0};
|
||||
int max_work_group_size_{1};
|
||||
uint32_t compute_units_{0};
|
||||
uint32_t max_freq_{0};
|
||||
|
|
Loading…
Reference in New Issue