!4994 add create program from binary interface for opencl

Merge pull request !4994 from liuchao/master
2020-08-24 16:25:44 +08:00 · 2020-08-24 16:25:44 +08:00 · 3454a808c7
parent 816ed8d842 564bf717d5
commit 3454a808c7
14 changed files with 132 additions and 147 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@ -103,10 +103,7 @@ int ArithmeticOpenCLKernel::Init() {

  lite::STATUS error_code = RET_OK;
 #ifdef PROGRAM_WITH_IL
-  bool ret = runtime_->CreateKernelFromIL(kernel_(), kernel_name);
-  if (!ret) {
-    error_code = RET_ERROR;
-  }
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  if (out_mem_type_ == OpenCLMemType::IMG) {
    kernel_name += "_IMG";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@ -42,7 +42,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
  std::string kernel_name = "conv2d_transpose2x2";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::string source = conv2d_transpose2x2_source;
  std::set<std::string> build_options;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -64,7 +64,7 @@ int DepthwiseConv2dOpenCLKernel::Init() {
    kernel_name += "_1x1";
  }
 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::string program_name = "DepthwiseConv2d";
  std::set<std::string> build_options;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -36,7 +36,7 @@ int MatMulOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = matmul_source;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@ -62,7 +62,7 @@ int PoolingOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  if (out_mem_type_ == OpenCLMemType::BUF) {
    kernel_name += "_BUF";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@ -45,7 +45,7 @@ int ReshapeOpenCLKernel::Init() {
    return RET_ERROR;
  }
 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = reshape_source;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@ -107,7 +107,7 @@ int SoftmaxOpenCLKernel::Init() {
    MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_;
  }
 #ifdef PROGRAM_WITH_IL
-  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  if (!is_image_out_) {
    out_mem_type_ = OpenCLMemType::BUF;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@ -50,7 +50,7 @@ int ToFormatOpenCLKernel::Init() {

  this->set_name(kernel_name);
 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = to_format_source;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@ -41,7 +41,7 @@ int TransposeOpenCLKernel::Init() {
    kernel_name += "_IMG";
  }
 #ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = transpose_source;
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@ -81,7 +81,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
  void *device_ptr = nullptr;
  void *image_ptr = nullptr;

-  if (svm_capabilities && svm_on_) {
+  if (svm_capabilities) {
    cl_svm_mem_flags flags = (svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0;
    flags |= (svm_capabilities & CL_DEVICE_SVM_ATOMICS) ? CL_MEM_SVM_ATOMICS : 0;
    flags = flags | CL_MEM_READ_WRITE;
@ -280,7 +280,7 @@ void OpenCLAllocator::Clear() {
 void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) {
  auto ocl_runtime = opencl::OpenCLRuntime::GetInstance();
  auto svm_capabilities = ocl_runtime->GetSVMCapabilities();
-  if (svm_capabilities && svm_on_) {
+  if (svm_capabilities) {
    if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
      auto it = allocated_list_.find(host_ptr);
      if (it == allocated_list_.end()) {
@ -356,8 +356,8 @@ int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) {
  }
 }

-MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) {
-  MEM_TYPE mem_type{MEM_TYPE::BUF};
+MemType OpenCLAllocator::GetMemType(void *host_ptr) {
+  MemType mem_type{MemType::BUF};
  Lock();
  auto it = allocated_list_.find(host_ptr);
  if (it == allocated_list_.end()) {
@ -367,9 +367,9 @@ MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) {
  }
  MemBuf *mem_buf = it->second;
  if (mem_buf->img_size.empty()) {
-    mem_type = MEM_TYPE::BUF;
+    mem_type = MemType::BUF;
  } else {
-    mem_type = MEM_TYPE::IMG;
+    mem_type = MemType::IMG;
  }
  UnLock();
  return mem_type;
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h
@ -40,7 +40,7 @@ struct OpenclMemory {
  OpenCLMemoryType mem_type{MS_HOST_BUFFER | MS_CL_BUFFER};
 };

-enum class MEM_TYPE : char { BUF, IMG };
+enum class MemType : char { SVM, BUF, IMG };

 class OpenCLAllocator : public Allocator {
 public:
@ -58,7 +58,7 @@ class OpenCLAllocator : public Allocator {
  void *GetBuffer(void *host_ptr);
  void *MapBuffer(void *host_ptr, int flags, void *command_queue = nullptr, bool sync = true);
  int UnmapBuffer(void *host_ptr, void *command_queue = nullptr);
-  MEM_TYPE GetMemType(void *host_ptr);
+  MemType GetMemType(void *host_ptr);
  int GetImageSize(void *host_ptr, std::vector<size_t> *img_size);
  void *Prepare(void *ptr) override {
    if (ptr != nullptr) {
@ -86,7 +86,6 @@ class OpenCLAllocator : public Allocator {
  // 6 is empirical value
  int shift_factor_ = 6;
  bool lock_flag_ = false;
-  bool svm_on_{false};
 };

 }  // namespace mindspore::lite::opencl
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@ -138,27 +138,27 @@ int OpenCLRuntime::Init() {
               << max_work_item_sizes_[2];

  gpu_info_ = ParseGpuInfo(device_name, device_version);
-  cl_int err;
+  cl_int ret;
 #if defined(SHARING_MEM_WITH_OPENGL) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
  // create context from glcontext
  MS_LOG(INFO) << "Create special opencl context to share with OpenGL";
  cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
                                          CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0};
-  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &err);
+  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &ret);

-  if (err != CL_SUCCESS) {
+  if (ret != CL_SUCCESS) {
    MS_LOG(ERROR) << "Create special OpenCL context falied, Create common OpenCL context then.";
-    context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &err);
+    context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
  }
 #else
  MS_LOG(INFO) << "Create common opencl context";
  //  cl_context_properties context_prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](),
  //                                          CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printf_callback, 0};
  //  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &err);
-  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &err);
+  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
 #endif
-  if (err != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(err);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
    return RET_ERROR;
  }

@ -170,9 +170,8 @@ int OpenCLRuntime::Init() {
  auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
  support_fp16_ = CL_SUCCESS == success && fp_config > 0;

-  err = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
-  svm_capabilities_ = 0;
-  if (err != CL_SUCCESS || svm_capabilities_ == 0) {
+  ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
+  if (ret != CL_SUCCESS || svm_capabilities_ == 0) {
    svm_capabilities_ = 0;
    MS_LOG(INFO) << "SVM capalibilties: "
                 << "NONE";
@ -204,16 +203,16 @@ int OpenCLRuntime::Init() {
  properties |= CL_QUEUE_PROFILING_ENABLE;
 #endif

-  default_command_queue_ = std::make_shared<cl::CommandQueue>(*context_, *device_, properties, &err);
-  if (err != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(err);
+  default_command_queue_ = std::make_shared<cl::CommandQueue>(*context_, *device_, properties, &ret);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
    return RET_ERROR;
  }

  allocator_ = std::make_shared<OpenCLAllocator>();
 #ifdef PROGRAM_WITH_IL
  std::string flag = "";
-  CreateProgramFromIL(g_program_binary, flag);
+  binary_program_ = CreateProgramFromIL(g_program_binary, flag);
 #endif
  init_done_ = true;
  MS_LOG(INFO) << "OpenCLRuntime init done!";
@ -317,7 +316,7 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
      MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
      return RET_ERROR;
    }
-    status = this->BuildProgram(build_options_str, &program);
+    status = this->BuildProgram(build_options_str, program);
    if (!status) {
      MS_LOG(ERROR) << program_name << " build failed!";
      return RET_ERROR;
@ -325,53 +324,15 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
    program_map_.emplace(build_program_key, program);
  }

-  cl_int err;
-  kernel = cl::Kernel(program, kernel_name.c_str(), &err);
-  if (err != CL_SUCCESS) {
-    MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(err);
+  cl_int ret;
+  kernel = cl::Kernel(program, kernel_name.c_str(), &ret);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret);
    return RET_ERROR;
  }
  return RET_OK;
 }

-// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
-int OpenCLRuntime::RunKernel(const cl_kernel &kernel, const std::vector<size_t> &global,
-                             const std::vector<size_t> &local, cl::CommandQueue *command_queue) {
-  if (command_queue == nullptr) {
-    command_queue = default_command_queue_.get();
-  }
-  MS_ASSERT(local.size() == 0 || local.size() == global.size());
-  std::vector<size_t> internal_global_ws = global;
-  for (size_t i = 0; i < local.size(); ++i) {
-    internal_global_ws[i] = ROUND_UP(global[i], local[i]);
-  }
-
-  MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
-  for (size_t i = 0; i < global.size(); i++) {
-    MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
-  }
-  for (size_t i = 0; i < local.size(); i++) {
-    MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
-  }
-
-  cl::Event event;
-  cl_int error = CL_SUCCESS;
-  if (local.size() == 0) {
-    error =
-      clEnqueueNDRangeKernel((*command_queue)(), kernel, global.size(), 0, global.data(), nullptr, 0, nullptr, nullptr);
-  } else {
-    error = clEnqueueNDRangeKernel((*command_queue)(), kernel, global.size(), 0, global.data(), local.data(), 0,
-                                   nullptr, nullptr);
-  }
-
-  if (error != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(error);
-    return RET_ERROR;
-  }
-  MS_LOG(DEBUG) << "RunKernel success!";
-  return RET_OK;
-}
-
 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global,
                             const std::vector<size_t> &local, cl::CommandQueue *command_queue) {
@ -392,9 +353,6 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t>
    MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
  }

-  cl::Event event;
-  cl_int err = CL_SUCCESS;
-
  cl::NDRange global_range = cl::NullRange;
  cl::NDRange local_range = cl::NullRange;
  if (global.size() == 1) {
@ -417,10 +375,12 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t>
    return RET_ERROR;
  }

-  err = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event);
+  cl::Event event;
+  cl_int ret = CL_SUCCESS;
+  ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event);

-  if (err != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(err);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
    return RET_ERROR;
  }
  MS_LOG(DEBUG) << "RunKernel success!";
@ -486,11 +446,11 @@ bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *pr
 }

 // build program with build options
-bool OpenCLRuntime::BuildProgram(const std::string &build_options, cl::Program *program) {
-  cl_int ret = program->build({*device_}, build_options.c_str());
+bool OpenCLRuntime::BuildProgram(const std::string &build_options, const cl::Program &program) {
+  cl_int ret = program.build({*device_}, build_options.c_str());
  if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
-      std::string build_log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
+    if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
+      std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
      MS_LOG(ERROR) << "Program build log: " << build_log;
    }
    MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret);
@ -525,7 +485,7 @@ bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t
  return cl_ret == CL_SUCCESS;
 }

-void *OpenCLRuntime::MapBuffer(const cl::Buffer buffer, int flags, size_t size, cl::CommandQueue *command_queue,
+void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue,
                               bool sync) const {
  if (command_queue == nullptr) {
    command_queue = default_command_queue_.get();
@ -534,7 +494,7 @@ void *OpenCLRuntime::MapBuffer(const cl::Buffer buffer, int flags, size_t size,
 }

 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
-  if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
+  if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
    return RET_OK;
  }
  if (command_queue == nullptr) {
@ -543,7 +503,7 @@ int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::Command
  return command_queue->enqueueMapSVM(host_ptr, sync, flags, size);
 }

-void *OpenCLRuntime::MapBuffer(const cl::Image2D buffer, bool sync, int flags, const std::vector<size_t> &region,
+void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
                               cl::CommandQueue *command_queue) const {
  if (command_queue == nullptr) {
    command_queue = default_command_queue_.get();
@ -555,7 +515,7 @@ void *OpenCLRuntime::MapBuffer(const cl::Image2D buffer, bool sync, int flags, c
  return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch);
 }

-int OpenCLRuntime::UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
+int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
  if (command_queue == nullptr) {
    command_queue = default_command_queue_.get();
  }
@ -563,7 +523,7 @@ int OpenCLRuntime::UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::Comm
 }

 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
-  if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
+  if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
    return RET_OK;
  }
  if (command_queue == nullptr) {
@ -586,43 +546,55 @@ bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {

 int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) {
  size_t max_work_group_size;
-  cl_int err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+  cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
                                        &max_work_group_size, nullptr);
-  if (err != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(err);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret);
  }
  return static_cast<int>(max_work_group_size);
 }

-bool OpenCLRuntime::CreateKernelFromIL(cl_kernel &kernel, const std::string kernel_name) {
+cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) {
  cl_int ret = CL_SUCCESS;
-  kernel = clCreateKernel(il_program_, kernel_name.c_str(), &ret);
+  cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret);
  if (ret != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Create kernel with IL failed: " << CLErrorCode(ret);
+    MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret);
  }
-  return ret == CL_SUCCESS;
+  return kernel;
 }

 // build program with IL
-bool OpenCLRuntime::CreateProgramFromIL(const std::vector<u_char> program_binary, const std::string flag) {
+cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag) {
 #if CL_HPP_TARGET_OPENCL_VERSION >= 210
-  size_t program_length = program_binary.size();
-  cl_int ret = CL_SUCCESS;
-  il_program_ = clCreateProgramWithIL((*context_)(), program_binary.data(), program_length, &ret);
-  if (ret != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Create program with IL failed: " << CLErrorCode(ret);
-    return false;
+  cl::Program program = cl::Program(*context_, binary);
+  bool status = BuildProgram(default_build_opts_, program);
+  if (!status) {
+    MS_LOG(ERROR) << "Build program with IL failed!";
  }
-
-  ret = clBuildProgram(il_program_, 1, &(*device_)(), flag.c_str(), NULL, NULL);
-  if (ret != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Build program with IL failed: " << CLErrorCode(ret);
-  }
-  return ret == CL_SUCCESS;
+  return program;
 #else
  MS_LOG(ERROR) << "Create program with IL failed! The compute capabitity of device should be 2.1 and higher.";
-  return false;
+  return cl::Program();
 #endif
 }

+// build program with binary
+cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary,
+                                                   const std::string &flag) {
+  cl::Program program = cl::Program(*context_, {*device_}, binary);
+  bool status = BuildProgram(default_build_opts_, program);
+  if (!status) {
+    MS_LOG(ERROR) << "Build program with binary failed!";
+  }
+  return program;
+}
+
+std::vector<std::vector<unsigned char>> OpenCLRuntime::GetProgramBinaries(const cl::Program &program) {
+  cl_int ret = CL_SUCCESS;
+  auto binary = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
+  }
+  return binary;
+}
 }  // namespace mindspore::lite::opencl
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@ -65,61 +65,68 @@ class OpenCLRuntime {
  GpuInfo GetGpuInfo();
  bool GetFp16Enable() const;
  bool SetFp16Enable(bool enable);
-  const std::vector<size_t> &GetWorkItemSize() { return max_work_item_sizes_; }
-  uint32_t GetImagePitchAlignment() { return image_pitch_align_; }
-  cl_device_svm_capabilities GetSVMCapabilities() const { return svm_capabilities_; }
+  bool GetSVMEnable() const { return svm_enable_; }
+  void SetSVMEnable(bool enable) { svm_enable_ = enable; }
+  const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }
+  uint32_t GetImagePitchAlignment() const { return image_pitch_align_; }
+  cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; }

  template <typename T>
-  typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl_kernel &kernel, uint32_t index,
-                                                                                const T value) {
-    if (svm_capabilities_) {
-      MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value;
-      return clSetKernelArgSVMPointer(kernel, index, value);
-    } else {
-      MEM_TYPE mem_type = allocator_->GetMemType(value);
-      if (mem_type == MEM_TYPE::BUF) {
+  typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl::Kernel &kernel, uint32_t index,
+                                                                                const T value,
+                                                                                const MemType mem_type = MemType::IMG) {
+    switch (mem_type) {
+      case MemType::SVM: {
+        MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value;
+        return kernel.setArg(index, value);
+      }
+      case MemType::BUF: {
        cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetBuffer(value));
        MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value;
-        return clSetKernelArg(kernel, index, sizeof((*buffer)()), &(*buffer)());
-      } else {
-        cl::Image2D *image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(value));
-        MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Image2D " << image << ", host_ptr: " << value;
-        return clSetKernelArg(kernel, index, sizeof((*image)()), &(*image)());
+        return kernel.setArg(index, *buffer);
      }
+      case MemType::IMG: {
+        cl::Image2D *image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(value));
+        if (image == nullptr) {
+          MS_LOG(WARNING) << "Can't get Image2D, try to use Buffer. Please confirm the buffer type.";
+          cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetBuffer(value));
+          MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value;
+          return kernel.setArg(index, *buffer);
+        }
+        MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Image2D " << image << ", host_ptr: " << value;
+        return kernel.setArg(index, *image);
+      }
+      default:
+        MS_LOG(ERROR) << "Unsupport opencl memory type: " << static_cast<int>(mem_type);
    }
  }

  template <typename T>
-  typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl_kernel &kernel, uint32_t index,
-                                                                                 const T value) {
-    return clSetKernelArg(kernel, index, sizeof(value), &value);
+  typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(
+    cl::Kernel &kernel, uint32_t index, const T value, const MemType mem_type = MemType::IMG) {
+    return kernel.setArg(index, value);
  }

-  template <typename T>
-  int SetKernelArg(cl::Kernel &kernel, uint32_t index, const T &value) {
-    return SetKernelArg(kernel(), index, value);
-  }
-
-  bool CreateProgramFromIL(const std::vector<u_char> program_binary, const std::string flag);
-  bool CreateKernelFromIL(cl_kernel &kernel, const std::string kernel_name);
+  cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag);
+  cl::Program CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, const std::string &flag);
+  cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
+  std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program);
  bool LoadSource(const std::string &program_name, const std::string &source);
  int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
                  const std::set<std::string> &build_options);
-  int RunKernel(const cl_kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
-                cl::CommandQueue *command_queue);
  int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
                cl::CommandQueue *command_queue);
  bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
                           bool sync = false) const;
  bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
                           bool sync = false) const;
-  void *MapBuffer(const cl::Buffer buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
+  void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
                  bool sync = false) const;
-  void *MapBuffer(const cl::Image2D buffer, bool sync, int flags, const std::vector<size_t> &region,
+  void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
                  cl::CommandQueue *command_queue = nullptr) const;
  int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
                bool sync = false) const;
-  int UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
+  int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
  int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
  bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);

@ -136,7 +143,7 @@ class OpenCLRuntime {
  GpuInfo ParseGpuInfo(std::string device_name, std::string device_version);

  bool LoadProgram(const std::string &program_name, cl::Program *program);
-  bool BuildProgram(const std::string &build_options, cl::Program *program);
+  bool BuildProgram(const std::string &build_options, const cl::Program &program);

 private:
  static std::shared_ptr<OpenCLRuntime> opencl_runtime_singleton_;
@ -146,7 +153,7 @@ class OpenCLRuntime {
  std::shared_ptr<cl::Device> device_{nullptr};
  std::shared_ptr<OpenCLAllocator> allocator_{nullptr};
  std::map<std::string, cl::Program> program_map_{};
-  cl_program il_program_{0};
+  cl::Program binary_program_{0};
  uint64_t global_memery_cachesize_{0};
  int max_work_group_size;
  uint32_t compute_units_{0};
@ -155,6 +162,7 @@ class OpenCLRuntime {
  GpuInfo gpu_info_;
  bool support_fp16_{false};
  bool fp16_enable_{false};
+  bool svm_enable_{false};
  cl_device_svm_capabilities svm_capabilities_{0};
  cl_uint image_pitch_align_{0};
  std::vector<size_t> max_work_item_sizes_;
--- a/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc
@ -273,6 +273,15 @@ cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const ch
  return func(context, count, strings, lengths, errcode_ret);
 }

+// clCreateProgramWithBinary wrapper, use OpenCLWrapper function.
+cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id *devices_list,
+                                     const size_t *lengths, const unsigned char **binaries, cl_int *binary_status,
+                                     cl_int *errcode_ret) {
+  auto func = mindspore::lite::opencl::OpenCLWrapper::GetInstance()->clCreateProgramWithBinary;
+  MS_ASSERT(func != nullptr);
+  return func(context, num_devices, devices_list, lengths, binaries, binary_status, errcode_ret);
+}
+
 // clGetProgramInfo wrapper, use OpenCLWrapper function.
 cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value,
                        size_t *param_value_size_ret) {