fix security check

This commit is contained in:
gongdaguo 2021-08-03 16:20:09 +08:00
parent 0c707cd888
commit fe438fae9c
82 changed files with 2222 additions and 632 deletions

View File

@ -108,12 +108,15 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi
}
if (*image == nullptr) {
delete *buffer;
*buffer = nullptr;
MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
return nullptr;
}
if (ret != CL_SUCCESS) {
delete *buffer;
delete *image;
*buffer = nullptr;
*image = nullptr;
MS_LOG(ERROR) << "Create OpenCL Image2D (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
return nullptr;
}
@ -125,6 +128,8 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi
if (host_ptr == nullptr) {
delete *buffer;
delete *image;
*buffer = nullptr;
*image = nullptr;
MS_LOG(ERROR) << "Map image failed, can not found image :" << *image << ", host_ptr=" << host_ptr;
return nullptr;
}

View File

@ -210,6 +210,7 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
#endif
if (context_ == nullptr || ret != CL_SUCCESS) {
delete device_;
device_ = nullptr;
MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
return RET_ERROR;
}
@ -218,6 +219,8 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
delete device_;
delete context_;
device_ = nullptr;
context_ = nullptr;
MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
return RET_ERROR;
}
@ -227,6 +230,9 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
delete device_;
delete context_;
delete default_command_queue_;
device_ = nullptr;
context_ = nullptr;
default_command_queue_ = nullptr;
MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
return RET_ERROR;
}
@ -291,6 +297,10 @@ int OpenCLRuntime::Init() {
delete context_;
delete default_command_queue_;
delete profiling_command_queue_;
device_ = nullptr;
context_ = nullptr;
default_command_queue_ = nullptr;
profiling_command_queue_ = nullptr;
MS_LOG(ERROR) << "Command OpenCL allocator failed!";
return RET_ERROR;
}
@ -305,7 +315,9 @@ int OpenCLRuntime::Uninit() {
if (init_state_ != InitSuccess) {
return RET_OK;
}
StoreCache();
if (StoreCache() != RET_OK) {
MS_LOG(ERROR) << "StoreCache failed!";
}
program_map_.clear();
delete default_command_queue_;
delete profiling_command_queue_;
@ -574,12 +586,15 @@ void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size,
// Maps the SVM region at host_ptr by calling clEnqueueSVMMap on a command queue.
// Returns early, without enqueuing a map, when GetSVMCapabilities() reports
// CL_DEVICE_SVM_FINE_GRAIN_BUFFER.
// NOTE(review): this span is a flattened diff hunk (-574,12 +586,15). Adjacent
// near-duplicate statements appear to be the removed line followed by its
// replacement - confirm against the committed file which one is live.
int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
// Two return statements in a row: presumably old value, then new value.
return RET_OK;
return RET_ERROR;
}
// Use the runtime's default queue when the caller passes none.
if (command_queue == nullptr) {
command_queue = default_command_queue_;
}
// Presumably the pre-change form: hands the raw clEnqueueSVMMap status to the caller.
return clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr);
// Checked form: any status other than CL_SUCCESS is reported as RET_ERROR.
if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
return RET_ERROR;
}
return RET_OK;
}
void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
@ -720,17 +735,17 @@ void OpenCLRuntime::LoadCache() {
MS_LOG(INFO) << "Init opencl cache success";
}
void OpenCLRuntime::StoreCache() {
int OpenCLRuntime::StoreCache() {
if (!enable_cache_) {
return;
return RET_OK;
}
if (!flush_cache_) {
return;
return RET_OK;
}
auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
if (fbb == nullptr) {
MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
return;
return RET_ERROR;
}
std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
for (const auto &kv : program_map_) {
@ -753,8 +768,12 @@ void OpenCLRuntime::StoreCache() {
auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
fbb->Finish(gpu_cache);
uint8_t *buf = fbb->GetBufferPointer();
WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize());
if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
MS_LOG(ERROR) << "WriteToBin failed.";
return RET_ERROR;
}
MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
return RET_OK;
}
cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {

View File

@ -203,7 +203,7 @@ class OpenCLRuntime {
// for cache
private:
void LoadCache();
void StoreCache();
int StoreCache();
#ifdef MS_OPENCL_BINARY_CACHE
bool enable_cache_{true};
#else

View File

@ -65,37 +65,53 @@ int ActivationOpenCLKernel::CheckSpecs() {
// Prepares the activation kernel: records the output tensor info, loads the
// "Activation" program source, builds the kernel named by GetActTypeString(type_),
// sets the constant kernel arguments and the global/local ranges.
// Returns RET_OK on success; RET_ERROR (or the BuildKernel status) on failure.
// NOTE(review): this span is a flattened diff hunk; adjacent near-duplicate
// statements appear to be the removed line followed by its replacement.
int ActivationOpenCLKernel::Prepare() {
outShape = GpuTensorInfo(out_tensors_[0]);
std::string source = activation_source;
// Old/new pair: the second declaration only adds const.
std::string program_name = "Activation";
const std::string program_name = "Activation";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
}
// Old/new pair: the second declaration only adds const.
std::string kernel_name = GetActTypeString(type_);
const std::string kernel_name = GetActTypeString(type_);
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
// Unchecked call (presumably removed), followed by the checked call.
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " init Done!";
return RET_OK;
}
// Sets the kernel arguments that do not change between runs, starting at
// argument index 2: the output image size, then alpha_ for LEAKY_RELU, or the
// c4 / last_c4 pair for SIGMOID.
// The int-returning form yields RET_ERROR as soon as a SetKernelArg call does
// not return CL_SUCCESS, and RET_OK otherwise.
// NOTE(review): this span is a flattened diff hunk. The two signatures and each
// unchecked/checked SetKernelArg pair appear to be the removed line followed by
// its replacement - confirm against the committed file.
void ActivationOpenCLKernel::SetConstArgs() {
int ActivationOpenCLKernel::SetConstArgs() {
int arg_idx = 2;
cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (type_ == ActivationType_LEAKY_RELU) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (type_ == ActivationType_SIGMOID) {
int c4 = outShape.Slice;
// C modulo 4, with a remainder of 0 replaced by 4.
int last_c4 = outShape.C % 4 == 0 ? 4 : outShape.C % 4;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void ActivationOpenCLKernel::SetGlobalLocal() {
@ -107,8 +123,14 @@ void ActivationOpenCLKernel::SetGlobalLocal() {
int ActivationOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";

View File

@ -35,7 +35,7 @@ class ActivationOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -16,6 +16,7 @@
#include <cstring>
#include <string>
#include <functional>
#include <algorithm>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/kernel/argminmax.h"
@ -58,19 +59,41 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() {
return RET_OK;
}
// Sets the run-invariant kernel arguments starting at index 2: the buff_ and
// ids_ buffers, the input shape as (N, H, W, C), src_size_, cus_size_, strides_
// and a flags vector packed from the ArgMinMaxParameter fields
// (out_value_, get_max_, axis_, topk_).
// The int-returning form yields RET_ERROR on the first SetKernelArg call that
// does not return CL_SUCCESS, and RET_OK otherwise.
// NOTE(review): this span is a flattened diff hunk. The two signatures and the
// block of unchecked calls followed by checked calls appear to be removed lines
// followed by their replacements - confirm against the committed file.
void ArgMinMaxOpenCLKernel::SetConstArgs() {
int ArgMinMaxOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<ArgMinMaxParameter *>(op_parameter_);
cl_int4 in_shape{static_cast<int>(im_in_.N), static_cast<int>(im_in_.H), static_cast<int>(im_in_.W),
static_cast<int>(im_in_.C)};
cl_int4 flags = {param->out_value_, param->get_max_, param->axis_, param->topk_};
int arg_cnt = 2;
// Unchecked calls (presumably the pre-change lines).
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, buff_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, ids_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, cus_size_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, strides_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, flags);
// Checked calls: same arguments in the same order, each status verified.
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, buff_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, ids_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, cus_size_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, strides_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, flags) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
@ -134,14 +157,22 @@ int ArgMinMaxOpenCLKernel::InitWeights() {
auto allocator = ocl_runtime_->GetAllocator();
int dtype_size = ocl_runtime_->GetFp16Enable() ? sizeof(int16_t) : sizeof(float);
buff_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * dtype_size, lite::opencl::MemType::BUF);
if (buff_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
ids_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * sizeof(int32_t), lite::opencl::MemType::BUF);
if (ids_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
return RET_OK;
}
int ArgMinMaxOpenCLKernel::Prepare() {
std::string kernel_name = "argminmax";
const std::string kernel_name = "argminmax";
std::string source = argminmax_source;
std::string program_name = "argminmax";
const std::string program_name = "argminmax";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -162,16 +193,28 @@ int ArgMinMaxOpenCLKernel::Prepare() {
InitWeights();
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Binds the first input tensor's data as kernel argument 0 and the first output
// tensor's data as argument 1 (both passed with MemType::BUF), then runs the
// kernel over global_range_ / local_range_.
// Returns RET_OK on success; the checked form returns RET_ERROR when
// SetKernelArg or RunKernel reports a failure.
// NOTE(review): this span is a flattened diff hunk; the three unchecked calls
// appear to be removed lines, followed by their checked replacements.
int ArgMinMaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int InitWeights() override;
int Tune() override { return lite::RET_OK; }

View File

@ -98,6 +98,10 @@ int ArithmeticOpenCLKernel::InitWeights() {
size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
ImageSize img_size{in_shape.width, in_shape.height, dtype};
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
if (weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
weight_ptrs_.push_back(weight_ptr_);
} else {
weight_ptrs_.push_back(nullptr);
@ -106,7 +110,7 @@ int ArithmeticOpenCLKernel::InitWeights() {
return RET_OK;
}
void ArithmeticOpenCLKernel::SetConstArgs() {
int ArithmeticOpenCLKernel::SetConstArgs() {
int arg_idx = 3;
if (!element_flag_) {
cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H), static_cast<int>(in0_shape_.W),
@ -121,16 +125,38 @@ void ArithmeticOpenCLKernel::SetConstArgs() {
} else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
broadcastC_flag = 2; // BroadCast C4 in input1
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
int ArithmeticOpenCLKernel::Prepare() {
@ -179,7 +205,7 @@ int ArithmeticOpenCLKernel::Prepare() {
activation_max_ = 6.f;
}
std::string program_name = "Arithmetic";
const std::string program_name = "Arithmetic";
std::string source = arithmetic_source;
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -196,7 +222,10 @@ int ArithmeticOpenCLKernel::Prepare() {
if (type() != PrimitiveType_BiasAdd) {
InitWeights();
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
return RET_OK;
}
@ -206,10 +235,22 @@ int ArithmeticOpenCLKernel::Run() {
auto input_0_ptr = weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : weight_ptrs_[0];
auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1];
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -35,7 +35,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -86,7 +86,7 @@ int ArithmeticSelfOpenCLKernel::Prepare() {
kernel_name += std::string(schema::EnumNamePrimitiveType(type())) + "_NHWC4";
}
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
std::string program_name = "ArithmeticSelf";
const std::string program_name = "ArithmeticSelf";
if (!ocl_runtime_->LoadSource(program_name, arithmeticself_source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -98,15 +98,27 @@ int ArithmeticSelfOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
return RET_OK;
}
// Binds the front input tensor's data as kernel argument 0 and the front output
// tensor's data as argument 1, then runs the kernel over
// global_range_ / local_range_.
// Returns RET_OK on success; the checked form returns RET_ERROR when
// SetKernelArg or RunKernel reports a failure.
// NOTE(review): this span is a flattened diff hunk; the three unchecked calls
// appear to be removed lines, followed by their checked replacements.
int ArithmeticSelfOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -47,7 +47,13 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
// Sets kernel argument 2 to output_shape_.
// NOTE(review): flattened diff hunk - the one-line void override appears to be
// the removed declaration; the int override below it is the replacement, which
// returns RET_ERROR when SetKernelArg does not return CL_SUCCESS.
void SetConstArgs() override { ocl_runtime_->SetKernelArg(kernel_, 2, output_shape_); }
int SetConstArgs() override {
if (ocl_runtime_->SetKernelArg(kernel_, 2, output_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void SetGlobalLocal() override;
int Run() override;

View File

@ -55,7 +55,7 @@ int BatchToSpaceNDOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
int BatchToSpaceNDOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
@ -66,10 +66,23 @@ void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};
int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
@ -82,9 +95,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
}
int BatchToSpaceNDOpenCLKernel::Prepare() {
std::string kernel_name = "batch_to_space_nd_NHWC4";
const std::string kernel_name = "batch_to_space_nd_NHWC4";
std::string source = batch_to_space_nd_source;
std::string program_name = "batch_to_space_nd";
const std::string program_name = "batch_to_space_nd";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -96,16 +109,28 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Binds the first input tensor's data as kernel argument 0 and the first output
// tensor's data as argument 1, then runs the kernel over
// global_range_ / local_range_.
// Returns RET_OK on success; the checked form returns RET_ERROR when
// SetKernelArg or RunKernel reports a failure.
// NOTE(review): this span is a flattened diff hunk; the three unchecked calls
// appear to be removed lines, followed by their checked replacements.
int BatchToSpaceNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }

View File

@ -59,15 +59,25 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(z);
}
// Sets the run-invariant kernel arguments starting at index 6: the input shape
// with its last dimension divided by C4NUM (rounded up via UP_DIV), the epsilon_
// parameter, and the unpadded last dimension of the first input.
// The int-returning form yields RET_ERROR on the first SetKernelArg call that
// does not return CL_SUCCESS, and RET_OK otherwise.
// NOTE(review): this span is a flattened diff hunk. The two signatures and the
// unchecked calls followed by checked calls appear to be removed lines followed
// by their replacements - confirm against the committed file.
void BatchNormOpenCLKernel::SetConstArgs() {
int BatchNormOpenCLKernel::SetConstArgs() {
int arg_cn = 6;
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
// .at() is bounds-checked: a shape with fewer than 4 dims throws here.
auto input0_shape = in_tensors_.at(0)->shape();
cl_int4 input_shape_ = {input0_shape.at(0), input0_shape.at(1), input0_shape.at(2),
UP_DIV(input0_shape.at(3), C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input0_shape.at(3));
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input0_shape.at(3)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void BatchNormOpenCLKernel::SetGlobalLocal() {
@ -83,6 +93,41 @@ void BatchNormOpenCLKernel::SetGlobalLocal() {
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
}
int BatchNormOpenCLKernel::UnmapBuffer() {
auto allocator = ocl_runtime_->GetAllocator();
if (allocator->UnmapBuffer(scale_) != RET_OK) {
return RET_ERROR;
}
if (allocator->UnmapBuffer(offset_) != RET_OK) {
return RET_ERROR;
}
if (allocator->UnmapBuffer(mean_) != RET_OK) {
return RET_ERROR;
}
if (allocator->UnmapBuffer(variance_) != RET_OK) {
return RET_ERROR;
}
return RET_OK;
}
// Maps the scale_, offset_, mean_ and variance_ buffers with CL_MAP_WRITE
// through the runtime allocator, in that order.
// Returns RET_OK when every map call yields a non-null pointer, RET_ERROR as
// soon as one of them returns nullptr.
int BatchNormOpenCLKernel::MapBuffer() {
  auto allocator = ocl_runtime_->GetAllocator();
  // && short-circuits, so no further buffer is mapped after the first failure.
  const bool mapped_all = allocator->MapBuffer(scale_, CL_MAP_WRITE, nullptr, true) != nullptr &&
                          allocator->MapBuffer(offset_, CL_MAP_WRITE, nullptr, true) != nullptr &&
                          allocator->MapBuffer(mean_, CL_MAP_WRITE, nullptr, true) != nullptr &&
                          allocator->MapBuffer(variance_, CL_MAP_WRITE, nullptr, true) != nullptr;
  if (!mapped_all) {
    return RET_ERROR;
  }
  return RET_OK;
}
int BatchNormOpenCLKernel::Initweight() {
auto allocator = ocl_runtime_->GetAllocator();
GpuTensorInfo img_info(in_tensors_.at(1));
@ -90,15 +135,30 @@ int BatchNormOpenCLKernel::Initweight() {
size_t weight_size = img_info.OriginSize;
// allocated memory for weight and init value
scale_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
if (scale_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
offset_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
if (offset_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
mean_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
if (mean_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
variance_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
if (variance_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
allocator->MapBuffer(scale_, CL_MAP_WRITE, nullptr, true);
allocator->MapBuffer(offset_, CL_MAP_WRITE, nullptr, true);
allocator->MapBuffer(mean_, CL_MAP_WRITE, nullptr, true);
allocator->MapBuffer(variance_, CL_MAP_WRITE, nullptr, true);
if (MapBuffer() != RET_OK) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(scale_, 1, weight_size);
memset(offset_, 0x00, weight_size);
memset(mean_, 0x00, weight_size);
@ -153,18 +213,18 @@ int BatchNormOpenCLKernel::Initweight() {
memcpy(variance_, in_tensors_.at(4)->data_c(), weight_size);
}
}
allocator->UnmapBuffer(scale_);
allocator->UnmapBuffer(offset_);
allocator->UnmapBuffer(mean_);
allocator->UnmapBuffer(variance_);
if (UnmapBuffer() != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int BatchNormOpenCLKernel::Prepare() {
use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
std::string kernel_name = "Batch_normalization_NHWC4";
const std::string kernel_name = "Batch_normalization_NHWC4";
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
const std::string program_name = "Batch_normalization";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -181,7 +241,10 @@ int BatchNormOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Initweight failed ";
return RET_ERROR;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
@ -190,13 +253,34 @@ int BatchNormOpenCLKernel::Prepare() {
// Binds kernel arguments 0-5 in order (input tensor, scale_, offset_, mean_,
// variance_, output tensor) and runs the kernel over
// global_range_ / local_range_.
// Returns RET_OK on success; the checked form returns RET_ERROR when
// SetKernelArg or RunKernel reports a failure.
// NOTE(review): this span is a flattened diff hunk; the seven unchecked calls
// appear to be removed lines, followed by their checked replacements.
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, scale_, lite::opencl::MemType::BUF); // scale
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, offset_, lite::opencl::MemType::BUF); // offset
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF); // mean
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, variance_, lite::opencl::MemType::BUF); // variance
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()); // out tensor
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // input tensor
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, scale_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // scale
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, offset_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // offset
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // mean
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, variance_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // variance
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // out tensor
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,11 +32,13 @@ class BatchNormOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:
int Initweight();
int UnmapBuffer();
int MapBuffer();
private:
bool use_fp16_enable_{false};

View File

@ -52,9 +52,13 @@ int CastOpenCLKernel::CheckSpecs() {
return RET_OK;
}
// Sets kernel argument 2 to the (width, height) pair taken from shape_.
// The int-returning form yields RET_ERROR when SetKernelArg does not return
// CL_SUCCESS, and RET_OK otherwise.
// NOTE(review): this span is a flattened diff hunk. The two signatures and the
// unchecked/checked SetKernelArg pair appear to be removed lines followed by
// their replacements - confirm against the committed file.
void CastOpenCLKernel::SetConstArgs() {
int CastOpenCLKernel::SetConstArgs() {
cl_int2 shape = {static_cast<int>(shape_.width), static_cast<int>(shape_.height)};
ocl_runtime_->SetKernelArg(kernel_, 2, shape);
if (ocl_runtime_->SetKernelArg(kernel_, 2, shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void CastOpenCLKernel::SetGlobalLocal() {
@ -68,8 +72,8 @@ int CastOpenCLKernel::Prepare() {
{kNumberTypeFloat32, "fp32"},
{kNumberTypeFloat16, "fp16"},
};
std::string program_name = "Cast";
std::string kernel_name =
const std::string program_name = "Cast";
const std::string kernel_name =
"Cast_" + dtype_names[in_tensors_.front()->data_type()] + "_to_" + dtype_names[out_tensors_.front()->data_type()];
if (!ocl_runtime_->LoadSource(program_name, cast_source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -80,16 +84,28 @@ int CastOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
}
// Binds the front input tensor's data as kernel argument 0 and the front output
// tensor's data as argument 1, then runs the kernel over
// global_range_ / local_range_.
// Returns RET_OK on success; the checked form returns RET_ERROR when
// SetKernelArg or RunKernel reports a failure.
// NOTE(review): this span is a flattened diff hunk; the three unchecked calls
// appear to be removed lines, followed by their checked replacements.
int CastOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -31,7 +31,7 @@ class CastOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -38,7 +38,10 @@ int ConcatOpenCLKernel::RunAxis0() {
auto *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
for (int i = 0; i < in_tensors_.size(); i++) {
auto src_data = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : weight_ptrs_.at(i);
allocator_->GetImageSize(src_data, &img_size);
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
MS_LOG(ERROR) << "GetImageSize failed.";
return RET_ERROR;
}
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
auto *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
@ -107,7 +110,7 @@ int ConcatOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void ConcatOpenCLKernel::SetConstArgs() {
int ConcatOpenCLKernel::SetConstArgs() {
GpuTensorInfo img_info(out_tensors_[0]);
size_t dtype = ocl_runtime_->GetFp16Enable() ? sizeof(cl_half) : sizeof(cl_float);
stride_w = img_info.RowPitch() / dtype;
@ -124,9 +127,15 @@ void ConcatOpenCLKernel::SetConstArgs() {
temp.s[j] = in_tensor->shape()[j];
}
Broadcast2GpuShape(in_shape_.s, temp.s, in_tensor->shape().size(), 1);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
} else {
for (auto &in_tensor : in_tensors_) {
cl_int4 temp = {};
@ -135,11 +144,18 @@ void ConcatOpenCLKernel::SetConstArgs() {
}
Broadcast2GpuShape(in_shape_.s, temp.s, in_tensor->shape().size(), 1);
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
}
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void ConcatOpenCLKernel::SetGlobalLocal() {
@ -190,6 +206,10 @@ int ConcatOpenCLKernel::ConvertWeightToTensor() {
}
ImageSize img_size{in_shape.width, in_shape.height, dtype};
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
if (weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
weight_ptrs_.push_back(weight_ptr_);
} else {
weight_ptrs_.push_back(nullptr);
@ -222,7 +242,7 @@ int ConcatOpenCLKernel::Prepare() {
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
std::string source = concat_source;
std::string program_name = "Concat";
const std::string program_name = "Concat";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -234,7 +254,10 @@ int ConcatOpenCLKernel::Prepare() {
return ret;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
}
@ -247,14 +270,27 @@ int ConcatOpenCLKernel::Run() {
int arg_cn = 0;
for (int i = 0; i < in_tensors_.size(); ++i) {
auto input_ptr = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : weight_ptrs_.at(i);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (axis_ == 3 && !Align_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK;
}

View File

@ -31,7 +31,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -108,7 +108,10 @@ int Conv2DOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -142,7 +145,7 @@ void Conv2DOpenCLKernel::InitAttrs() {
int Conv2DOpenCLKernel::BuildKernel() {
SetBlockSize();
std::string program_name = "conv2d";
const std::string program_name = "conv2d";
std::stringstream kernel_name;
kernel_name << "Conv2D_H" << block_size_.H << "W" << block_size_.W << "C" << block_size_.C;
if (filter_type_ == MemType::IMG) {
@ -245,9 +248,11 @@ void Conv2DOpenCLKernel::SetMaliFp16BlockSize(int task_size_per_cu, bool w_kerne
}
// Prepares the constant data of the convolution: always the filter, and the
// bias only when has_bias_ is set.
// Returns RET_ERROR if InitFilter() fails; otherwise returns the status of
// InitBias() when a bias is present, or RET_OK when there is none.
// Each initializer is invoked exactly once so the packed buffers are not
// allocated and filled twice.
int Conv2DOpenCLKernel::InitWeights() {
  if (InitFilter() != RET_OK) {
    return RET_ERROR;
  }
  if (has_bias_) {
    return InitBias();
  }
  return RET_OK;
}
@ -300,7 +305,7 @@ void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, Fil
}
}
void Conv2DOpenCLKernel::InitFilter() {
int Conv2DOpenCLKernel::InitFilter() {
auto allocator = ocl_runtime_->GetAllocator();
// allocate opencl memory: buffer or image2d
@ -312,9 +317,17 @@ void Conv2DOpenCLKernel::InitFilter() {
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
size = width * height * CO_TILE * sizeof_FLT_;
packed_filter_ = allocator->Malloc({width, height, dtype});
if (packed_filter_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
} else {
size = UP_DIV(CO_SLICES_, Ogroup) * KH_ * KW_ * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
packed_filter_ = allocator->Malloc(size, lite::opencl::MemType::BUF);
if (packed_filter_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
}
// rearrange filter
@ -333,15 +346,22 @@ void Conv2DOpenCLKernel::InitFilter() {
if (filter_type_ == MemType::IMG) {
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
} else {
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
if (allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memcpy(packed_filter_, tmp.data(), size);
allocator->UnmapBuffer(packed_filter_);
if (allocator->UnmapBuffer(packed_filter_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
}
FreeStoredData(stored_filter_);
return RET_OK;
}
void Conv2DOpenCLKernel::InitBias() {
int Conv2DOpenCLKernel::InitBias() {
auto allocator = ocl_runtime_->GetAllocator();
// align bias from C to C4
@ -349,8 +369,15 @@ void Conv2DOpenCLKernel::InitBias() {
void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
if (packed_bias_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
if (allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(packed_bias_, 0x00, packed_bias_size);
if (bias_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) {
@ -375,11 +402,15 @@ void Conv2DOpenCLKernel::InitBias() {
memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
}
}
allocator->UnmapBuffer(packed_bias_);
if (allocator->UnmapBuffer(packed_bias_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_bias_);
return RET_OK;
}
void Conv2DOpenCLKernel::SetConstArgs() {
int Conv2DOpenCLKernel::SetConstArgs() {
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_};
@ -387,15 +418,43 @@ void Conv2DOpenCLKernel::SetConstArgs() {
cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_};
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void Conv2DOpenCLKernel::SetGlobalLocal() {
@ -429,9 +488,18 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
// Binds the input tensor data (arg 0) and output tensor data (arg 1) to the
// kernel and launches it through the OpenCL runtime.
// Returns RET_OK on success, or RET_ERROR if binding an argument or launching
// the kernel fails. Every runtime call is made once and its result is checked.
int Conv2DOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -53,7 +53,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int Prepare() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
@ -78,8 +78,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
protected:
void InitAttrs();
virtual int BuildKernel();
virtual void InitFilter();
void InitBias();
virtual int InitFilter();
int InitBias();
bool use_fp16_{false};
size_t sizeof_FLT_{4};
ConvParameter *param_{nullptr};

View File

@ -55,10 +55,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
}
int Conv2dTransposeOpenCLKernel::Prepare() {
std::string kernel_name = "conv2d_transpose";
const std::string kernel_name = "conv2d_transpose";
enable_fp16_ = ocl_runtime_->GetFp16Enable();
std::string source = GetActDefines() + conv2d_transpose_source;
std::string program_name = "conv2d_transpose";
const std::string program_name = "conv2d_transpose";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -74,7 +74,10 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -94,7 +97,7 @@ void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_);
}
void Conv2dTransposeOpenCLKernel::SetConstArgs() {
int Conv2dTransposeOpenCLKernel::SetConstArgs() {
int arg_cnt = 2;
auto *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int ci = in_tensors_[0]->shape()[3];
@ -115,14 +118,39 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
cl_int2 padding = {pad_h, pad_w};
cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), n};
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), n};
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt, static_cast<cl_int>(param->act_type_));
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt, static_cast<cl_int>(param->act_type_)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
int Conv2dTransposeOpenCLKernel::InitWeights() {
@ -147,7 +175,15 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
// IHWO to OHWI4(I)4(O)(converter format is IHWO)
// init padWeight_(buffer mem)
padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
@ -188,7 +224,10 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
}
}
}
allocator->UnmapBuffer(padWeight_);
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_weight_);
return RET_OK;
}
@ -208,7 +247,15 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
}
ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
bias_ = allocator->Malloc(img_size);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(bias_, 0x00, div_co * C4NUM * data_size);
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
@ -225,7 +272,10 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
memcpy(bias_, src_data, co * data_size);
}
}
allocator->UnmapBuffer(bias_);
if (allocator->UnmapBuffer(bias_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_bias_);
return RET_OK;
}
@ -233,9 +283,18 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
// Binds the first input tensor's data and the first output tensor's data to
// kernel arguments 0 and 1 (in that order), then launches the kernel.
// Returns RET_OK on success, or RET_ERROR if binding an argument or launching
// the kernel fails.
// arg_cnt is advanced exactly once per bound argument; binding the same tensor
// twice would shift the indices of the arguments that follow.
int Conv2dTransposeOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int arg_cnt = 0;
  if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -34,7 +34,7 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
int InitWeights() override;
int InitFilter();
int InitBias();
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int StoreConstData() override;

View File

@ -73,7 +73,7 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
} else {
block_size_.C = block_size_.H = block_size_.W = 1;
}
std::string program_name = "DepthwiseConv2d";
const std::string program_name = "DepthwiseConv2d";
std::string source = depthwise_conv2d_source;
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -94,7 +94,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
return RET_OK;
}
@ -153,10 +156,12 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
ImageSize img_size{(size_t)plane_out / C4NUM, (size_t)out_info.N * CO4, img_dtype};
packed_weight_ = allocator->Malloc(img_size, temp_filter.data());
} else {
packed_weight_ = allocator->Malloc(pack_weight_size, temp_filter.data());
}
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
FreeStoredData(stored_weight_);
@ -199,13 +204,15 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
}
bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
FreeStoredData(stored_bias_);
return RET_OK;
}
void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
int DepthwiseConv2dOpenCLKernel::SetConstArgs() {
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
auto in_info = GpuTensorInfo(in_tensors_[0]);
auto out_info = GpuTensorInfo(out_tensors_[0]);
@ -222,16 +229,47 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
cl_int4 dst_size = {(cl_int)out_info.W, (cl_int)out_info.H, (cl_int)CO4, (cl_int)out_info.N};
int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, filter_type_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, filter_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
@ -286,9 +324,18 @@ int DepthwiseConv2dOpenCLKernel::StoreConstData() {
// Binds the output tensor data to kernel argument 0 and the input tensor data
// to kernel argument 1 (note: output first), then launches the kernel.
// Returns RET_OK on success, or RET_ERROR if binding an argument or launching
// the kernel fails. Every runtime call is made once and its result is checked.
int DepthwiseConv2dOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  if (ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
} // namespace mindspore::kernel

View File

@ -41,7 +41,7 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int InitWeights() override;
int InitBias();
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int StoreConstData() override;

View File

@ -35,7 +35,10 @@ int FillOpenCLKernel::RunFill() {
cl_int4 fill_value = {};
fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
auto src_data = out_tensors_[0]->data_c();
allocator_->GetImageSize(src_data, &img_size);
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
MS_LOG(ERROR) << "GetImageSize failed.";
return RET_ERROR;
}
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
@ -59,7 +62,7 @@ int FillOpenCLKernel::RunShape() {
return RET_OK;
}
// The fill kernel binds no constant arguments here, so this always reports
// success. It returns int to match the `int SetConstArgs() override`
// declaration.
int FillOpenCLKernel::SetConstArgs() { return RET_OK; }
// Intentionally empty: this override performs no work.
void FillOpenCLKernel::SetGlobalLocal() {}

View File

@ -31,7 +31,7 @@ class FillOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -98,7 +98,7 @@ int FullConnectionOpenCLKernel::Prepare() {
kernel_name = "FullConnectionWeightVar";
}
std::string source = fullconnection_source;
std::string program_name = "FullConnection";
const std::string program_name = "FullConnection";
if (!ocl_runtime_->LoadSource(program_name, GetActDefines() + source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -113,7 +113,10 @@ int FullConnectionOpenCLKernel::Prepare() {
if (ret != RET_OK) {
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
@ -137,7 +140,15 @@ int FullConnectionOpenCLKernel::InitFilter() {
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
padWeight_ = allocator->Malloc(nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size,
lite::opencl::MemType::BUF);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
@ -183,7 +194,10 @@ int FullConnectionOpenCLKernel::InitFilter() {
}
}
}
allocator->UnmapBuffer(padWeight_);
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_weight_);
return RET_OK;
}
@ -202,7 +216,15 @@ int FullConnectionOpenCLKernel::InitBias() {
}
ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
bias_ = allocator->Malloc(img_size);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(bias_, 0x00, co4 * C4NUM * dtype_size);
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
@ -218,7 +240,10 @@ int FullConnectionOpenCLKernel::InitBias() {
memcpy(bias_, src_data, CO_ * dtype_size);
}
}
allocator->UnmapBuffer(bias_);
if (allocator->UnmapBuffer(bias_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_bias_);
return RET_OK;
}
@ -231,22 +256,44 @@ void FullConnectionOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_);
}
void FullConnectionOpenCLKernel::SetConstArgs() {
int FullConnectionOpenCLKernel::SetConstArgs() {
if (!weight_var_) {
ocl_runtime_->SetKernelArg(kernel_, 2, padWeight_, lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, 2, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
int arg_count = 3;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, N_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, N_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
auto intensor_shape = GpuTensorInfo(in_tensors_[0]);
int CI4 = CI_remainder_ * intensor_shape.Slice;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, CI4);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, UP_DIV(CO_, C4NUM));
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, CI4) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, UP_DIV(CO_, C4NUM)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
auto in_shape_info = GpuTensorInfo(in_tensors_[0]);
cl_int2 in_img_shape = {static_cast<int>(in_shape_info.height), static_cast<int>(in_shape_info.width)};
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_img_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_img_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
if (ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
int FullConnectionOpenCLKernel::StoreConstData() {
@ -270,12 +317,24 @@ int FullConnectionOpenCLKernel::StoreConstData() {
// Binds the per-run kernel arguments and launches the kernel.
//   arg 0 : data of in_tensors_[0]
//   arg 1 : data of out_tensors_[0]
//   arg 2 : data of in_tensors_[1], only when the weight is a runtime input
//           (weight_var_)
// Returns RET_OK on success, or RET_ERROR if binding an argument or launching
// the kernel fails. Each argument is bound once and the kernel is launched
// once.
int FullConnectionOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int arg_count = 0;
  if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (weight_var_) {
    if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()) != CL_SUCCESS) {
      MS_LOG(ERROR) << "SetKernelArg failed.";
      return RET_ERROR;
    }
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -31,7 +31,7 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }
int StoreConstData() override;

View File

@ -164,8 +164,8 @@ bool IsEltwiseAndOperatorSupported(LiteKernel *node) {
int FusionEltwiseOpenCLKernel::Prepare() {
std::string source = Codegen();
std::string program_name = "FusionEltwise\n" + source;
std::string kernel_name = "FusionEltwise";
const std::string program_name = "FusionEltwise\n" + source;
const std::string kernel_name = "FusionEltwise";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -183,7 +183,10 @@ int FusionEltwiseOpenCLKernel::Prepare() {
}
InitWeights();
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -217,7 +220,14 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
size_t num = tensor_info.ElementsNum;
size_t size = tensor_info.Image2DSize;
void *buffer = allocator->Malloc(size, lite::opencl::MemType::BUF);
allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true);
if (buffer == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(buffer, 0x00, size);
if (tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16) {
@ -232,7 +242,10 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
CopyNumber<float32_t, float32_t>(buffer, tensor->data_c(), num);
}
}
allocator->UnmapBuffer(buffer);
if (allocator->UnmapBuffer(buffer) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
buffer_weights_.push_back(buffer);
}
}
@ -247,7 +260,7 @@ void FusionEltwiseOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_);
}
void FusionEltwiseOpenCLKernel::SetConstArgs() {
int FusionEltwiseOpenCLKernel::SetConstArgs() {
auto output = GpuTensorInfo(out_tensors_.front());
cl_int4 output_shape = {static_cast<cl_int>(output.N), static_cast<cl_int>(output.H), static_cast<cl_int>(output.W),
static_cast<cl_int>(output.C)};
@ -260,18 +273,32 @@ void FusionEltwiseOpenCLKernel::SetConstArgs() {
if (IsScalar(in_tensor->shape())) {
if (ocl_runtime_->GetFp16Enable()) {
auto value = static_cast<float16_t>(scalar_weights_[scalar_idx++]);
ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast<cl_half *>(&value)));
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast<cl_half *>(&value))) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
}
arg_idx++; // for act input
}
arg_idx++; // for output
ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
int FusionEltwiseOpenCLKernel::Run() {
@ -279,12 +306,21 @@ int FusionEltwiseOpenCLKernel::Run() {
int arg_idx = 0;
for (auto *in_tensor : in_tensors_) {
if (!in_tensor->IsConst()) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
arg_idx++;
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -162,7 +162,7 @@ class FusionEltwiseOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int InitWeights() override;
void SetGlobalLocal() override;
void SetConstArgs() override;
int SetConstArgs() override;
int Run() override;
void ClearParameter() { op_parameter_ = nullptr; }

View File

@ -81,7 +81,7 @@ int GatherOpenCLKernel::CheckSpecs() {
}
}
void GatherOpenCLKernel::SetConstArgs() {
int GatherOpenCLKernel::SetConstArgs() {
auto input = GpuTensorInfo(in_tensors_.front());
auto output = GpuTensorInfo(out_tensors_.front());
int indices_num = in_tensors_.at(1)->ElementsNum();
@ -90,10 +90,23 @@ void GatherOpenCLKernel::SetConstArgs() {
cl_int4 dst_size = {static_cast<cl_int>(output.W), static_cast<cl_int>(output.H), static_cast<cl_int>(output.Slice),
static_cast<cl_int>(output.N)};
int arg_cnt = 3;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void GatherOpenCLKernel::SetGlobalLocal() {
@ -104,11 +117,11 @@ void GatherOpenCLKernel::SetGlobalLocal() {
}
int GatherOpenCLKernel::Prepare() {
std::string kernel_name = "gather";
const std::string kernel_name = "gather";
if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) {
axis_ = 3;
}
std::string program_name = "gather";
const std::string program_name = "gather";
if (!ocl_runtime_->LoadSource(program_name, gather_source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -127,7 +140,10 @@ int GatherOpenCLKernel::Prepare() {
}
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -135,11 +151,21 @@ int GatherOpenCLKernel::Prepare() {
// Maps the indices tensor (input 1) for host access and allocates + maps an int32 buffer with the same
// element count (indices_data_). The element conversion and the matching UnmapBuffer calls are in lines
// further down that this hunk only partially shows.
// Returns RET_ERROR when a MapBuffer or Malloc call yields nullptr.
int GatherOpenCLKernel::ConvertTensorToweight() {
auto allocator = ocl_runtime_->GetAllocator();
auto indices_tensor = in_tensors_.at(1);
// NOTE(review): this listing is a diff whose +/- markers were stripped; each unchecked call is the
// pre-change line and the checked form right after it is the replacement.
allocator->MapBuffer(indices_tensor->data_c(), CL_MAP_WRITE, nullptr, true);
if (allocator->MapBuffer(indices_tensor->data_c(), CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
auto indices_num = indices_tensor->ElementsNum();
indices_data_ =
reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num, lite::opencl::MemType::BUF));
allocator->MapBuffer(indices_data_, CL_MAP_WRITE, nullptr, true);
// NOTE(review): the early returns below leave the indices tensor mapped (no UnmapBuffer on these paths);
// whether that matters depends on the allocator — TODO confirm.
if (indices_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(indices_data_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
// NOTE(review): indices_data_ was already tested for nullptr right after Malloc and is not reassigned in
// between, so this second check can no longer trigger and could be removed.
if (indices_data_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
return RET_ERROR;
@ -155,8 +181,14 @@ int GatherOpenCLKernel::ConvertTensorToweight() {
<< " But Your type is :" << data_type;
return RET_ERROR;
}
allocator->UnmapBuffer(indices_data_);
allocator->UnmapBuffer(indices_tensor->data_c());
if (allocator->UnmapBuffer(indices_data_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
if (allocator->UnmapBuffer(indices_tensor->data_c()) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -197,7 +229,10 @@ int GatherOpenCLKernel::PreProcess() {
if (!InferShapeDone()) {
auto indices_tensor = in_tensors_[1];
if (!indices_tensor->IsConst()) {
ocl_runtime_->SyncCommandQueue();
if (!ocl_runtime_->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
return RET_ERROR;
}
indices_tensor->MutableData();
}
}
@ -209,10 +244,22 @@ int GatherOpenCLKernel::Run() {
// Body of Run(): optionally rebuild the indices buffer, bind output (arg 0), input (arg 1) and the indices
// buffer (arg 2), then enqueue the kernel.
// NOTE(review): ConvertTensorToweight() returns RET_ERROR when mapping or allocation fails, but the result
// is ignored here and indices_data_ is bound as argument 2 regardless — consider propagating the error.
if (intensor1_is_tensor) {
ConvertTensorToweight();
}
// NOTE(review): this listing is a diff whose +/- markers were stripped; the four unchecked calls below are
// the pre-change lines and the checked calls after them are their replacements.
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -34,7 +34,7 @@ class GatherOpenCLKernel : public OpenCLKernel {
int PreProcess() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }
int ConvertTensorToweight();

View File

@ -98,6 +98,10 @@ int ArithmeticInt8OpenCLKernel::InitWeights() {
size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
ImageSize img_size{in_shape.width, in_shape.height, dtype};
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
if (weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
weight_ptrs_.push_back(weight_ptr_);
} else {
weight_ptrs_.push_back(nullptr);
@ -106,7 +110,7 @@ int ArithmeticInt8OpenCLKernel::InitWeights() {
return RET_OK;
}
void ArithmeticInt8OpenCLKernel::SetConstArgs() {
int ArithmeticInt8OpenCLKernel::SetConstArgs() {
int arg_idx = 3;
if (!element_flag_) {
cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H), static_cast<int>(in0_shape_.W),
@ -121,16 +125,37 @@ void ArithmeticInt8OpenCLKernel::SetConstArgs() {
} else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
broadcastC_flag = 2; // BroadCast C4 in input1
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
// set quantization parameter.
auto input0_quant_param = in_tensors_[0]->quant_params().front();
@ -141,8 +166,15 @@ void ArithmeticInt8OpenCLKernel::SetConstArgs() {
cl_char4 zero_point = {static_cast<int8_t>(input0_quant_param.zeroPoint),
static_cast<int8_t>(input1_quant_param.zeroPoint),
static_cast<int8_t>(output_quant_param.zeroPoint), 0};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); // scale
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point); // zero_point
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // scale
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // zero_point
return RET_OK;
}
int ArithmeticInt8OpenCLKernel::Prepare() {
@ -191,7 +223,7 @@ int ArithmeticInt8OpenCLKernel::Prepare() {
activation_max_ = 6.f;
}
std::string program_name = "Arithmetic";
const std::string program_name = "Arithmetic";
std::string source = arithmetic_source;
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -207,7 +239,10 @@ int ArithmeticInt8OpenCLKernel::Prepare() {
// Tail of Prepare(): initialise weights for every primitive except BiasAdd, then bind the constant kernel
// arguments.
// NOTE(review): InitWeights() returns RET_ERROR when the allocator's Malloc fails, but its result is
// discarded here — consider checking it like SetConstArgs().
if (type() != PrimitiveType_BiasAdd) {
InitWeights();
}
// NOTE(review): this listing is a diff whose +/- markers were stripped; the bare SetConstArgs() call is the
// pre-change line and the checked call that follows is its replacement.
SetConstArgs();
if (SetConstArgs() != RET_OK) {
// NOTE(review): "SeConstArgs" in the message below looks like a typo for "SetConstArgs".
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
return RET_OK;
}
@ -218,10 +253,22 @@ int ArithmeticInt8OpenCLKernel::Run() {
auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1];
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -33,7 +33,7 @@ class ArithmeticInt8OpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -67,15 +67,31 @@ void LayerNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(z);
}
void LayerNormOpenCLKernel::SetConstArgs() {
int LayerNormOpenCLKernel::SetConstArgs() {
int arg_cn = 6;
GpuTensorInfo img_info(in_tensors_.at(0));
in_shape_.s[0] = img_info.N, in_shape_.s[1] = img_info.H, in_shape_.s[2] = img_info.W, in_shape_.s[3] = img_info.C;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_);
ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_);
ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void AlignMeanVarGlobalLocal(const std::vector<int> &global, const std::vector<int> &local, cl::NDRange *global_range,
@ -106,9 +122,23 @@ int LayerNormOpenCLKernel::Initweight() {
size_t weight_size = img_info.Image2DSize;
// allocated memory for weight and init value
gamma_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
if (gamma_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
beta_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
allocator->MapBuffer(gamma_, CL_MAP_WRITE, nullptr, true);
allocator->MapBuffer(beta_, CL_MAP_WRITE, nullptr, true);
if (beta_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(gamma_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(beta_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(gamma_, 0x01, weight_size);
memset(beta_, 0x00, weight_size);
@ -143,8 +173,14 @@ int LayerNormOpenCLKernel::Initweight() {
memcpy(beta_, in_tensors_.at(2)->data_c(), weight_size);
}
}
allocator->UnmapBuffer(gamma_);
allocator->UnmapBuffer(beta_);
if (allocator->UnmapBuffer(gamma_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
if (allocator->UnmapBuffer(beta_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -164,11 +200,19 @@ int LayerNormOpenCLKernel::Prepare() {
size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float);
mean_size *= size_dtype;
mean_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
if (mean_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
var_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
std::string kernel_name = "LayerNormalization_NHWC4";
if (var_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
const std::string kernel_name = "LayerNormalization_NHWC4";
std::string kernel_name_mean_var = "ComputeMeanVar";
std::string source = layer_norm_source;
std::string program_name = "LayerNormalization";
const std::string program_name = "LayerNormalization";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -182,7 +226,10 @@ int LayerNormOpenCLKernel::Prepare() {
// Tail of Prepare(): build the mean/variance kernel for the configured axis, bind the constant arguments
// and set the work sizes.
kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4";
// NOTE(review): the result of this BuildKernel call is not inspected; if it can fail, the failure would only
// surface later when SetConstArgs() binds arguments on kernel_mean_var_ — TODO confirm and check it here.
ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options_ext);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
// NOTE(review): this listing is a diff whose +/- markers were stripped; the bare SetConstArgs() call is the
// pre-change line and the checked call that follows is its replacement.
SetConstArgs();
if (SetConstArgs() != RET_OK) {
// NOTE(review): "SeConstArgs" in the message below looks like a typo for "SetConstArgs".
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
@ -191,21 +238,48 @@ int LayerNormOpenCLKernel::Prepare() {
int LayerNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg1_cn = 0;
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, in_tensors_.at(0)->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, mean_, lite::opencl::MemType::BUF); // mean_
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, var_, lite::opencl::MemType::BUF); // var_ return RET_OK;
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // input tensor
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, var_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_mean_var_, global_mean_var_, local_mean_var_, nullptr, &event_);
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF); // mean_
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, var_, lite::opencl::MemType::BUF); // var_
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, gamma_, lite::opencl::MemType::BUF); // gamma_
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, beta_, lite::opencl::MemType::BUF); // beta_
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // input tensor
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // out tensor
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // mean_
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, var_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // var_
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, gamma_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // gamma_
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, beta_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
} // beta_
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK;
}
} // namespace mindspore::kernel
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LayerNormFusion, OpenCLKernelCreator<LayerNormOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LayerNormFusion, OpenCLKernelCreator<LayerNormOpenCLKernel>)

View File

@ -31,7 +31,7 @@ class LayerNormOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -84,7 +84,7 @@ int MatMulOpenCLKernel::Prepare() {
std::map<int, std::string> dims2str = {{2, "_2d"}, {3, "_4d"}, {4, "_4d"}};
kernel_name += dims2str[dims];
std::string source = matmul_source;
std::string program_name = "MatMul";
const std::string program_name = "MatMul";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -95,13 +95,16 @@ int MatMulOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int co) {
int MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int co) {
auto allocator = ocl_runtime_->GetAllocator();
int a = weight_shape_4d[0];
int b = weight_shape_4d[1];
@ -109,7 +112,15 @@ void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int
int co4 = UP_DIV(co, C4NUM);
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size, lite::opencl::MemType::BUF);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
@ -157,6 +168,7 @@ void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int
}
}
}
return RET_OK;
}
int MatMulOpenCLKernel::InitWeights() {
@ -185,7 +197,10 @@ int MatMulOpenCLKernel::InitWeights() {
// Tail of InitWeights(): pad the weights into padWeight_, unmap that buffer, release the stored copy and
// continue with the bias.
// NOTE(review): PadWeight() now returns int and yields RET_ERROR when Malloc or MapBuffer fails (leaving
// padWeight_ as nullptr), but the result is discarded here and UnmapBuffer is then called on padWeight_ —
// consider returning early on failure.
PadWeight(weight_shape_4d, ci, CO_);
// NOTE(review): this listing is a diff whose +/- markers were stripped; the bare UnmapBuffer call is the
// pre-change line and the checked call that follows is its replacement.
allocator->UnmapBuffer(padWeight_);
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_weight_);
return InitBias();
}
@ -204,7 +219,15 @@ int MatMulOpenCLKernel::InitBias() {
}
lite::opencl::ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
bias_ = allocator->Malloc(img_size);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
if (bias_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(bias_, 0x00, co4 * C4NUM * dtype_size);
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
@ -220,7 +243,10 @@ int MatMulOpenCLKernel::InitBias() {
memcpy(bias_, src_data, CO_ * dtype_size);
}
}
allocator->UnmapBuffer(bias_);
if (allocator->UnmapBuffer(bias_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
FreeStoredData(stored_bias_);
return RET_OK;
}
@ -235,29 +261,54 @@ void MatMulOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_);
}
void MatMulOpenCLKernel::SetConstArgs() {
int MatMulOpenCLKernel::SetConstArgs() {
int arg_count = 2;
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
if (act_weight_) {
arg_count++;
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
/// Binds the per-launch tensors and enqueues the MatMul kernel exactly once.
///
/// Argument 0 is the input tensor, argument 1 the output tensor; when the weight is an
/// activation (act_weight_), argument 2 is the second input tensor.
///
/// @return RET_OK on success; RET_ERROR if binding an argument or enqueuing the kernel fails.
int MatMulOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int arg_count = 0;
  if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (act_weight_) {
    if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()) != CL_SUCCESS) {
      MS_LOG(ERROR) << "SetKernelArg failed.";
      return RET_ERROR;
    }
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -32,7 +32,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }
int InitBias();
@ -54,7 +54,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
std::vector<int> outShape{std::vector<int>(MAX_DIMS, 1)};
private:
void PadWeight(std::vector<int> weight_shape_4d, int ci, int co);
int PadWeight(std::vector<int> weight_shape_4d, int ci, int co);
};
} // namespace mindspore::kernel

View File

@ -48,7 +48,7 @@ int OneHotOpenCLKernel::Prepare() {
kernel_name += "Axis" + std::to_string(axis_);
}
std::string source = one_hot_source;
std::string program_name = "OneHot";
const std::string program_name = "OneHot";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -65,7 +65,10 @@ int OneHotOpenCLKernel::Prepare() {
return ret;
}
InitWeights();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
@ -87,18 +90,40 @@ int OneHotOpenCLKernel::InitWeights() {
return RET_OK;
}
void OneHotOpenCLKernel::SetConstArgs() {
int OneHotOpenCLKernel::SetConstArgs() {
cl_int2 cl_in_image2d_shape = {static_cast<cl_int>(in_shape_.width), static_cast<cl_int>(in_shape_.height)};
cl_int4 cl_out_shape = {static_cast<cl_int>(out_shape_.N), static_cast<cl_int>(out_shape_.H),
static_cast<cl_int>(out_shape_.W), static_cast<cl_int>(out_shape_.Slice)};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_image2d_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, depth_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, on_value_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, off_value_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<int>(out_shape_.C));
ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(param_->support_neg_index_));
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_image2d_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, depth_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, on_value_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, off_value_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<int>(out_shape_.C)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(param_->support_neg_index_)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void OneHotOpenCLKernel::SetGlobalLocal() {
local_size_ = {};
@ -108,9 +133,18 @@ void OneHotOpenCLKernel::SetGlobalLocal() {
/// Binds the input (argument 0) and output (argument 1) tensors and enqueues the OneHot kernel once.
///
/// @return RET_OK on success; RET_ERROR if binding an argument or enqueuing the kernel fails.
int OneHotOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -33,7 +33,7 @@ class OneHotOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int InitWeights() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -81,11 +81,14 @@ int PadOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
return RET_OK;
}
void PadOpenCLKernel::SetConstArgs() {
int PadOpenCLKernel::SetConstArgs() {
auto input = GpuTensorInfo(in_tensors_.front());
auto output = GpuTensorInfo(out_tensors_.front());
cl_int4 input_shape = {static_cast<cl_int>(input.N), static_cast<cl_int>(input.H), static_cast<cl_int>(input.W),
@ -105,20 +108,45 @@ void PadOpenCLKernel::SetConstArgs() {
Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
local_size_ = {8, 4, 1};
global_size_ = {output.N * output.H, output.W, output.Slice};
AlignGlobalLocal(global_size_, local_size_);
return RET_OK;
}
// Binds the first input tensor's data pointer to kernel argument 0 and the first
// output tensor's data pointer to argument 1, then runs kernel_ over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) when a checked SetKernelArg call does not
// return CL_SUCCESS or the checked RunKernel call does not return RET_OK;
// returns RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// directly below appear to be the pre-change lines that the checked calls
// replace. Confirm against the real source file before treating both as live code.
int PadOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: argument 0 = input data.
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: argument 1 = output data.
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel is compared against RET_OK, unlike SetKernelArg which is compared against CL_SUCCESS.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -35,7 +35,7 @@ class PadOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int Prepare() override;
void SetConstArgs() override;
int SetConstArgs() override;
int Run() override;

View File

@ -73,7 +73,7 @@ int PoolingOpenCLKernel::Prepare() {
kernel_name += "_NHWC4";
kernel_name += "_IMG";
std::string source = pooling2d_source;
std::string program_name = "Pooling2d";
const std::string program_name = "Pooling2d";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -84,7 +84,10 @@ int PoolingOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
@ -100,7 +103,7 @@ void PoolingOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_);
}
void PoolingOpenCLKernel::SetConstArgs() {
int PoolingOpenCLKernel::SetConstArgs() {
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
@ -109,19 +112,44 @@ void PoolingOpenCLKernel::SetConstArgs() {
cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_};
cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
// Binds in_tensors_[0] and out_tensors_[0] data pointers as consecutive kernel
// arguments starting at index 0, then runs kernel_ over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) if a checked call fails, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// below appear to be the pre-change lines replaced by the checked calls. Read
// literally, they would also advance arg_idx before the checked calls use it.
int PoolingOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: input data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: output data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -63,15 +63,21 @@ void PowerGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
local->push_back(z);
}
void PowerOpenCLKernel::SetConstArgs() {
int PowerOpenCLKernel::SetConstArgs() {
float unalign_w = static_cast<float>(out_shape_.s[3]);
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
int arg_cn = 2;
if (!broadcast_) {
arg_cn++;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (use_fp16_enable_) {
auto x = static_cast<float16_t>(power_);
@ -80,11 +86,18 @@ void PowerOpenCLKernel::SetConstArgs() {
auto w = static_cast<float16_t>(unalign_w);
cl_half4 parameter = {*(reinterpret_cast<uint16_t *>(&x)), *(reinterpret_cast<uint16_t *>(&y)),
*(reinterpret_cast<uint16_t *>(&z)), *(reinterpret_cast<uint16_t *>(&w))};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
cl_float4 parameter = {power_, shift_, scale_, unalign_w};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void PowerOpenCLKernel::SetGlobalLocal() {
@ -111,7 +124,7 @@ int PowerOpenCLKernel::Prepare() {
auto param = reinterpret_cast<PowerParameter *>(this->op_parameter_);
std::string kernel_name = "power";
std::string source = power_source;
std::string program_name = "power";
const std::string program_name = "power";
if (broadcast_) {
power_ = param->power_;
kernel_name += "_broadcast";
@ -130,7 +143,10 @@ int PowerOpenCLKernel::Prepare() {
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -138,13 +154,28 @@ int PowerOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
if (broadcast_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

View File

@ -30,7 +30,7 @@ class PowerOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -46,7 +46,14 @@ int PReluOpenCLKernel::InitWeights() {
auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
size_t weight_size = UP_ROUND(C_, C4NUM) * sizeof_FLT;
weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
if (weight_vector_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(weight_vector_, 0x00, weight_size);
if (weight_tensor->data_type() == kNumberTypeFloat16) {
if (enable_fp16_) {
@ -69,7 +76,10 @@ int PReluOpenCLKernel::InitWeights() {
memcpy(weight_vector_, weight_tensor->data_c(), C_ * sizeof_FLT);
}
}
allocator->UnmapBuffer(weight_vector_);
if (allocator->UnmapBuffer(weight_vector_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
@ -95,11 +105,18 @@ int PReluOpenCLKernel::CheckSpecs() {
return RET_OK;
}
// Sets the constant kernel arguments starting at index 3: out_shape_ followed by
// the integer literal 2.
// Returns RET_ERROR (after logging) if a checked SetKernelArg call does not
// return CL_SUCCESS, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the `void` signature and
// the two unchecked SetKernelArg calls appear to be the pre-change lines, and the
// `int` signature with the checked calls their replacement. Confirm against the
// real source file.
void PReluOpenCLKernel::SetConstArgs() {
int PReluOpenCLKernel::SetConstArgs() {
int arg_idx = 3;
// Rewrites out_shape_.s[3] in place with UP_DIV(out_shape_.s[3], C4NUM); a second
// call would apply UP_DIV again to the already-divided value.
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, 2);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// NOTE(review): meaning of the literal 2 is not visible here — check the kernel source.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, 2) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void PReluOpenCLKernel::SetGlobalLocal() {
@ -126,8 +143,8 @@ int PReluOpenCLKernel::Prepare() {
weight_is_scalar = param->channelShared;
enable_fp16_ = ocl_runtime_->GetFp16Enable();
std::string source = prelu_source;
std::string program_name = "PRelu";
std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
const std::string program_name = "PRelu";
const std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -141,7 +158,10 @@ int PReluOpenCLKernel::Prepare() {
InitWeights();
MS_LOG(DEBUG) << program_name << " init Done!";
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name << " init Done!";
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
}
@ -149,12 +169,24 @@ int PReluOpenCLKernel::Prepare() {
int PReluOpenCLKernel::Run() {
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (weight_is_scalar) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_scalar_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_scalar_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_vector_, lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_vector_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ret != mindspore::lite::RET_OK) {

View File

@ -31,7 +31,7 @@ class PReluOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
int InitWeights() override;

View File

@ -17,6 +17,7 @@
#include <set>
#include <string>
#include <map>
#include <algorithm>
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/reduce.h"
@ -179,7 +180,7 @@ int ReduceOpenCLKernel::Prepare() {
}
kernel_name += GetReduceTypeStr(reduce_param->mode_);
std::string source = reduce_source;
std::string program_name = "Reduce";
const std::string program_name = "Reduce";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -190,22 +191,32 @@ int ReduceOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Sets the constant kernel arguments starting at index 2: a cl_int4
// {H, W, UP_DIV(C, C4NUM), C} built from inShape, and, when wc_reduce_ or
// c_reduce_ is set, the value returned by GenC4Mask().
// Returns RET_ERROR (after logging) if a checked SetKernelArg call does not
// return CL_SUCCESS, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the `void` signature, the
// unchecked SetKernelArg calls and the first `if (wc_reduce_ || c_reduce_) {` line
// appear to be pre-change lines. As written here the braces do not balance, so
// confirm against the real source file.
void ReduceOpenCLKernel::SetConstArgs() {
int ReduceOpenCLKernel::SetConstArgs() {
int h = inShape.H;
int w = inShape.W;
int c = inShape.C;
// c4 is c divided by C4NUM via UP_DIV.
int c4 = UP_DIV(c, C4NUM);
cl_int4 size = {h, w, c4, c};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
if (wc_reduce_ || c_reduce_) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// The mask argument is only passed for the wc/c reduction variants.
if (wc_reduce_ || c_reduce_) {
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void ReduceOpenCLKernel::SetGlobalLocal() {
int h = inShape.H;
@ -235,9 +246,18 @@ int ReduceOpenCLKernel::Tune() {
// Binds in_tensors_[0] and out_tensors_[0] data pointers as consecutive kernel
// arguments starting at index 0, then runs kernel_ over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) if a checked call fails, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// below appear to be the pre-change lines replaced by the checked calls. Read
// literally, they would also advance arg_idx before the checked calls use it.
int ReduceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: input data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: output data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override;

View File

@ -53,15 +53,22 @@ int ReshapeOpenCLKernel::CheckSpecs() {
return RET_OK;
}
// Sets the constant kernel arguments starting at index 2: src_size built from the
// first input tensor's GpuTensorInfo and dst_size built from the first output
// tensor's GpuTensorInfo.
// Returns RET_ERROR (after logging) if a checked SetKernelArg call does not
// return CL_SUCCESS, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the `void` signature and
// the two unchecked SetKernelArg calls appear to be the pre-change lines replaced
// by the `int` signature and the checked calls. Confirm against the real source.
void ReshapeOpenCLKernel::SetConstArgs() {
int ReshapeOpenCLKernel::SetConstArgs() {
auto in = GpuTensorInfo(in_tensors_.front());
auto out = GpuTensorInfo(out_tensors_.front());
// src_size is packed as {C, W, H, N} of the input.
cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
// dst_size is packed as {width, height, C, C * W} of the output.
cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void ReshapeOpenCLKernel::SetGlobalLocal() {
@ -72,9 +79,9 @@ void ReshapeOpenCLKernel::SetGlobalLocal() {
}
int ReshapeOpenCLKernel::Prepare() {
std::string kernel_name = "reshape_NHWC4";
const std::string kernel_name = "reshape_NHWC4";
std::string source = reshape_source;
std::string program_name = "reshape";
const std::string program_name = "reshape";
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -87,16 +94,28 @@ int ReshapeOpenCLKernel::Prepare() {
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Binds in_tensors_[0] data to kernel argument 0 and out_tensors_[0] data to
// argument 1, then runs kernel_ over global_range_/local_range_, recording the
// event in event_.
// Returns RET_ERROR (after logging) if a checked call fails, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// below appear to be the pre-change lines replaced by the checked calls. Confirm
// against the real source file.
int ReshapeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: argument 0 = input data.
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: argument 1 = output data.
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -104,7 +123,10 @@ int ReshapeOpenCLKernel::PreProcess() {
if (type() == PrimitiveType_Reshape && !InferShapeDone()) {
auto shape_tensor = in_tensors_[1];
if (!shape_tensor->IsConst()) {
ocl_runtime_->SyncCommandQueue();
if (!ocl_runtime_->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
return RET_ERROR;
}
shape_tensor->MutableData();
}
}

View File

@ -30,7 +30,7 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int PreProcess() override;
};

View File

@ -64,7 +64,7 @@ int ResizeOpenCLKernel::Prepare() {
}
kernel_name += "_NHWC4";
std::string source = resize_source;
std::string program_name = "Resize";
const std::string program_name = "Resize";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -75,7 +75,10 @@ int ResizeOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
@ -87,7 +90,7 @@ float ResizeOpenCLKernel::getResizeScaleFactor(int input_size, int output_size)
: static_cast<float>(input_size) / static_cast<float>(output_size);
}
void ResizeOpenCLKernel::SetConstArgs() {
int ResizeOpenCLKernel::SetConstArgs() {
auto in_shape = in_tensors_[0]->shape();
auto out_shape = out_tensors_[0]->shape();
int n = out_shape[0];
@ -101,9 +104,19 @@ void ResizeOpenCLKernel::SetConstArgs() {
cl_int4 out_size = {n, h, w, c4};
cl_float2 scale = {scale_h, scale_w};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void ResizeOpenCLKernel::SetGlobalLocal() {
@ -116,9 +129,18 @@ void ResizeOpenCLKernel::SetGlobalLocal() {
// Binds in_tensors_[0] and out_tensors_[0] data pointers as consecutive kernel
// arguments starting at index 0, then runs kernel_ over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) if a checked call fails, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// below appear to be the pre-change lines replaced by the checked calls. Read
// literally, they would also advance arg_idx before the checked calls use it.
int ResizeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: input data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: output data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -126,7 +148,10 @@ int ResizeOpenCLKernel::PreProcess() {
if (type() == PrimitiveType_Resize && !InferShapeDone() && in_tensors_.size() == INPUT_TENSOR_SIZE_2) {
auto shape_tensor = in_tensors_[1];
if (!shape_tensor->IsConst()) {
ocl_runtime_->SyncCommandQueue();
if (!ocl_runtime_->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
return RET_ERROR;
}
shape_tensor->MutableData();
}
}

View File

@ -31,7 +31,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int PreProcess() override;

View File

@ -98,14 +98,30 @@ int ScaleOpenCLKernel::InitWeights() {
img_size.height = 1;
img_size.width = UP_DIV(scale_tensor->shape()[0], C4NUM);
scale_ptr_ = allocator->Malloc(img_size, scale_tensor->data_c());
if (scale_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
offset_ptr_ = allocator->Malloc(img_size, offset_tensor->data_c());
if (offset_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
return RET_OK;
}
if (in_tensor->format() == scale_tensor->format()) {
if (in_tensor->data_type() == scale_tensor->data_type()) {
scale_ptr_ = allocator->Malloc(img_size, scale_tensor->data_c());
if (scale_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
offset_ptr_ = allocator->Malloc(img_size, offset_tensor->data_c());
if (offset_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "Unsupported data type transpose from " << scale_tensor->data_type() << "to "
<< in_tensor->data_type();
@ -121,7 +137,15 @@ int ScaleOpenCLKernel::InitWeights() {
PackNHWCToNHWC4(scale_tensor->data_c(), scale.data(), src_is_fp16, fp16_enable, image2d_info);
PackNHWCToNHWC4(offset_tensor->data_c(), offset.data(), src_is_fp16, fp16_enable, image2d_info);
scale_ptr_ = allocator->Malloc(img_size, scale.data());
if (scale_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
offset_ptr_ = allocator->Malloc(img_size, offset.data());
if (offset_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "Unsupported data type transpose from " << scale_tensor->data_type() << "to "
<< in_tensor->data_type();
@ -175,7 +199,7 @@ int ScaleOpenCLKernel::Prepare() {
} else {
kernel_name += "_BUF";
}
std::string program_name = "Scale";
const std::string program_name = "Scale";
std::string source = GetActDefines() + scale_source;
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -193,44 +217,86 @@ int ScaleOpenCLKernel::Prepare() {
return RET_OK;
}
int ScaleOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
// Sets the leading kernel arguments in order, starting at index 0: input data,
// scale, offset, output data, and a cl_int2 output shape taken from
// global_size_[0] and global_size_[1].
// On success writes the next free argument index to *idx and returns RET_OK.
// Returns RET_ERROR when a checked SetKernelArg call does not return CL_SUCCESS
// (no log is emitted here for that case) or when in_tensors_[1] has a data type
// other than float32/float16 on the scalar path (logged).
// idx is dereferenced without a null check.
// NOTE(review): this span looks like a flattened diff; each unchecked
// SetKernelArg call appears to be a pre-change line replaced by the checked call
// that follows it. Read literally, the unchecked calls would also advance
// arg_idx. Confirm against the real source file.
int ScaleOpenCLKernel::SetKernelArg(int *idx) {
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
return RET_ERROR;
}
if (weight_vector_flag_) {
// Vector path: use scale_ptr_/offset_ptr_ when set, otherwise fall back to the
// data pointers of in_tensors_[1] / in_tensors_[2].
void *scale = scale_ptr_ == nullptr ? in_tensors_[1]->data_c() : scale_ptr_;
void *offset = offset_ptr_ == nullptr ? in_tensors_[2]->data_c() : offset_ptr_;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset) != CL_SUCCESS) {
return RET_ERROR;
}
} else {
// Scalar path: read element 0 of in_tensors_[1] / in_tensors_[2] and pass it by value.
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
float scale = static_cast<float *>(in_tensors_[1]->data_c())[0];
float offset = static_cast<float *>(in_tensors_[2]->data_c())[0];
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset) != CL_SUCCESS) {
return RET_ERROR;
}
} else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
// float16 scalars are widened to float before being passed.
float16_t scale = static_cast<float16_t *>(in_tensors_[1]->data_c())[0];
float16_t offset = static_cast<float16_t *>(in_tensors_[2]->data_c())[0];
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(scale));
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(offset));
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(scale)) != CL_SUCCESS) {
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(offset)) != CL_SUCCESS) {
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[1]->data_type();
return RET_ERROR;
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
return RET_ERROR;
}
cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
return RET_ERROR;
}
// Hand the next free argument index back to the caller.
*idx = arg_idx;
return RET_OK;
}
// Sets all kernel arguments and runs kernel_.
// The leading arguments are set by SetKernelArg(&arg_idx), which also yields the
// next free index. When both weight_vector_flag_ and broadcast_flag_ are set, one
// extra integer derived from in_tensors_[1]->shape()[0] is appended; then
// param->activation_type_ is appended and the kernel is run over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) if a checked step fails, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; each unchecked call
// appears to be a pre-change line replaced by the checked call that follows it.
// Confirm against the real source file.
int ScaleOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
int arg_idx = 0;
if (SetKernelArg(&arg_idx) != RET_OK) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (weight_vector_flag_ && broadcast_flag_) {
if (broadcast_H_flag_) {
// Broadcast along H: pass shape()[0] unchanged.
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[1]->shape()[0]);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[1]->shape()[0]) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
// Otherwise pass shape()[0] divided by C4NUM via UP_DIV.
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -34,7 +34,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
private:
void Image2dGetWorkGroupSize();
int SetKernelArg(int *idx);
bool weight_vector_flag_{true};
bool broadcast_flag_{false};
bool broadcast_H_flag_{false};

View File

@ -75,7 +75,7 @@ int SoftmaxOpenCLKernel::Prepare() {
kernel_name += "Axis" + std::to_string(axis_);
}
kernel_name += "_NHWC4";
std::string program_name = "Softmax";
const std::string program_name = "Softmax";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -93,7 +93,10 @@ int SoftmaxOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return lite::RET_OK;
@ -131,24 +134,40 @@ int SoftmaxOpenCLKernel::Tune() {
return OpenCLKernel::Tune();
}
// Sets the constant kernel arguments starting at index 2: a cl_float4 mask built
// from GetMaskForLastChannel(out_shape_.C), then a cl_int4
// {N, H, W, Slice} built from out_shape_.
// Returns RET_ERROR (after logging) if a checked SetKernelArg call does not
// return CL_SUCCESS, RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the `void` signature and
// the two unchecked SetKernelArg calls appear to be the pre-change lines replaced
// by the `int` signature and the checked calls. Confirm against the real source.
void SoftmaxOpenCLKernel::SetConstArgs() {
int SoftmaxOpenCLKernel::SetConstArgs() {
int arg_idx = 2;
int channel = out_shape_.C;
int c4 = out_shape_.Slice;
auto mask_ = GetMaskForLastChannel(channel);
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// The variable is named input_shape but is filled from out_shape_.
cl_int4 input_shape = {static_cast<int>(out_shape_.N), static_cast<int>(out_shape_.H), static_cast<int>(out_shape_.W),
c4};
// Last constant argument: arg_idx is not incremented here.
ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
// Binds in_tensors_[0] and out_tensors_[0] data pointers as consecutive kernel
// arguments starting at index 0, then runs kernel_ over
// global_range_/local_range_, recording the event in event_.
// Returns RET_ERROR (after logging) if a checked call fails, lite::RET_OK otherwise.
// NOTE(review): this span looks like a flattened diff; the three unchecked calls
// below appear to be the pre-change lines replaced by the checked calls. Read
// literally, they would also advance arg_idx before the checked calls use it.
int SoftmaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
// Checked variant: input data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// Checked variant: output data pointer.
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
// RunKernel reports status as RET_OK / non-RET_OK.
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return lite::RET_OK;
}

View File

@ -30,7 +30,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override;

View File

@ -61,7 +61,7 @@ int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
int SpaceToBatchNDOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
@ -71,10 +71,23 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};
int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
@ -87,9 +100,9 @@ void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
}
int SpaceToBatchNDOpenCLKernel::Prepare() {
std::string kernel_name = "space_to_batch_nd_NHWC4";
const std::string kernel_name = "space_to_batch_nd_NHWC4";
std::string source = space_to_batch_nd_source;
std::string program_name = "space_to_batch_nd";
const std::string program_name = "space_to_batch_nd";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -101,7 +114,10 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
return ret;
}
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -109,9 +125,18 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
int SpaceToBatchNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -51,7 +51,7 @@ int SpaceToDepthOpenCLKernel::Prepare() {
kernel_name += "Align";
}
std::string source = space_to_depth_source;
std::string program_name = "SpaceToDepth";
const std::string program_name = "SpaceToDepth";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -63,28 +63,47 @@ int SpaceToDepthOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void SpaceToDepthOpenCLKernel::SetConstArgs() {
int SpaceToDepthOpenCLKernel::SetConstArgs() {
cl_int4 cl_in_shape = {static_cast<cl_int>(in_shape_.N), static_cast<cl_int>(in_shape_.H),
static_cast<cl_int>(in_shape_.W), static_cast<cl_int>(in_shape_.Slice)};
cl_int4 cl_out_shape = {static_cast<cl_int>(out_shape_.N), static_cast<cl_int>(out_shape_.H),
static_cast<cl_int>(out_shape_.W), static_cast<cl_int>(out_shape_.Slice)};
auto param = reinterpret_cast<SpaceToDepthParameter *>(op_parameter_);
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->block_size_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->block_size_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (type() == PrimitiveType_DepthToSpace) {
int co_size = out_shape_.C;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, co_size);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, co_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
int ci_size = in_shape_.C;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void SpaceToDepthOpenCLKernel::SetGlobalLocal() {
local_size_ = {};
@ -95,9 +114,18 @@ void SpaceToDepthOpenCLKernel::SetGlobalLocal() {
int SpaceToDepthOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -32,7 +32,7 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -37,7 +37,10 @@ int SparseToDenseOpenCLKernel::InitOutputToDefault() {
cl_float4 fill_value = {};
fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
auto src_data = out_tensors_[0]->data_c();
allocator_->GetImageSize(src_data, &img_size);
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
MS_LOG(ERROR) << "GetImageSize failed.";
return RET_ERROR;
}
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
@ -62,7 +65,14 @@ int SparseToDenseOpenCLKernel::InitWeights() {
auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
size_t weight_size = UP_ROUND(size, C4NUM) * sizeof_FLT;
weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
if (weight_vector_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
if (allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memset(weight_vector_, 0x00, weight_size);
if (weight_tensor->data_type() == kNumberTypeFloat16) {
if (enable_fp16_) {
@ -85,7 +95,10 @@ int SparseToDenseOpenCLKernel::InitWeights() {
memcpy(weight_vector_, weight_tensor->data_c(), size * sizeof_FLT);
}
}
allocator->UnmapBuffer(weight_vector_);
if (allocator->UnmapBuffer(weight_vector_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
@ -115,7 +128,7 @@ int SparseToDenseOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void SparseToDenseOpenCLKernel::SetConstArgs() {
int SparseToDenseOpenCLKernel::SetConstArgs() {
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
GpuTensorInfo img_info(out_tensors_[0]);
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
@ -124,11 +137,27 @@ void SparseToDenseOpenCLKernel::SetConstArgs() {
auto out_shape_temp = out_tensors_[0]->shape();
cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)};
int arg_cn = 3;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void SparseToDenseOpenCLKernel::SetGlobalLocal() {
@ -144,9 +173,9 @@ int SparseToDenseOpenCLKernel::Prepare() {
input_dim_ = in_tensors_[0]->shape().size();
inshapeindex1_dim = in_tensors_[0]->shape()[1];
weight_scalar_ = in_tensors_[2]->IsScalar();
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
const std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
std::string source = sparse_to_dense_source;
std::string program_name = "SparseToDense";
const std::string program_name = "SparseToDense";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -174,7 +203,10 @@ int SparseToDenseOpenCLKernel::Prepare() {
InitWeights();
InferShapeTo4D();
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -212,14 +244,30 @@ int SparseToDenseOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
InitOutputToDefault();
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
if (!weight_scalar_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF);
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (!weight_scalar_) {
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK;
}

View File

@ -31,7 +31,7 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int Run() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int CheckSpecs() override;

View File

@ -41,7 +41,10 @@ int SplitOpenCLKernel::RunAxis0() {
for (int i = 0; i < out_tensors_.size(); i++) {
auto dst_data = out_tensors_[i]->data_c();
ImageSize img_size;
allocator_->GetImageSize(dst_data, &img_size);
if (allocator_->GetImageSize(dst_data, &img_size) != RET_OK) {
MS_LOG(ERROR) << "GetImageSize failed.";
return RET_ERROR;
}
auto dst_area = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
@ -93,23 +96,32 @@ int SplitOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) {
int SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) {
auto allocator = ocl_runtime_->GetAllocator();
int shape_dim = in_shape.at(param->split_dim_);
if (num_split_ == 1) {
size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]);
split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int), lite::opencl::MemType::BUF));
if (split_sizes_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
for (int i = 0; i < num_split - 1; ++i) {
split_sizes_[i] = (i + 1) * param->split_sizes_[0];
}
} else {
int sum = 0;
split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int), lite::opencl::MemType::BUF));
if (split_sizes_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
for (int i = 0; i < num_split_ - 1; ++i) {
sum += param->split_sizes_[i];
split_sizes_[i] = sum;
}
}
return RET_OK;
}
int SplitOpenCLKernel::Prepare() {
@ -129,7 +141,10 @@ int SplitOpenCLKernel::Prepare() {
}
}
}
AlignSplitSizes(param, in_shape);
if (AlignSplitSizes(param, in_shape) != RET_OK) {
MS_LOG(ERROR) << "AlignSplitSizes failed.";
return RET_ERROR;
}
std::string kernel_name = "split_out";
kernel_name += std::to_string(num_split_);
kernel_name += "_axis" + std::to_string(split_dim_);
@ -138,7 +153,7 @@ int SplitOpenCLKernel::Prepare() {
}
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
std::string source = split_source;
std::string program_name = "split";
const std::string program_name = "split";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -151,12 +166,15 @@ int SplitOpenCLKernel::Prepare() {
return ret;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
}
void SplitOpenCLKernel::SetConstArgs() {
int SplitOpenCLKernel::SetConstArgs() {
int arg_cn = out_tensors_.size() + 2;
cl_int4 shape = {};
for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
@ -166,7 +184,10 @@ void SplitOpenCLKernel::SetConstArgs() {
if (Align_) {
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
for (int i = 0; i < out_tensors_.size(); ++i) {
cl_int4 temp = {};
@ -177,13 +198,21 @@ void SplitOpenCLKernel::SetConstArgs() {
if (Align_) {
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
GpuTensorInfo img_info(in_tensors_.at(0));
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
stride_w = img_info.RowPitch() / dtype;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
return;
if (!Align_) {
GpuTensorInfo img_info(in_tensors_.at(0));
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
stride_w = img_info.RowPitch() / dtype;
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void SplitOpenCLKernel::SetGlobalLocal() {
@ -205,15 +234,31 @@ int SplitOpenCLKernel::Run() {
}
int arg_cn = 0;
if (Align_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
for (int i = 0; i < out_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK;
}

View File

@ -31,12 +31,12 @@ class SplitOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
private:
void AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
int AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
int RunAxis0();
private:

View File

@ -36,7 +36,10 @@ int StackOpenCLKernel::RunAxis0() {
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
for (int i = 0; i < in_tensors_.size(); i++) {
auto src_data = in_tensors_[i]->data_c();
allocator_->GetImageSize(src_data, &img_size);
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
MS_LOG(ERROR) << "GetImageSize failed.";
return RET_ERROR;
}
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
@ -95,7 +98,7 @@ int StackOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void StackOpenCLKernel::SetConstArgs() {
int StackOpenCLKernel::SetConstArgs() {
int arg_cn = in_tensors_.size() + 1;
cl_int4 inshape_tmp = {}, outshape_tmp = {};
for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
@ -108,8 +111,14 @@ void StackOpenCLKernel::SetConstArgs() {
Broadcast2GpuShape(out_shape_.s, outshape_tmp.s, out_tensors_[0]->shape().size(), 1);
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (buffer_button_) {
GpuTensorInfo img_info_out(out_tensors_[0]);
GpuTensorInfo img_info_in(in_tensors_[0]);
@ -117,8 +126,12 @@ void StackOpenCLKernel::SetConstArgs() {
stride_w_out = img_info_out.RowPitch() / dtype;
stride_w_in = img_info_in.RowPitch() / dtype;
cl_int2 stride_w = {stride_w_out, stride_w_in};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void StackOpenCLKernel::SetGlobalLocal() {
@ -162,7 +175,7 @@ int StackOpenCLKernel::Prepare() {
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
std::string source = stack_source;
std::string program_name = "stack";
const std::string program_name = "stack";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -174,7 +187,10 @@ int StackOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
@ -188,16 +204,33 @@ int StackOpenCLKernel::Run() {
int arg_cn = 0;
if (buffer_button_) {
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
} else {
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
}
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLKernelCreator<StackOpenCLKernel>);

View File

@ -29,7 +29,7 @@ class StackOpenCLKernel : public OpenCLKernel {
~StackOpenCLKernel() override{};
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -27,9 +27,9 @@ using mindspore::lite::opencl::ImageSize;
namespace mindspore::kernel {
int StrassenOpenCLKernel::Prepare() {
std::string kernel_name = "MatMul_Strassen_NHWC4_2d";
const std::string kernel_name = "MatMul_Strassen_NHWC4_2d";
std::string source = strassen_source;
std::string program_name = "MatMul";
const std::string program_name = "MatMul";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -50,13 +50,16 @@ int StrassenOpenCLKernel::Prepare() {
if (ret != RET_OK) {
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
int StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
auto allocator = ocl_runtime_->GetAllocator();
size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
ImageSize img_size{static_cast<size_t>(UP_DIV(NumA, C4NUM)), static_cast<size_t>(NumA), img_dtype};
@ -64,15 +67,52 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
size_t memB = NumB * NumB * dtype_size;
for (int depth = 0; depth < MAXDEPTH; depth++) {
B_temp[depth] = allocator->Malloc(memB, lite::opencl::MemType::BUF);
if (B_temp[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
A_temp[depth] = allocator->Malloc(img_size);
if (A_temp[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M1[depth] = allocator->Malloc(img_size);
if (M1[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M2[depth] = allocator->Malloc(img_size);
if (M2[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M3[depth] = allocator->Malloc(img_size);
if (M3[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M4[depth] = allocator->Malloc(img_size);
if (M4[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M5[depth] = allocator->Malloc(img_size);
if (M5[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M6[depth] = allocator->Malloc(img_size);
if (M6[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
M7[depth] = allocator->Malloc(img_size);
if (M7[depth] == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int StrassenOpenCLKernel::InitWeights() {
@ -82,14 +122,25 @@ int StrassenOpenCLKernel::InitWeights() {
int NumB = in_tensors_[1]->shape()[0];
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
padWeight_ = allocator->Malloc(NumA * NumB * dtype_size, lite::opencl::MemType::BUF);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
if (padWeight_ == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
memset(padWeight_, 0x00, NumA * NumB * dtype_size);
auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
AllocatorMemoryForStrassen(NumA / 2, NumB / 2);
if (AllocatorMemoryForStrassen(NumA / 2, NumB / 2) != RET_OK) {
MS_LOG(ERROR) << "AllocatorMemoryForStrassen failed.";
return RET_ERROR;
}
size_t size = NumA * NumB * dtype_size;
if (isModelFp16) {
if (enable_fp16_) {
@ -108,7 +159,10 @@ int StrassenOpenCLKernel::InitWeights() {
memcpy(padWeightFp32, originWeightFp32, size);
}
}
allocator->UnmapBuffer(padWeight_);
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
@ -120,7 +174,7 @@ void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vect
}
// 0 : global_size_, 1: global_size_add_sub
void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
int StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM);
local_size_add_sub = {16, 1, 16};
if (type_flag == 0) {
@ -130,6 +184,7 @@ void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type
global_size_add_sub = {strassen_size_C4, 1, strassen_size};
AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_);
}
return RET_OK;
}
void StrassenOpenCLKernel::SetGlobalLocal() {
@ -142,111 +197,188 @@ void StrassenOpenCLKernel::SetGlobalLocal() {
StrassenSetGlobalLocal(strassen_size, 2); // set global_size_weights
}
void StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
bool is_matmul_kernel) {
int StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
bool is_matmul_kernel) {
cl_int4 shape;
if (is_matmul_kernel) {
shape = {1, 1, strassen_size, strassen_size};
} else {
shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)};
}
ocl_runtime_->SetKernelArg(*kernel, index, shape);
if (ocl_runtime_->SetKernelArg(*kernel, index, shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void StrassenOpenCLKernel::SetConstArgs() {
int arg_count = 2;
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
cl_int4 shape_offset = {0, 0, 0, 0};
int StrassenOpenCLKernel::SetConstArgs() {
int strassen_size = inShape[3] / 2;
out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2;
out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2;
StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false);
StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
return RET_OK;
}
void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
cl_int2 offset, lite::opencl::MemType mem_type) {
int StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
cl_int2 offset, lite::opencl::MemType mem_type) {
if (input == nullptr || output == nullptr) {
MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr";
return;
return RET_ERROR;
}
if (mem_type == lite::opencl::MemType::IMG) {
ocl_runtime_->SetKernelArg(*kernel, 0, input);
ocl_runtime_->SetKernelArg(*kernel, 1, output);
if (ocl_runtime_->SetKernelArg(*kernel, 0, input) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 1, output) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
StrassenSetConstArgs(kernel, 2, size, false);
ocl_runtime_->SetKernelArg(*kernel, 3, offset);
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(*kernel, 3, offset) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
void StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
int flag, lite::opencl::MemType mem_type) {
int StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
int flag, lite::opencl::MemType mem_type) {
if (input == nullptr || output == nullptr) {
MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr";
return;
return RET_ERROR;
}
if (mem_type == lite::opencl::MemType::IMG) {
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG);
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
} else {
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
StrassenSetConstArgs(kernel, 2, size, false);
ocl_runtime_->SetKernelArg(*kernel, 3, offset);
ocl_runtime_->SetKernelArg(*kernel, 4, flag);
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(*kernel, 3, offset) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 4, flag) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
void StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3,
void *input4, void *input5, void *input6, void *input7, void *output,
const int size) {
int StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4,
void *input5, void *input6, void *input7, void *output, const int size) {
if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr ||
input6 == nullptr || input7 == nullptr || output == nullptr) {
MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr";
return;
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 0, input1) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 1, input2) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 2, input3) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 3, input4) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 4, input5) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 5, input6) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 6, input7) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(*kernel, 7, output) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(*kernel, 0, input1);
ocl_runtime_->SetKernelArg(*kernel, 1, input2);
ocl_runtime_->SetKernelArg(*kernel, 2, input3);
ocl_runtime_->SetKernelArg(*kernel, 3, input4);
ocl_runtime_->SetKernelArg(*kernel, 4, input5);
ocl_runtime_->SetKernelArg(*kernel, 5, input6);
ocl_runtime_->SetKernelArg(*kernel, 6, input7);
ocl_runtime_->SetKernelArg(*kernel, 7, output);
StrassenSetConstArgs(kernel, 8, size, false);
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
void StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
int StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
if (input == nullptr || weight == nullptr || output == nullptr) {
MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr";
return;
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 0, input) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, output) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, 0, input);
ocl_runtime_->SetKernelArg(kernel_, 1, output);
ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF);
StrassenSetConstArgs(&kernel_, 3, size, true);
StrassenSetConstArgs(&kernel_, 4, size, true);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}
void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
const int threshold) {
int StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
const int threshold) {
const int size_2 = size / 2;
int C4 = UP_DIV(size_2, C4NUM);
if (size <= threshold) {
// run matmul;
StrassenSetGlobalLocal(size, 0);
StrassenRunMmatmul(data, weight, result, size);
return;
return RET_OK;
}
// flag = 0 : add otherwise flag = 1 : sub
// M1 = A11 * ( B12- B22)
@ -307,6 +439,7 @@ void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, co
StrassenSetGlobalLocal(size_2, 1);
StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1],
M6[depth + 1], M7[depth + 1], result, size_2);
return RET_OK;
}
int StrassenOpenCLKernel::Run() {

View File

@ -33,22 +33,22 @@ class StrassenOpenCLKernel : public MatMulOpenCLKernel {
int Run() override;
int Prepare() override;
int InitWeights() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
// strassen
private:
void AllocatorMemoryForStrassen(int NumA, int NumB);
void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
void StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
lite::opencl::MemType mem_type);
void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
lite::opencl::MemType mem_type);
void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
void *input6, void *input7, void *output, const int size);
void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
int AllocatorMemoryForStrassen(int NumA, int NumB);
int DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
int StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
int StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
int StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
lite::opencl::MemType mem_type);
int StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
lite::opencl::MemType mem_type);
int StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
void *input6, void *input7, void *output, const int size);
int StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
cl::Kernel kernel_IMG_add_sub_2;
cl::Kernel MatMul_StrassenBUFFilled;
cl::Kernel MatMul_StrassenIMGFilled;

View File

@ -85,7 +85,7 @@ int StridedSliceOpenCLKernel::CheckSpecs() {
}
int StridedSliceOpenCLKernel::Prepare() {
std::string program_name = "strided_slice";
const std::string program_name = "strided_slice";
if (!ocl_runtime_->LoadSource(program_name, strided_slice_source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -96,7 +96,10 @@ int StridedSliceOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
return RET_OK;
}
@ -187,14 +190,33 @@ int StridedSliceOpenCLKernel::InitConstArgs() {
return RET_OK;
}
void StridedSliceOpenCLKernel::SetConstArgs() {
int StridedSliceOpenCLKernel::SetConstArgs() {
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void StridedSliceOpenCLKernel::SetGlobalLocal() {
@ -214,9 +236,18 @@ void StridedSliceOpenCLKernel::SetGlobalLocal() {
// Binds the first input tensor's data as argument 0 and the first output tensor's data as
// argument 1 of kernel_, then enqueues kernel_ once with global_range_ / local_range_.
// Returns RET_OK on success, RET_ERROR if a bind or the launch fails.
int StridedSliceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -31,7 +31,7 @@ class StridedSliceOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int Prepare() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -42,11 +42,18 @@ int ToFormatOpenCLKernel::CheckSpecs() {
return RET_OK;
}
void ToFormatOpenCLKernel::SetConstArgs() {
int ToFormatOpenCLKernel::SetConstArgs() {
cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1};
ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
ocl_runtime_->SetKernelArg(kernel_, 3, shape);
if (ocl_runtime_->SetKernelArg(kernel_, 2, gsize) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 3, shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void ToFormatOpenCLKernel::SetGlobalLocal() {
@ -70,7 +77,7 @@ int ToFormatOpenCLKernel::Prepare() {
kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
this->set_name(kernel_name);
std::string program_name = "to_format";
const std::string program_name = "to_format";
std::string source = to_format_source;
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
@ -89,7 +96,10 @@ int ToFormatOpenCLKernel::Prepare() {
C_ = output.C;
SetGlobalLocal();
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -98,9 +108,18 @@ int ToFormatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
auto dst_mem_type = out_mem_type_;
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
MS_LOG(ERROR) << "RunKernel failed.";
return RET_ERROR;
}
return RET_OK;
}

View File

@ -35,7 +35,7 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int InferShape() override;

View File

@ -101,7 +101,7 @@ int TransposeOpenCLKernel::Prepare() {
kernel_name += "_NHWC4";
std::string source = transpose_source;
std::string program_name = "transpose";
const std::string program_name = "transpose";
if (!ocl_runtime_->LoadSource(program_name, source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -113,32 +113,45 @@ int TransposeOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
SetConstArgs();
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void TransposeOpenCLKernel::SetConstArgs() {
int TransposeOpenCLKernel::SetConstArgs() {
size_t n = tensor_size_.N;
size_t h = tensor_size_.H;
size_t w = tensor_size_.W;
size_t c = tensor_size_.C;
int arg_idx = 2;
cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (type_ == TransposeType::GENERAL) {
int de_perm[4]; // output to input perm
for (int i = 0; i < 4; i++) {
de_perm[perm_4d_[i]] = i;
}
cl_int4 de_perm_cl = {de_perm[0], de_perm[1], de_perm[2], de_perm[3]};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
GpuTensorInfo in_shape = GpuTensorInfo(in_tensors_[0]);
cl_int4 in_shape_int4 = {static_cast<cl_int>(in_shape.N), static_cast<cl_int>(in_shape.H),
static_cast<cl_int>(in_shape.W), static_cast<cl_int>(in_shape.C)};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_shape_int4);
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_shape_int4) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
}
return RET_OK;
}
void TransposeOpenCLKernel::SetGlobalLocal() {
@ -161,9 +174,18 @@ void TransposeOpenCLKernel::SetGlobalLocal() {
// Binds in_tensors_[0] data as argument 0 and out_tensors_[0] data as argument 1 of kernel_,
// then enqueues kernel_ once with global_range_ / local_range_.
// Returns RET_OK on success, RET_ERROR if a bind or the launch fails.
int TransposeOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int arg_idx = 0;
  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -33,7 +33,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
private:

View File

@ -78,7 +78,7 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
} // namespace
int WinogradOpenCLKernel::BuildKernel() {
std::string program_name = "winograd";
const std::string program_name = "winograd";
if (!ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source)) {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
@ -103,7 +103,7 @@ int WinogradOpenCLKernel::BuildKernel() {
return RET_OK;
}
void WinogradOpenCLKernel::InitFilter() {
int WinogradOpenCLKernel::InitFilter() {
auto allocator = ocl_runtime_->GetAllocator();
// allocate opencl memory: buffer or image2d
@ -115,9 +115,17 @@ void WinogradOpenCLKernel::InitFilter() {
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
size = width * height * CO_TILE * sizeof_FLT_;
packed_filter_ = allocator->Malloc({width, height, dtype});
if (packed_filter_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
} else {
size = UP_DIV(CO_SLICES_, Ogroup) * 6 * 6 * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
packed_filter_ = allocator->Malloc(size, MemType::BUF);
if (packed_filter_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
}
// rearrange filter
@ -128,6 +136,10 @@ void WinogradOpenCLKernel::InitFilter() {
void *src_data = winograd_filter.data();
#else
auto winograd_filter = std::make_unique<float[]>(CO_ * 6 * 6 * CI_);
if (winograd_filter == nullptr) {
MS_LOG(ERROR) << "new winograd_filter failed.";
return RET_ERROR;
}
WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
@ -147,53 +159,121 @@ void WinogradOpenCLKernel::InitFilter() {
if (filter_type_ == MemType::IMG) {
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
} else {
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
if (allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
return RET_ERROR;
}
memcpy(packed_filter_, tmp.data(), size);
allocator->UnmapBuffer(packed_filter_);
if (allocator->UnmapBuffer(packed_filter_) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
return RET_ERROR;
}
}
FreeStoredData(stored_filter_);
return RET_OK;
}
void WinogradOpenCLKernel::AllocateMemory() {
int WinogradOpenCLKernel::AllocateMemory() {
auto allocator = ocl_runtime_->GetAllocator();
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
size_t width = TILE_HW_;
size_t height = CI_SLICES_ * 36;
winograd_mem0_ = allocator->Malloc({width, height, img_dtype});
if (winograd_mem0_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
width = TILE_HW_;
height = CO_SLICES_ * 36;
winograd_mem1_ = allocator->Malloc({width, height, img_dtype});
if (winograd_mem1_ == nullptr) {
MS_LOG(ERROR) << "Malloc failed.";
return RET_ERROR;
}
return RET_OK;
}
void WinogradOpenCLKernel::SetConstArgs() {
int WinogradOpenCLKernel::SetConstArgs() {
AllocateMemory();
int arg_cn = 1;
cl_int4 input_shape = {batch_size_, OH_, OW_, CI_SLICES_}; // maybe pad=0, so use OH/OW
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_);
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_);
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, param_->pad_u_);
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_l_);
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, param_->pad_u_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_l_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_);
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
arg_cn = 2;
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_);
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_) != CL_SUCCESS) {
MS_LOG(ERROR) << "SetKernelArg failed.";
return RET_ERROR;
}
return RET_OK;
}
void WinogradOpenCLKernel::SetGlobalLocal() {
@ -205,15 +285,30 @@ void WinogradOpenCLKernel::SetGlobalLocal() {
// Runs the three Winograd kernels in order, each exactly once:
//   1. kernel_4x4to36_ with the first input tensor's data bound as argument 0 (event_).
//   2. kernel_ with no per-run arguments bound here (kernel2_event_).
//   3. kernel_36to4x4_ with the first output tensor's data bound as argument 1 (kernel3_event_).
// Returns RET_OK on success, RET_ERROR as soon as a bind or a launch fails; later kernels
// are then not enqueued.
int WinogradOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " winograd Running!";

  MS_LOG(DEBUG) << "winograd kernel0 Running!";
  if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }

  MS_LOG(DEBUG) << "winograd kernel1 Running!";
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }

  MS_LOG(DEBUG) << "winograd kernel2 Running!";
  if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

View File

@ -32,7 +32,7 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
~WinogradOpenCLKernel() override = default;
void SetConstArgs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
@ -42,8 +42,8 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
private:
int BuildKernel() override;
void InitFilter() override;
void AllocateMemory();
int InitFilter() override;
int AllocateMemory();
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_36to4x4_;

View File

@ -24,7 +24,7 @@ using mindspore::lite::RET_OK;
using mindspore::lite::opencl::ImageSize;
namespace mindspore::kernel {
int OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
std::vector<size_t> internal_global_ws = global;
for (size_t i = 0; i < local.size(); ++i) {
internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
@ -50,16 +50,12 @@ int OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std:
if (!local.empty()) {
local_range_ = cl::NDRange(local.at(0), local.at(1));
}
} else if (global.size() == 3) {
} else if (global.size() >= 3) {
global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
if (!local.empty()) {
local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
}
} else {
MS_LOG(ERROR) << "Not supported NDRange!";
return RET_ERROR;
}
return RET_OK;
}
int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
@ -112,11 +108,17 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
auto runtime = runtime_wrapper.GetInstance();
auto allocator = runtime->GetAllocator();
runtime->SyncCommandQueue();
if (!runtime->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
}
if (mem_type == lite::opencl::MemType::BUF) {
allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true);
if (allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true) == nullptr) {
MS_LOG(ERROR) << "Map Buffer failed.";
}
memcpy(data.data(), tensor->data_c(), img_info.OriginSize);
allocator->UnmapBuffer(tensor->data_c());
if (allocator->UnmapBuffer(tensor->data_c()) != RET_OK) {
MS_LOG(ERROR) << "UnmapBuffer failed.";
}
} else {
runtime->ReadImage(tensor->data_c(), data.data());
}

View File

@ -185,7 +185,7 @@ class OpenCLKernel : public InnerKernel {
ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
}
~OpenCLKernel() override = default;
int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
void AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
int Prepare() override { return RET_OK; }
int PreProcess() override;
@ -194,7 +194,7 @@ class OpenCLKernel : public InnerKernel {
virtual int CheckSpecs();
virtual int InitWeights() { return RET_OK; }
virtual void SetConstArgs() {}
virtual int SetConstArgs() { return RET_OK; }
virtual void SetGlobalLocal() {}
virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {

View File

@ -420,6 +420,7 @@ int OpenCLSubGraph::Execute() {
return ret;
}
if (!ocl_runtime_->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
return RET_ERROR;
}
return RET_OK;
@ -449,6 +450,7 @@ int OpenCLSubGraph::Execute(const KernelCallBack &before, const KernelCallBack &
return ret;
}
if (!ocl_runtime_->SyncCommandQueue()) {
MS_LOG(ERROR) << "SyncCommandQueue failed.";
return RET_ERROR;
}
return RET_OK;