forked from mindspore-Ecosystem/mindspore
fix security check
This commit is contained in:
parent
0c707cd888
commit
fe438fae9c
|
@ -108,12 +108,15 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi
|
|||
}
|
||||
if (*image == nullptr) {
|
||||
delete *buffer;
|
||||
*buffer = nullptr;
|
||||
MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
|
||||
return nullptr;
|
||||
}
|
||||
if (ret != CL_SUCCESS) {
|
||||
delete *buffer;
|
||||
delete *image;
|
||||
*buffer = nullptr;
|
||||
*image = nullptr;
|
||||
MS_LOG(ERROR) << "Create OpenCL Image2D (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -125,6 +128,8 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi
|
|||
if (host_ptr == nullptr) {
|
||||
delete *buffer;
|
||||
delete *image;
|
||||
*buffer = nullptr;
|
||||
*image = nullptr;
|
||||
MS_LOG(ERROR) << "Map image failed, can not found image :" << *image << ", host_ptr=" << host_ptr;
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -210,6 +210,7 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
|
|||
#endif
|
||||
if (context_ == nullptr || ret != CL_SUCCESS) {
|
||||
delete device_;
|
||||
device_ = nullptr;
|
||||
MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -218,6 +219,8 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
|
|||
if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
|
||||
delete device_;
|
||||
delete context_;
|
||||
device_ = nullptr;
|
||||
context_ = nullptr;
|
||||
MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -227,6 +230,9 @@ int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
|
|||
delete device_;
|
||||
delete context_;
|
||||
delete default_command_queue_;
|
||||
device_ = nullptr;
|
||||
context_ = nullptr;
|
||||
default_command_queue_ = nullptr;
|
||||
MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -291,6 +297,10 @@ int OpenCLRuntime::Init() {
|
|||
delete context_;
|
||||
delete default_command_queue_;
|
||||
delete profiling_command_queue_;
|
||||
device_ = nullptr;
|
||||
context_ = nullptr;
|
||||
default_command_queue_ = nullptr;
|
||||
profiling_command_queue_ = nullptr;
|
||||
MS_LOG(ERROR) << "Command OpenCL allocator failed!";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -305,7 +315,9 @@ int OpenCLRuntime::Uninit() {
|
|||
if (init_state_ != InitSuccess) {
|
||||
return RET_OK;
|
||||
}
|
||||
StoreCache();
|
||||
if (StoreCache() != RET_OK) {
|
||||
MS_LOG(ERROR) << "StoreCache failed!";
|
||||
}
|
||||
program_map_.clear();
|
||||
delete default_command_queue_;
|
||||
delete profiling_command_queue_;
|
||||
|
@ -574,12 +586,15 @@ void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size,
|
|||
|
||||
int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
|
||||
if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
|
||||
return RET_OK;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (command_queue == nullptr) {
|
||||
command_queue = default_command_queue_;
|
||||
}
|
||||
return clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr);
|
||||
if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> ®ion,
|
||||
|
@ -720,17 +735,17 @@ void OpenCLRuntime::LoadCache() {
|
|||
MS_LOG(INFO) << "Init opencl cache success";
|
||||
}
|
||||
|
||||
void OpenCLRuntime::StoreCache() {
|
||||
int OpenCLRuntime::StoreCache() {
|
||||
if (!enable_cache_) {
|
||||
return;
|
||||
return RET_OK;
|
||||
}
|
||||
if (!flush_cache_) {
|
||||
return;
|
||||
return RET_OK;
|
||||
}
|
||||
auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
|
||||
if (fbb == nullptr) {
|
||||
MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
|
||||
return;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
|
||||
for (const auto &kv : program_map_) {
|
||||
|
@ -753,8 +768,12 @@ void OpenCLRuntime::StoreCache() {
|
|||
auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
|
||||
fbb->Finish(gpu_cache);
|
||||
uint8_t *buf = fbb->GetBufferPointer();
|
||||
WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize());
|
||||
if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "WriteToBin failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
|
||||
|
|
|
@ -203,7 +203,7 @@ class OpenCLRuntime {
|
|||
// for cache
|
||||
private:
|
||||
void LoadCache();
|
||||
void StoreCache();
|
||||
int StoreCache();
|
||||
#ifdef MS_OPENCL_BINARY_CACHE
|
||||
bool enable_cache_{true};
|
||||
#else
|
||||
|
|
|
@ -65,37 +65,53 @@ int ActivationOpenCLKernel::CheckSpecs() {
|
|||
int ActivationOpenCLKernel::Prepare() {
|
||||
outShape = GpuTensorInfo(out_tensors_[0]);
|
||||
std::string source = activation_source;
|
||||
std::string program_name = "Activation";
|
||||
const std::string program_name = "Activation";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::string kernel_name = GetActTypeString(type_);
|
||||
const std::string kernel_name = GetActTypeString(type_);
|
||||
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
|
||||
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ActivationOpenCLKernel::SetConstArgs() {
|
||||
int ActivationOpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 2;
|
||||
cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (type_ == ActivationType_LEAKY_RELU) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (type_ == ActivationType_SIGMOID) {
|
||||
int c4 = outShape.Slice;
|
||||
int last_c4 = outShape.C % 4 == 0 ? 4 : outShape.C % 4;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ActivationOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -107,8 +123,14 @@ void ActivationOpenCLKernel::SetGlobalLocal() {
|
|||
int ActivationOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
|
||||
|
|
|
@ -35,7 +35,7 @@ class ActivationOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <cstring>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/argminmax.h"
|
||||
|
@ -58,19 +59,41 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArgMinMaxOpenCLKernel::SetConstArgs() {
|
||||
int ArgMinMaxOpenCLKernel::SetConstArgs() {
|
||||
auto param = reinterpret_cast<ArgMinMaxParameter *>(op_parameter_);
|
||||
cl_int4 in_shape{static_cast<int>(im_in_.N), static_cast<int>(im_in_.H), static_cast<int>(im_in_.W),
|
||||
static_cast<int>(im_in_.C)};
|
||||
cl_int4 flags = {param->out_value_, param->get_max_, param->axis_, param->topk_};
|
||||
int arg_cnt = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, buff_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, ids_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, cus_size_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, strides_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, flags);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, buff_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, ids_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, cus_size_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, strides_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, flags) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -134,14 +157,22 @@ int ArgMinMaxOpenCLKernel::InitWeights() {
|
|||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
int dtype_size = ocl_runtime_->GetFp16Enable() ? sizeof(int16_t) : sizeof(float);
|
||||
buff_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * dtype_size, lite::opencl::MemType::BUF);
|
||||
if (buff_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ids_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * sizeof(int32_t), lite::opencl::MemType::BUF);
|
||||
if (ids_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArgMinMaxOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "argminmax";
|
||||
const std::string kernel_name = "argminmax";
|
||||
std::string source = argminmax_source;
|
||||
std::string program_name = "argminmax";
|
||||
const std::string program_name = "argminmax";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -162,16 +193,28 @@ int ArgMinMaxOpenCLKernel::Prepare() {
|
|||
|
||||
InitWeights();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArgMinMaxOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int InitWeights() override;
|
||||
int Tune() override { return lite::RET_OK; }
|
||||
|
|
|
@ -98,6 +98,10 @@ int ArithmeticOpenCLKernel::InitWeights() {
|
|||
size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
ImageSize img_size{in_shape.width, in_shape.height, dtype};
|
||||
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
|
||||
if (weight_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
weight_ptrs_.push_back(weight_ptr_);
|
||||
} else {
|
||||
weight_ptrs_.push_back(nullptr);
|
||||
|
@ -106,7 +110,7 @@ int ArithmeticOpenCLKernel::InitWeights() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArithmeticOpenCLKernel::SetConstArgs() {
|
||||
int ArithmeticOpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 3;
|
||||
if (!element_flag_) {
|
||||
cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H), static_cast<int>(in0_shape_.W),
|
||||
|
@ -121,16 +125,38 @@ void ArithmeticOpenCLKernel::SetConstArgs() {
|
|||
} else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
|
||||
broadcastC_flag = 2; // BroadCast C4 in input1
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::Prepare() {
|
||||
|
@ -179,7 +205,7 @@ int ArithmeticOpenCLKernel::Prepare() {
|
|||
activation_max_ = 6.f;
|
||||
}
|
||||
|
||||
std::string program_name = "Arithmetic";
|
||||
const std::string program_name = "Arithmetic";
|
||||
std::string source = arithmetic_source;
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -196,7 +222,10 @@ int ArithmeticOpenCLKernel::Prepare() {
|
|||
if (type() != PrimitiveType_BiasAdd) {
|
||||
InitWeights();
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -206,10 +235,22 @@ int ArithmeticOpenCLKernel::Run() {
|
|||
auto input_0_ptr = weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : weight_ptrs_[0];
|
||||
auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1];
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -86,7 +86,7 @@ int ArithmeticSelfOpenCLKernel::Prepare() {
|
|||
kernel_name += std::string(schema::EnumNamePrimitiveType(type())) + "_NHWC4";
|
||||
}
|
||||
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
|
||||
std::string program_name = "ArithmeticSelf";
|
||||
const std::string program_name = "ArithmeticSelf";
|
||||
if (!ocl_runtime_->LoadSource(program_name, arithmeticself_source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -98,15 +98,27 @@ int ArithmeticSelfOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticSelfOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -47,7 +47,13 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override { ocl_runtime_->SetKernelArg(kernel_, 2, output_shape_); }
|
||||
int SetConstArgs() override {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, output_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
int Run() override;
|
||||
|
|
|
@ -55,7 +55,7 @@ int BatchToSpaceNDOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
|
||||
int BatchToSpaceNDOpenCLKernel::SetConstArgs() {
|
||||
auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);
|
||||
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
||||
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
|
||||
|
@ -66,10 +66,23 @@ void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
|
|||
cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};
|
||||
|
||||
int arg_cnt = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -82,9 +95,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
|
|||
}
|
||||
|
||||
int BatchToSpaceNDOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "batch_to_space_nd_NHWC4";
|
||||
const std::string kernel_name = "batch_to_space_nd_NHWC4";
|
||||
std::string source = batch_to_space_nd_source;
|
||||
std::string program_name = "batch_to_space_nd";
|
||||
const std::string program_name = "batch_to_space_nd";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -96,16 +109,28 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchToSpaceNDOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override { return lite::RET_OK; }
|
||||
|
||||
|
|
|
@ -59,15 +59,25 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
|
|||
local->push_back(z);
|
||||
}
|
||||
|
||||
void BatchNormOpenCLKernel::SetConstArgs() {
|
||||
int BatchNormOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = 6;
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
|
||||
auto input0_shape = in_tensors_.at(0)->shape();
|
||||
cl_int4 input_shape_ = {input0_shape.at(0), input0_shape.at(1), input0_shape.at(2),
|
||||
UP_DIV(input0_shape.at(3), C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input0_shape.at(3));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input0_shape.at(3)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void BatchNormOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -83,6 +93,41 @@ void BatchNormOpenCLKernel::SetGlobalLocal() {
|
|||
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
int BatchNormOpenCLKernel::UnmapBuffer() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
if (allocator->UnmapBuffer(scale_) != RET_OK) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->UnmapBuffer(offset_) != RET_OK) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->UnmapBuffer(mean_) != RET_OK) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->UnmapBuffer(variance_) != RET_OK) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchNormOpenCLKernel::MapBuffer() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
if (allocator->MapBuffer(scale_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(offset_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(mean_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(variance_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchNormOpenCLKernel::Initweight() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
GpuTensorInfo img_info(in_tensors_.at(1));
|
||||
|
@ -90,15 +135,30 @@ int BatchNormOpenCLKernel::Initweight() {
|
|||
size_t weight_size = img_info.OriginSize;
|
||||
// allocated memory for weight and init value
|
||||
scale_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
if (scale_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
offset_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
if (offset_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
mean_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
if (mean_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
variance_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
if (variance_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
allocator->MapBuffer(scale_, CL_MAP_WRITE, nullptr, true);
|
||||
allocator->MapBuffer(offset_, CL_MAP_WRITE, nullptr, true);
|
||||
allocator->MapBuffer(mean_, CL_MAP_WRITE, nullptr, true);
|
||||
allocator->MapBuffer(variance_, CL_MAP_WRITE, nullptr, true);
|
||||
|
||||
if (MapBuffer() != RET_OK) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(scale_, 1, weight_size);
|
||||
memset(offset_, 0x00, weight_size);
|
||||
memset(mean_, 0x00, weight_size);
|
||||
|
@ -153,18 +213,18 @@ int BatchNormOpenCLKernel::Initweight() {
|
|||
memcpy(variance_, in_tensors_.at(4)->data_c(), weight_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(scale_);
|
||||
allocator->UnmapBuffer(offset_);
|
||||
allocator->UnmapBuffer(mean_);
|
||||
allocator->UnmapBuffer(variance_);
|
||||
if (UnmapBuffer() != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchNormOpenCLKernel::Prepare() {
|
||||
use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
|
||||
std::string kernel_name = "Batch_normalization_NHWC4";
|
||||
const std::string kernel_name = "Batch_normalization_NHWC4";
|
||||
std::string source = batchnorm_source;
|
||||
std::string program_name = "Batch_normalization";
|
||||
const std::string program_name = "Batch_normalization";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -181,7 +241,10 @@ int BatchNormOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Initweight failed ";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
|
||||
return RET_OK;
|
||||
|
@ -190,13 +253,34 @@ int BatchNormOpenCLKernel::Prepare() {
|
|||
int BatchNormOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
int arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, scale_, lite::opencl::MemType::BUF); // scale
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, offset_, lite::opencl::MemType::BUF); // offset
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF); // mean
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, variance_, lite::opencl::MemType::BUF); // variance
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()); // out tensor
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // input tensor
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, scale_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // scale
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, offset_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // offset
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // mean
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, variance_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // variance
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // out tensor
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,11 +32,13 @@ class BatchNormOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
int Initweight();
|
||||
int UnmapBuffer();
|
||||
int MapBuffer();
|
||||
|
||||
private:
|
||||
bool use_fp16_enable_{false};
|
||||
|
|
|
@ -52,9 +52,13 @@ int CastOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void CastOpenCLKernel::SetConstArgs() {
|
||||
int CastOpenCLKernel::SetConstArgs() {
|
||||
cl_int2 shape = {static_cast<int>(shape_.width), static_cast<int>(shape_.height)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void CastOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -68,8 +72,8 @@ int CastOpenCLKernel::Prepare() {
|
|||
{kNumberTypeFloat32, "fp32"},
|
||||
{kNumberTypeFloat16, "fp16"},
|
||||
};
|
||||
std::string program_name = "Cast";
|
||||
std::string kernel_name =
|
||||
const std::string program_name = "Cast";
|
||||
const std::string kernel_name =
|
||||
"Cast_" + dtype_names[in_tensors_.front()->data_type()] + "_to_" + dtype_names[out_tensors_.front()->data_type()];
|
||||
if (!ocl_runtime_->LoadSource(program_name, cast_source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -80,16 +84,28 @@ int CastOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int CastOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class CastOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
int Run() override;
|
||||
|
|
|
@ -38,7 +38,10 @@ int ConcatOpenCLKernel::RunAxis0() {
|
|||
auto *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
|
||||
for (int i = 0; i < in_tensors_.size(); i++) {
|
||||
auto src_data = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : weight_ptrs_.at(i);
|
||||
allocator_->GetImageSize(src_data, &img_size);
|
||||
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
|
||||
MS_LOG(ERROR) << "GetImageSize failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
|
||||
auto *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
|
||||
|
@ -107,7 +110,7 @@ int ConcatOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ConcatOpenCLKernel::SetConstArgs() {
|
||||
int ConcatOpenCLKernel::SetConstArgs() {
|
||||
GpuTensorInfo img_info(out_tensors_[0]);
|
||||
size_t dtype = ocl_runtime_->GetFp16Enable() ? sizeof(cl_half) : sizeof(cl_float);
|
||||
stride_w = img_info.RowPitch() / dtype;
|
||||
|
@ -124,9 +127,15 @@ void ConcatOpenCLKernel::SetConstArgs() {
|
|||
temp.s[j] = in_tensor->shape()[j];
|
||||
}
|
||||
Broadcast2GpuShape(in_shape_.s, temp.s, in_tensor->shape().size(), 1);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
|
||||
} else {
|
||||
for (auto &in_tensor : in_tensors_) {
|
||||
cl_int4 temp = {};
|
||||
|
@ -135,11 +144,18 @@ void ConcatOpenCLKernel::SetConstArgs() {
|
|||
}
|
||||
Broadcast2GpuShape(in_shape_.s, temp.s, in_tensor->shape().size(), 1);
|
||||
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ConcatOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -190,6 +206,10 @@ int ConcatOpenCLKernel::ConvertWeightToTensor() {
|
|||
}
|
||||
ImageSize img_size{in_shape.width, in_shape.height, dtype};
|
||||
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
|
||||
if (weight_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
weight_ptrs_.push_back(weight_ptr_);
|
||||
} else {
|
||||
weight_ptrs_.push_back(nullptr);
|
||||
|
@ -222,7 +242,7 @@ int ConcatOpenCLKernel::Prepare() {
|
|||
kernel_name += "_NHWC4";
|
||||
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
|
||||
std::string source = concat_source;
|
||||
std::string program_name = "Concat";
|
||||
const std::string program_name = "Concat";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -234,7 +254,10 @@ int ConcatOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -247,14 +270,27 @@ int ConcatOpenCLKernel::Run() {
|
|||
int arg_cn = 0;
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
auto input_ptr = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : weight_ptrs_.at(i);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (axis_ == 3 && !Align_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
|
||||
|
|
|
@ -108,7 +108,10 @@ int Conv2DOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -142,7 +145,7 @@ void Conv2DOpenCLKernel::InitAttrs() {
|
|||
|
||||
int Conv2DOpenCLKernel::BuildKernel() {
|
||||
SetBlockSize();
|
||||
std::string program_name = "conv2d";
|
||||
const std::string program_name = "conv2d";
|
||||
std::stringstream kernel_name;
|
||||
kernel_name << "Conv2D_H" << block_size_.H << "W" << block_size_.W << "C" << block_size_.C;
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
|
@ -245,9 +248,11 @@ void Conv2DOpenCLKernel::SetMaliFp16BlockSize(int task_size_per_cu, bool w_kerne
|
|||
}
|
||||
|
||||
int Conv2DOpenCLKernel::InitWeights() {
|
||||
InitFilter();
|
||||
if (InitFilter() != RET_OK) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (has_bias_) {
|
||||
InitBias();
|
||||
return InitBias();
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -300,7 +305,7 @@ void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, Fil
|
|||
}
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::InitFilter() {
|
||||
int Conv2DOpenCLKernel::InitFilter() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// allocate opencl memory: buffer or image2d
|
||||
|
@ -312,9 +317,17 @@ void Conv2DOpenCLKernel::InitFilter() {
|
|||
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
size = width * height * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc({width, height, dtype});
|
||||
if (packed_filter_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
size = UP_DIV(CO_SLICES_, Ogroup) * KH_ * KW_ * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size, lite::opencl::MemType::BUF);
|
||||
if (packed_filter_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
// rearrange filter
|
||||
|
@ -333,15 +346,22 @@ void Conv2DOpenCLKernel::InitFilter() {
|
|||
if (filter_type_ == MemType::IMG) {
|
||||
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
|
||||
} else {
|
||||
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
|
||||
if (allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(packed_filter_, tmp.data(), size);
|
||||
allocator->UnmapBuffer(packed_filter_);
|
||||
if (allocator->UnmapBuffer(packed_filter_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
FreeStoredData(stored_filter_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::InitBias() {
|
||||
int Conv2DOpenCLKernel::InitBias() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// align bias from C to C4
|
||||
|
@ -349,8 +369,15 @@ void Conv2DOpenCLKernel::InitBias() {
|
|||
void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
|
||||
size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
|
||||
packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
|
||||
if (packed_bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
|
||||
if (allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(packed_bias_, 0x00, packed_bias_size);
|
||||
if (bias_tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (use_fp16_) {
|
||||
|
@ -375,11 +402,15 @@ void Conv2DOpenCLKernel::InitBias() {
|
|||
memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(packed_bias_);
|
||||
if (allocator->UnmapBuffer(packed_bias_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::SetConstArgs() {
|
||||
int Conv2DOpenCLKernel::SetConstArgs() {
|
||||
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
|
||||
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
|
||||
cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_};
|
||||
|
@ -387,15 +418,43 @@ void Conv2DOpenCLKernel::SetConstArgs() {
|
|||
cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_};
|
||||
|
||||
int arg_cn = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -429,9 +488,18 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
|
|||
|
||||
int Conv2DOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
|
|||
int CheckSpecs() override;
|
||||
int Prepare() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
|
||||
|
@ -78,8 +78,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
|
|||
protected:
|
||||
void InitAttrs();
|
||||
virtual int BuildKernel();
|
||||
virtual void InitFilter();
|
||||
void InitBias();
|
||||
virtual int InitFilter();
|
||||
int InitBias();
|
||||
bool use_fp16_{false};
|
||||
size_t sizeof_FLT_{4};
|
||||
ConvParameter *param_{nullptr};
|
||||
|
|
|
@ -55,10 +55,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
|
|||
}
|
||||
|
||||
int Conv2dTransposeOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "conv2d_transpose";
|
||||
const std::string kernel_name = "conv2d_transpose";
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
std::string source = GetActDefines() + conv2d_transpose_source;
|
||||
std::string program_name = "conv2d_transpose";
|
||||
const std::string program_name = "conv2d_transpose";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -74,7 +74,10 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -94,7 +97,7 @@ void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
|
|||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void Conv2dTransposeOpenCLKernel::SetConstArgs() {
|
||||
int Conv2dTransposeOpenCLKernel::SetConstArgs() {
|
||||
int arg_cnt = 2;
|
||||
auto *param = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
int ci = in_tensors_[0]->shape()[3];
|
||||
|
@ -115,14 +118,39 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
|
|||
cl_int2 padding = {pad_h, pad_w};
|
||||
cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), n};
|
||||
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), n};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt, static_cast<cl_int>(param->act_type_));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt, static_cast<cl_int>(param->act_type_)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Conv2dTransposeOpenCLKernel::InitWeights() {
|
||||
|
@ -147,7 +175,15 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
|
|||
// IHWO to OHWI4(I)4(O)(converter format is IHWO)
|
||||
// init padWeight_(buffer mem)
|
||||
padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
|
||||
auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
|
||||
auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
|
||||
|
@ -188,7 +224,10 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
|
|||
}
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(padWeight_);
|
||||
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_weight_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -208,7 +247,15 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
|
|||
}
|
||||
ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
|
||||
bias_ = allocator->Malloc(img_size);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(bias_, 0x00, div_co * C4NUM * data_size);
|
||||
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
|
||||
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
|
||||
|
@ -225,7 +272,10 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
|
|||
memcpy(bias_, src_data, co * data_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(bias_);
|
||||
if (allocator->UnmapBuffer(bias_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -233,9 +283,18 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
|
|||
int Conv2dTransposeOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_cnt = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
|
|||
int InitWeights() override;
|
||||
int InitFilter();
|
||||
int InitBias();
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int StoreConstData() override;
|
||||
|
||||
|
|
|
@ -73,7 +73,7 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
|
|||
} else {
|
||||
block_size_.C = block_size_.H = block_size_.W = 1;
|
||||
}
|
||||
std::string program_name = "DepthwiseConv2d";
|
||||
const std::string program_name = "DepthwiseConv2d";
|
||||
std::string source = depthwise_conv2d_source;
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -94,7 +94,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -153,10 +156,12 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
|
|||
size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
ImageSize img_size{(size_t)plane_out / C4NUM, (size_t)out_info.N * CO4, img_dtype};
|
||||
packed_weight_ = allocator->Malloc(img_size, temp_filter.data());
|
||||
|
||||
} else {
|
||||
packed_weight_ = allocator->Malloc(pack_weight_size, temp_filter.data());
|
||||
}
|
||||
if (packed_weight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_weight_);
|
||||
|
@ -199,13 +204,15 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
|
|||
}
|
||||
bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
|
||||
if (bias_data_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
FreeStoredData(stored_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
|
||||
int DepthwiseConv2dOpenCLKernel::SetConstArgs() {
|
||||
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
auto in_info = GpuTensorInfo(in_tensors_[0]);
|
||||
auto out_info = GpuTensorInfo(out_tensors_[0]);
|
||||
|
@ -222,16 +229,47 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
|
|||
cl_int4 dst_size = {(cl_int)out_info.W, (cl_int)out_info.H, (cl_int)CO4, (cl_int)out_info.N};
|
||||
|
||||
int arg_cnt = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, filter_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, filter_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -286,9 +324,18 @@ int DepthwiseConv2dOpenCLKernel::StoreConstData() {
|
|||
|
||||
int DepthwiseConv2dOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -41,7 +41,7 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
|
|||
int CheckSpecs() override;
|
||||
int InitWeights() override;
|
||||
int InitBias();
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int StoreConstData() override;
|
||||
|
||||
|
|
|
@ -35,7 +35,10 @@ int FillOpenCLKernel::RunFill() {
|
|||
cl_int4 fill_value = {};
|
||||
fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
|
||||
auto src_data = out_tensors_[0]->data_c();
|
||||
allocator_->GetImageSize(src_data, &img_size);
|
||||
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
|
||||
MS_LOG(ERROR) << "GetImageSize failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
|
||||
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
|
||||
|
@ -59,7 +62,7 @@ int FillOpenCLKernel::RunShape() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void FillOpenCLKernel::SetConstArgs() {}
|
||||
int FillOpenCLKernel::SetConstArgs() { return RET_OK; }
|
||||
|
||||
void FillOpenCLKernel::SetGlobalLocal() {}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class FillOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
int Run() override;
|
||||
|
|
|
@ -98,7 +98,7 @@ int FullConnectionOpenCLKernel::Prepare() {
|
|||
kernel_name = "FullConnectionWeightVar";
|
||||
}
|
||||
std::string source = fullconnection_source;
|
||||
std::string program_name = "FullConnection";
|
||||
const std::string program_name = "FullConnection";
|
||||
if (!ocl_runtime_->LoadSource(program_name, GetActDefines() + source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -113,7 +113,10 @@ int FullConnectionOpenCLKernel::Prepare() {
|
|||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
|
@ -137,7 +140,15 @@ int FullConnectionOpenCLKernel::InitFilter() {
|
|||
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
padWeight_ = allocator->Malloc(nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size,
|
||||
lite::opencl::MemType::BUF);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
|
||||
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
|
||||
memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
|
||||
|
@ -183,7 +194,10 @@ int FullConnectionOpenCLKernel::InitFilter() {
|
|||
}
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(padWeight_);
|
||||
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_weight_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -202,7 +216,15 @@ int FullConnectionOpenCLKernel::InitBias() {
|
|||
}
|
||||
ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
|
||||
bias_ = allocator->Malloc(img_size);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(bias_, 0x00, co4 * C4NUM * dtype_size);
|
||||
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
|
||||
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
|
||||
|
@ -218,7 +240,10 @@ int FullConnectionOpenCLKernel::InitBias() {
|
|||
memcpy(bias_, src_data, CO_ * dtype_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(bias_);
|
||||
if (allocator->UnmapBuffer(bias_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -231,22 +256,44 @@ void FullConnectionOpenCLKernel::SetGlobalLocal() {
|
|||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void FullConnectionOpenCLKernel::SetConstArgs() {
|
||||
int FullConnectionOpenCLKernel::SetConstArgs() {
|
||||
if (!weight_var_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, padWeight_, lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
int arg_count = 3;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, N_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, N_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto intensor_shape = GpuTensorInfo(in_tensors_[0]);
|
||||
int CI4 = CI_remainder_ * intensor_shape.Slice;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, CI4);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, UP_DIV(CO_, C4NUM));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, CI4) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, UP_DIV(CO_, C4NUM)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto in_shape_info = GpuTensorInfo(in_tensors_[0]);
|
||||
cl_int2 in_img_shape = {static_cast<int>(in_shape_info.height), static_cast<int>(in_shape_info.width)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_img_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_img_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FullConnectionOpenCLKernel::StoreConstData() {
|
||||
|
@ -270,12 +317,24 @@ int FullConnectionOpenCLKernel::StoreConstData() {
|
|||
int FullConnectionOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_count = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
|
||||
if (weight_var_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (weight_var_) {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override { return lite::RET_OK; }
|
||||
int StoreConstData() override;
|
||||
|
|
|
@ -164,8 +164,8 @@ bool IsEltwiseAndOperatorSupported(LiteKernel *node) {
|
|||
|
||||
int FusionEltwiseOpenCLKernel::Prepare() {
|
||||
std::string source = Codegen();
|
||||
std::string program_name = "FusionEltwise\n" + source;
|
||||
std::string kernel_name = "FusionEltwise";
|
||||
const std::string program_name = "FusionEltwise\n" + source;
|
||||
const std::string kernel_name = "FusionEltwise";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -183,7 +183,10 @@ int FusionEltwiseOpenCLKernel::Prepare() {
|
|||
}
|
||||
InitWeights();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -217,7 +220,14 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
|
|||
size_t num = tensor_info.ElementsNum;
|
||||
size_t size = tensor_info.Image2DSize;
|
||||
void *buffer = allocator->Malloc(size, lite::opencl::MemType::BUF);
|
||||
allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true);
|
||||
if (buffer == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(buffer, 0x00, size);
|
||||
if (tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (use_fp16) {
|
||||
|
@ -232,7 +242,10 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
|
|||
CopyNumber<float32_t, float32_t>(buffer, tensor->data_c(), num);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(buffer);
|
||||
if (allocator->UnmapBuffer(buffer) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
buffer_weights_.push_back(buffer);
|
||||
}
|
||||
}
|
||||
|
@ -247,7 +260,7 @@ void FusionEltwiseOpenCLKernel::SetGlobalLocal() {
|
|||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void FusionEltwiseOpenCLKernel::SetConstArgs() {
|
||||
int FusionEltwiseOpenCLKernel::SetConstArgs() {
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
cl_int4 output_shape = {static_cast<cl_int>(output.N), static_cast<cl_int>(output.H), static_cast<cl_int>(output.W),
|
||||
static_cast<cl_int>(output.C)};
|
||||
|
@ -260,18 +273,32 @@ void FusionEltwiseOpenCLKernel::SetConstArgs() {
|
|||
if (IsScalar(in_tensor->shape())) {
|
||||
if (ocl_runtime_->GetFp16Enable()) {
|
||||
auto value = static_cast<float16_t>(scalar_weights_[scalar_idx++]);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast<cl_half *>(&value)));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast<cl_half *>(&value))) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
arg_idx++; // for act input
|
||||
}
|
||||
arg_idx++; // for output
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FusionEltwiseOpenCLKernel::Run() {
|
||||
|
@ -279,12 +306,21 @@ int FusionEltwiseOpenCLKernel::Run() {
|
|||
int arg_idx = 0;
|
||||
for (auto *in_tensor : in_tensors_) {
|
||||
if (!in_tensor->IsConst()) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -162,7 +162,7 @@ class FusionEltwiseOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int InitWeights() override;
|
||||
void SetGlobalLocal() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
int Run() override;
|
||||
|
||||
void ClearParameter() { op_parameter_ = nullptr; }
|
||||
|
|
|
@ -81,7 +81,7 @@ int GatherOpenCLKernel::CheckSpecs() {
|
|||
}
|
||||
}
|
||||
|
||||
void GatherOpenCLKernel::SetConstArgs() {
|
||||
int GatherOpenCLKernel::SetConstArgs() {
|
||||
auto input = GpuTensorInfo(in_tensors_.front());
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
int indices_num = in_tensors_.at(1)->ElementsNum();
|
||||
|
@ -90,10 +90,23 @@ void GatherOpenCLKernel::SetConstArgs() {
|
|||
cl_int4 dst_size = {static_cast<cl_int>(output.W), static_cast<cl_int>(output.H), static_cast<cl_int>(output.Slice),
|
||||
static_cast<cl_int>(output.N)};
|
||||
int arg_cnt = 3;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void GatherOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -104,11 +117,11 @@ void GatherOpenCLKernel::SetGlobalLocal() {
|
|||
}
|
||||
|
||||
int GatherOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "gather";
|
||||
const std::string kernel_name = "gather";
|
||||
if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) {
|
||||
axis_ = 3;
|
||||
}
|
||||
std::string program_name = "gather";
|
||||
const std::string program_name = "gather";
|
||||
if (!ocl_runtime_->LoadSource(program_name, gather_source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -127,7 +140,10 @@ int GatherOpenCLKernel::Prepare() {
|
|||
}
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -135,11 +151,21 @@ int GatherOpenCLKernel::Prepare() {
|
|||
int GatherOpenCLKernel::ConvertTensorToweight() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
auto indices_tensor = in_tensors_.at(1);
|
||||
allocator->MapBuffer(indices_tensor->data_c(), CL_MAP_WRITE, nullptr, true);
|
||||
if (allocator->MapBuffer(indices_tensor->data_c(), CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto indices_num = indices_tensor->ElementsNum();
|
||||
indices_data_ =
|
||||
reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num, lite::opencl::MemType::BUF));
|
||||
allocator->MapBuffer(indices_data_, CL_MAP_WRITE, nullptr, true);
|
||||
if (indices_data_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(indices_data_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (indices_data_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Memory allocation failed";
|
||||
return RET_ERROR;
|
||||
|
@ -155,8 +181,14 @@ int GatherOpenCLKernel::ConvertTensorToweight() {
|
|||
<< " But Your type is :" << data_type;
|
||||
return RET_ERROR;
|
||||
}
|
||||
allocator->UnmapBuffer(indices_data_);
|
||||
allocator->UnmapBuffer(indices_tensor->data_c());
|
||||
if (allocator->UnmapBuffer(indices_data_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->UnmapBuffer(indices_tensor->data_c()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -197,7 +229,10 @@ int GatherOpenCLKernel::PreProcess() {
|
|||
if (!InferShapeDone()) {
|
||||
auto indices_tensor = in_tensors_[1];
|
||||
if (!indices_tensor->IsConst()) {
|
||||
ocl_runtime_->SyncCommandQueue();
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
indices_tensor->MutableData();
|
||||
}
|
||||
}
|
||||
|
@ -209,10 +244,22 @@ int GatherOpenCLKernel::Run() {
|
|||
if (intensor1_is_tensor) {
|
||||
ConvertTensorToweight();
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ class GatherOpenCLKernel : public OpenCLKernel {
|
|||
int PreProcess() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override { return lite::RET_OK; }
|
||||
int ConvertTensorToweight();
|
||||
|
|
|
@ -98,6 +98,10 @@ int ArithmeticInt8OpenCLKernel::InitWeights() {
|
|||
size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
ImageSize img_size{in_shape.width, in_shape.height, dtype};
|
||||
auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
|
||||
if (weight_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
weight_ptrs_.push_back(weight_ptr_);
|
||||
} else {
|
||||
weight_ptrs_.push_back(nullptr);
|
||||
|
@ -106,7 +110,7 @@ int ArithmeticInt8OpenCLKernel::InitWeights() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArithmeticInt8OpenCLKernel::SetConstArgs() {
|
||||
int ArithmeticInt8OpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 3;
|
||||
if (!element_flag_) {
|
||||
cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H), static_cast<int>(in0_shape_.W),
|
||||
|
@ -121,16 +125,37 @@ void ArithmeticInt8OpenCLKernel::SetConstArgs() {
|
|||
} else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
|
||||
broadcastC_flag = 2; // BroadCast C4 in input1
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
|
||||
|
||||
// set quantization parameter.
|
||||
auto input0_quant_param = in_tensors_[0]->quant_params().front();
|
||||
|
@ -141,8 +166,15 @@ void ArithmeticInt8OpenCLKernel::SetConstArgs() {
|
|||
cl_char4 zero_point = {static_cast<int8_t>(input0_quant_param.zeroPoint),
|
||||
static_cast<int8_t>(input1_quant_param.zeroPoint),
|
||||
static_cast<int8_t>(output_quant_param.zeroPoint), 0};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); // scale
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point); // zero_point
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // scale
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // zero_point
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticInt8OpenCLKernel::Prepare() {
|
||||
|
@ -191,7 +223,7 @@ int ArithmeticInt8OpenCLKernel::Prepare() {
|
|||
activation_max_ = 6.f;
|
||||
}
|
||||
|
||||
std::string program_name = "Arithmetic";
|
||||
const std::string program_name = "Arithmetic";
|
||||
std::string source = arithmetic_source;
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -207,7 +239,10 @@ int ArithmeticInt8OpenCLKernel::Prepare() {
|
|||
if (type() != PrimitiveType_BiasAdd) {
|
||||
InitWeights();
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -218,10 +253,22 @@ int ArithmeticInt8OpenCLKernel::Run() {
|
|||
auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1];
|
||||
int arg_idx = 0;
|
||||
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class ArithmeticInt8OpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -67,15 +67,31 @@ void LayerNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
|
|||
local->push_back(z);
|
||||
}
|
||||
|
||||
void LayerNormOpenCLKernel::SetConstArgs() {
|
||||
int LayerNormOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = 6;
|
||||
GpuTensorInfo img_info(in_tensors_.at(0));
|
||||
in_shape_.s[0] = img_info.N, in_shape_.s[1] = img_info.H, in_shape_.s[2] = img_info.W, in_shape_.s[3] = img_info.C;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_);
|
||||
ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void AlignMeanVarGlobalLocal(const std::vector<int> &global, const std::vector<int> &local, cl::NDRange *global_range,
|
||||
|
@ -106,9 +122,23 @@ int LayerNormOpenCLKernel::Initweight() {
|
|||
size_t weight_size = img_info.Image2DSize;
|
||||
// allocated memory for weight and init value
|
||||
gamma_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
if (gamma_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
beta_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
allocator->MapBuffer(gamma_, CL_MAP_WRITE, nullptr, true);
|
||||
allocator->MapBuffer(beta_, CL_MAP_WRITE, nullptr, true);
|
||||
if (beta_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(gamma_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(beta_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(gamma_, 0x01, weight_size);
|
||||
memset(beta_, 0x00, weight_size);
|
||||
|
||||
|
@ -143,8 +173,14 @@ int LayerNormOpenCLKernel::Initweight() {
|
|||
memcpy(beta_, in_tensors_.at(2)->data_c(), weight_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(gamma_);
|
||||
allocator->UnmapBuffer(beta_);
|
||||
if (allocator->UnmapBuffer(gamma_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->UnmapBuffer(beta_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -164,11 +200,19 @@ int LayerNormOpenCLKernel::Prepare() {
|
|||
size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float);
|
||||
mean_size *= size_dtype;
|
||||
mean_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
|
||||
if (mean_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
var_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
|
||||
std::string kernel_name = "LayerNormalization_NHWC4";
|
||||
if (var_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
const std::string kernel_name = "LayerNormalization_NHWC4";
|
||||
std::string kernel_name_mean_var = "ComputeMeanVar";
|
||||
std::string source = layer_norm_source;
|
||||
std::string program_name = "LayerNormalization";
|
||||
const std::string program_name = "LayerNormalization";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -182,7 +226,10 @@ int LayerNormOpenCLKernel::Prepare() {
|
|||
kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4";
|
||||
ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options_ext);
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
|
||||
return RET_OK;
|
||||
|
@ -191,21 +238,48 @@ int LayerNormOpenCLKernel::Prepare() {
|
|||
int LayerNormOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
int arg1_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, in_tensors_.at(0)->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, mean_, lite::opencl::MemType::BUF); // mean_
|
||||
ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, var_, lite::opencl::MemType::BUF); // var_ return RET_OK;
|
||||
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // input tensor
|
||||
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_mean_var_, arg1_cn++, var_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_mean_var_, global_mean_var_, local_mean_var_, nullptr, &event_);
|
||||
|
||||
int arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()); // out tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF); // mean_
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, var_, lite::opencl::MemType::BUF); // var_
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, gamma_, lite::opencl::MemType::BUF); // gamma_
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, beta_, lite::opencl::MemType::BUF); // beta_
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // input tensor
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // out tensor
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, mean_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // mean_
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, var_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // var_
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, gamma_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // gamma_
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, beta_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
} // beta_
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LayerNormFusion, OpenCLKernelCreator<LayerNormOpenCLKernel>)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LayerNormFusion, OpenCLKernelCreator<LayerNormOpenCLKernel>)
|
||||
|
|
|
@ -31,7 +31,7 @@ class LayerNormOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -84,7 +84,7 @@ int MatMulOpenCLKernel::Prepare() {
|
|||
std::map<int, std::string> dims2str = {{2, "_2d"}, {3, "_4d"}, {4, "_4d"}};
|
||||
kernel_name += dims2str[dims];
|
||||
std::string source = matmul_source;
|
||||
std::string program_name = "MatMul";
|
||||
const std::string program_name = "MatMul";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -95,13 +95,16 @@ int MatMulOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int co) {
|
||||
int MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int co) {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
int a = weight_shape_4d[0];
|
||||
int b = weight_shape_4d[1];
|
||||
|
@ -109,7 +112,15 @@ void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int
|
|||
int co4 = UP_DIV(co, C4NUM);
|
||||
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size, lite::opencl::MemType::BUF);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
|
||||
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
|
||||
memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
|
||||
|
@ -157,6 +168,7 @@ void MatMulOpenCLKernel::PadWeight(std::vector<int> weight_shape_4d, int ci, int
|
|||
}
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int MatMulOpenCLKernel::InitWeights() {
|
||||
|
@ -185,7 +197,10 @@ int MatMulOpenCLKernel::InitWeights() {
|
|||
|
||||
PadWeight(weight_shape_4d, ci, CO_);
|
||||
|
||||
allocator->UnmapBuffer(padWeight_);
|
||||
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_weight_);
|
||||
return InitBias();
|
||||
}
|
||||
|
@ -204,7 +219,15 @@ int MatMulOpenCLKernel::InitBias() {
|
|||
}
|
||||
lite::opencl::ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
|
||||
bias_ = allocator->Malloc(img_size);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(bias_, 0x00, co4 * C4NUM * dtype_size);
|
||||
if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
|
||||
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
|
||||
|
@ -220,7 +243,10 @@ int MatMulOpenCLKernel::InitBias() {
|
|||
memcpy(bias_, src_data, CO_ * dtype_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(bias_);
|
||||
if (allocator->UnmapBuffer(bias_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
FreeStoredData(stored_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -235,29 +261,54 @@ void MatMulOpenCLKernel::SetGlobalLocal() {
|
|||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void MatMulOpenCLKernel::SetConstArgs() {
|
||||
int MatMulOpenCLKernel::SetConstArgs() {
|
||||
int arg_count = 2;
|
||||
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
|
||||
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
|
||||
if (act_weight_) {
|
||||
arg_count++;
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int MatMulOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_count = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
|
||||
if (act_weight_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (act_weight_) {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override { return lite::RET_OK; }
|
||||
int InitBias();
|
||||
|
@ -54,7 +54,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
|
|||
std::vector<int> outShape{std::vector<int>(MAX_DIMS, 1)};
|
||||
|
||||
private:
|
||||
void PadWeight(std::vector<int> weight_shape_4d, int ci, int co);
|
||||
int PadWeight(std::vector<int> weight_shape_4d, int ci, int co);
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ int OneHotOpenCLKernel::Prepare() {
|
|||
kernel_name += "Axis" + std::to_string(axis_);
|
||||
}
|
||||
std::string source = one_hot_source;
|
||||
std::string program_name = "OneHot";
|
||||
const std::string program_name = "OneHot";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -65,7 +65,10 @@ int OneHotOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
InitWeights();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
|
@ -87,18 +90,40 @@ int OneHotOpenCLKernel::InitWeights() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void OneHotOpenCLKernel::SetConstArgs() {
|
||||
int OneHotOpenCLKernel::SetConstArgs() {
|
||||
cl_int2 cl_in_image2d_shape = {static_cast<cl_int>(in_shape_.width), static_cast<cl_int>(in_shape_.height)};
|
||||
cl_int4 cl_out_shape = {static_cast<cl_int>(out_shape_.N), static_cast<cl_int>(out_shape_.H),
|
||||
static_cast<cl_int>(out_shape_.W), static_cast<cl_int>(out_shape_.Slice)};
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_image2d_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, depth_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, on_value_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, off_value_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<int>(out_shape_.C));
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(param_->support_neg_index_));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_image2d_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, depth_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, on_value_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, off_value_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<int>(out_shape_.C)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(param_->support_neg_index_)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void OneHotOpenCLKernel::SetGlobalLocal() {
|
||||
local_size_ = {};
|
||||
|
@ -108,9 +133,18 @@ void OneHotOpenCLKernel::SetGlobalLocal() {
|
|||
|
||||
int OneHotOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class OneHotOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int InitWeights() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -81,11 +81,14 @@ int PadOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void PadOpenCLKernel::SetConstArgs() {
|
||||
int PadOpenCLKernel::SetConstArgs() {
|
||||
auto input = GpuTensorInfo(in_tensors_.front());
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
cl_int4 input_shape = {static_cast<cl_int>(input.N), static_cast<cl_int>(input.H), static_cast<cl_int>(input.W),
|
||||
|
@ -105,20 +108,45 @@ void PadOpenCLKernel::SetConstArgs() {
|
|||
Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);
|
||||
|
||||
int arg_cn = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
local_size_ = {8, 4, 1};
|
||||
global_size_ = {output.N * output.H, output.W, output.Slice};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int PadOpenCLKernel::Run() {
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ class PadOpenCLKernel : public OpenCLKernel {
|
|||
int CheckSpecs() override;
|
||||
|
||||
int Prepare() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
|
||||
int Run() override;
|
||||
|
||||
|
|
|
@ -73,7 +73,7 @@ int PoolingOpenCLKernel::Prepare() {
|
|||
kernel_name += "_NHWC4";
|
||||
kernel_name += "_IMG";
|
||||
std::string source = pooling2d_source;
|
||||
std::string program_name = "Pooling2d";
|
||||
const std::string program_name = "Pooling2d";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -84,7 +84,10 @@ int PoolingOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
|
||||
|
@ -100,7 +103,7 @@ void PoolingOpenCLKernel::SetGlobalLocal() {
|
|||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void PoolingOpenCLKernel::SetConstArgs() {
|
||||
int PoolingOpenCLKernel::SetConstArgs() {
|
||||
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
|
||||
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
|
||||
|
@ -109,19 +112,44 @@ void PoolingOpenCLKernel::SetConstArgs() {
|
|||
cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_};
|
||||
cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -63,15 +63,21 @@ void PowerGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
|
|||
local->push_back(z);
|
||||
}
|
||||
|
||||
void PowerOpenCLKernel::SetConstArgs() {
|
||||
int PowerOpenCLKernel::SetConstArgs() {
|
||||
float unalign_w = static_cast<float>(out_shape_.s[3]);
|
||||
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
|
||||
int arg_cn = 2;
|
||||
if (!broadcast_) {
|
||||
arg_cn++;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (use_fp16_enable_) {
|
||||
auto x = static_cast<float16_t>(power_);
|
||||
|
@ -80,11 +86,18 @@ void PowerOpenCLKernel::SetConstArgs() {
|
|||
auto w = static_cast<float16_t>(unalign_w);
|
||||
cl_half4 parameter = {*(reinterpret_cast<uint16_t *>(&x)), *(reinterpret_cast<uint16_t *>(&y)),
|
||||
*(reinterpret_cast<uint16_t *>(&z)), *(reinterpret_cast<uint16_t *>(&w))};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
cl_float4 parameter = {power_, shift_, scale_, unalign_w};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void PowerOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -111,7 +124,7 @@ int PowerOpenCLKernel::Prepare() {
|
|||
auto param = reinterpret_cast<PowerParameter *>(this->op_parameter_);
|
||||
std::string kernel_name = "power";
|
||||
std::string source = power_source;
|
||||
std::string program_name = "power";
|
||||
const std::string program_name = "power";
|
||||
if (broadcast_) {
|
||||
power_ = param->power_;
|
||||
kernel_name += "_broadcast";
|
||||
|
@ -130,7 +143,10 @@ int PowerOpenCLKernel::Prepare() {
|
|||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -138,13 +154,28 @@ int PowerOpenCLKernel::Run() {
|
|||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
int arg_cn = 0;
|
||||
if (broadcast_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ class PowerOpenCLKernel : public OpenCLKernel {
|
|||
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
|
||||
|
|
|
@ -46,7 +46,14 @@ int PReluOpenCLKernel::InitWeights() {
|
|||
auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
|
||||
size_t weight_size = UP_ROUND(C_, C4NUM) * sizeof_FLT;
|
||||
weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
|
||||
if (weight_vector_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(weight_vector_, 0x00, weight_size);
|
||||
if (weight_tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (enable_fp16_) {
|
||||
|
@ -69,7 +76,10 @@ int PReluOpenCLKernel::InitWeights() {
|
|||
memcpy(weight_vector_, weight_tensor->data_c(), C_ * sizeof_FLT);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(weight_vector_);
|
||||
if (allocator->UnmapBuffer(weight_vector_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -95,11 +105,18 @@ int PReluOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void PReluOpenCLKernel::SetConstArgs() {
|
||||
int PReluOpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 3;
|
||||
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, 2);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, 2) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void PReluOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -126,8 +143,8 @@ int PReluOpenCLKernel::Prepare() {
|
|||
weight_is_scalar = param->channelShared;
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
std::string source = prelu_source;
|
||||
std::string program_name = "PRelu";
|
||||
std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
|
||||
const std::string program_name = "PRelu";
|
||||
const std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -141,7 +158,10 @@ int PReluOpenCLKernel::Prepare() {
|
|||
InitWeights();
|
||||
MS_LOG(DEBUG) << program_name << " init Done!";
|
||||
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name << " init Done!";
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -149,12 +169,24 @@ int PReluOpenCLKernel::Prepare() {
|
|||
int PReluOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (weight_is_scalar) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_scalar_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_scalar_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_vector_, lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight_vector_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ret != mindspore::lite::RET_OK) {
|
||||
|
|
|
@ -31,7 +31,7 @@ class PReluOpenCLKernel : public OpenCLKernel {
|
|||
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
int InitWeights() override;
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <set>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/reduce.h"
|
||||
|
@ -179,7 +180,7 @@ int ReduceOpenCLKernel::Prepare() {
|
|||
}
|
||||
kernel_name += GetReduceTypeStr(reduce_param->mode_);
|
||||
std::string source = reduce_source;
|
||||
std::string program_name = "Reduce";
|
||||
const std::string program_name = "Reduce";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -190,22 +191,32 @@ int ReduceOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
void ReduceOpenCLKernel::SetConstArgs() {
|
||||
int ReduceOpenCLKernel::SetConstArgs() {
|
||||
int h = inShape.H;
|
||||
int w = inShape.W;
|
||||
int c = inShape.C;
|
||||
int c4 = UP_DIV(c, C4NUM);
|
||||
cl_int4 size = {h, w, c4, c};
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
|
||||
if (wc_reduce_ || c_reduce_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (wc_reduce_ || c_reduce_) {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void ReduceOpenCLKernel::SetGlobalLocal() {
|
||||
int h = inShape.H;
|
||||
|
@ -235,9 +246,18 @@ int ReduceOpenCLKernel::Tune() {
|
|||
int ReduceOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override;
|
||||
|
||||
|
|
|
@ -53,15 +53,22 @@ int ReshapeOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ReshapeOpenCLKernel::SetConstArgs() {
|
||||
int ReshapeOpenCLKernel::SetConstArgs() {
|
||||
auto in = GpuTensorInfo(in_tensors_.front());
|
||||
auto out = GpuTensorInfo(out_tensors_.front());
|
||||
cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
|
||||
cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};
|
||||
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ReshapeOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -72,9 +79,9 @@ void ReshapeOpenCLKernel::SetGlobalLocal() {
|
|||
}
|
||||
|
||||
int ReshapeOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "reshape_NHWC4";
|
||||
const std::string kernel_name = "reshape_NHWC4";
|
||||
std::string source = reshape_source;
|
||||
std::string program_name = "reshape";
|
||||
const std::string program_name = "reshape";
|
||||
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -87,16 +94,28 @@ int ReshapeOpenCLKernel::Prepare() {
|
|||
}
|
||||
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ReshapeOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -104,7 +123,10 @@ int ReshapeOpenCLKernel::PreProcess() {
|
|||
if (type() == PrimitiveType_Reshape && !InferShapeDone()) {
|
||||
auto shape_tensor = in_tensors_[1];
|
||||
if (!shape_tensor->IsConst()) {
|
||||
ocl_runtime_->SyncCommandQueue();
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
shape_tensor->MutableData();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,7 +30,7 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int PreProcess() override;
|
||||
};
|
||||
|
|
|
@ -64,7 +64,7 @@ int ResizeOpenCLKernel::Prepare() {
|
|||
}
|
||||
kernel_name += "_NHWC4";
|
||||
std::string source = resize_source;
|
||||
std::string program_name = "Resize";
|
||||
const std::string program_name = "Resize";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -75,7 +75,10 @@ int ResizeOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
|
@ -87,7 +90,7 @@ float ResizeOpenCLKernel::getResizeScaleFactor(int input_size, int output_size)
|
|||
: static_cast<float>(input_size) / static_cast<float>(output_size);
|
||||
}
|
||||
|
||||
void ResizeOpenCLKernel::SetConstArgs() {
|
||||
int ResizeOpenCLKernel::SetConstArgs() {
|
||||
auto in_shape = in_tensors_[0]->shape();
|
||||
auto out_shape = out_tensors_[0]->shape();
|
||||
int n = out_shape[0];
|
||||
|
@ -101,9 +104,19 @@ void ResizeOpenCLKernel::SetConstArgs() {
|
|||
cl_int4 out_size = {n, h, w, c4};
|
||||
cl_float2 scale = {scale_h, scale_w};
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ResizeOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -116,9 +129,18 @@ void ResizeOpenCLKernel::SetGlobalLocal() {
|
|||
int ResizeOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -126,7 +148,10 @@ int ResizeOpenCLKernel::PreProcess() {
|
|||
if (type() == PrimitiveType_Resize && !InferShapeDone() && in_tensors_.size() == INPUT_TENSOR_SIZE_2) {
|
||||
auto shape_tensor = in_tensors_[1];
|
||||
if (!shape_tensor->IsConst()) {
|
||||
ocl_runtime_->SyncCommandQueue();
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
shape_tensor->MutableData();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int PreProcess() override;
|
||||
|
||||
|
|
|
@ -98,14 +98,30 @@ int ScaleOpenCLKernel::InitWeights() {
|
|||
img_size.height = 1;
|
||||
img_size.width = UP_DIV(scale_tensor->shape()[0], C4NUM);
|
||||
scale_ptr_ = allocator->Malloc(img_size, scale_tensor->data_c());
|
||||
if (scale_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
offset_ptr_ = allocator->Malloc(img_size, offset_tensor->data_c());
|
||||
if (offset_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
if (in_tensor->format() == scale_tensor->format()) {
|
||||
if (in_tensor->data_type() == scale_tensor->data_type()) {
|
||||
scale_ptr_ = allocator->Malloc(img_size, scale_tensor->data_c());
|
||||
if (scale_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
offset_ptr_ = allocator->Malloc(img_size, offset_tensor->data_c());
|
||||
if (offset_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported data type transpose from " << scale_tensor->data_type() << "to "
|
||||
<< in_tensor->data_type();
|
||||
|
@ -121,7 +137,15 @@ int ScaleOpenCLKernel::InitWeights() {
|
|||
PackNHWCToNHWC4(scale_tensor->data_c(), scale.data(), src_is_fp16, fp16_enable, image2d_info);
|
||||
PackNHWCToNHWC4(offset_tensor->data_c(), offset.data(), src_is_fp16, fp16_enable, image2d_info);
|
||||
scale_ptr_ = allocator->Malloc(img_size, scale.data());
|
||||
if (scale_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
offset_ptr_ = allocator->Malloc(img_size, offset.data());
|
||||
if (offset_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported data type transpose from " << scale_tensor->data_type() << "to "
|
||||
<< in_tensor->data_type();
|
||||
|
@ -175,7 +199,7 @@ int ScaleOpenCLKernel::Prepare() {
|
|||
} else {
|
||||
kernel_name += "_BUF";
|
||||
}
|
||||
std::string program_name = "Scale";
|
||||
const std::string program_name = "Scale";
|
||||
std::string source = GetActDefines() + scale_source;
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -193,44 +217,86 @@ int ScaleOpenCLKernel::Prepare() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
|
||||
int ScaleOpenCLKernel::SetKernelArg(int *idx) {
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (weight_vector_flag_) {
|
||||
void *scale = scale_ptr_ == nullptr ? in_tensors_[1]->data_c() : scale_ptr_;
|
||||
void *offset = offset_ptr_ == nullptr ? in_tensors_[2]->data_c() : offset_ptr_;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
|
||||
float scale = static_cast<float *>(in_tensors_[1]->data_c())[0];
|
||||
float offset = static_cast<float *>(in_tensors_[2]->data_c())[0];
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
|
||||
float16_t scale = static_cast<float16_t *>(in_tensors_[1]->data_c())[0];
|
||||
float16_t offset = static_cast<float16_t *>(in_tensors_[2]->data_c())[0];
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(scale));
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(offset));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(scale)) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(offset)) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[1]->data_type();
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape) != CL_SUCCESS) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
*idx = arg_idx;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
|
||||
int arg_idx = 0;
|
||||
|
||||
if (SetKernelArg(&arg_idx) != RET_OK) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
if (weight_vector_flag_ && broadcast_flag_) {
|
||||
if (broadcast_H_flag_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[1]->shape()[0]);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[1]->shape()[0]) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
|
|||
|
||||
private:
|
||||
void Image2dGetWorkGroupSize();
|
||||
|
||||
int SetKernelArg(int *idx);
|
||||
bool weight_vector_flag_{true};
|
||||
bool broadcast_flag_{false};
|
||||
bool broadcast_H_flag_{false};
|
||||
|
|
|
@ -75,7 +75,7 @@ int SoftmaxOpenCLKernel::Prepare() {
|
|||
kernel_name += "Axis" + std::to_string(axis_);
|
||||
}
|
||||
kernel_name += "_NHWC4";
|
||||
std::string program_name = "Softmax";
|
||||
const std::string program_name = "Softmax";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -93,7 +93,10 @@ int SoftmaxOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return lite::RET_OK;
|
||||
|
@ -131,24 +134,40 @@ int SoftmaxOpenCLKernel::Tune() {
|
|||
return OpenCLKernel::Tune();
|
||||
}
|
||||
|
||||
void SoftmaxOpenCLKernel::SetConstArgs() {
|
||||
int SoftmaxOpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 2;
|
||||
int channel = out_shape_.C;
|
||||
int c4 = out_shape_.Slice;
|
||||
auto mask_ = GetMaskForLastChannel(channel);
|
||||
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
cl_int4 input_shape = {static_cast<int>(out_shape_.N), static_cast<int>(out_shape_.H), static_cast<int>(out_shape_.W),
|
||||
c4};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int SoftmaxOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override;
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
|
||||
int SpaceToBatchNDOpenCLKernel::SetConstArgs() {
|
||||
auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);
|
||||
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
||||
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
|
||||
|
@ -71,10 +71,23 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
|
|||
cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};
|
||||
|
||||
int arg_cnt = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -87,9 +100,9 @@ void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
|
|||
}
|
||||
|
||||
int SpaceToBatchNDOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "space_to_batch_nd_NHWC4";
|
||||
const std::string kernel_name = "space_to_batch_nd_NHWC4";
|
||||
std::string source = space_to_batch_nd_source;
|
||||
std::string program_name = "space_to_batch_nd";
|
||||
const std::string program_name = "space_to_batch_nd";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -101,7 +114,10 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -109,9 +125,18 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
|
|||
int SpaceToBatchNDOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -51,7 +51,7 @@ int SpaceToDepthOpenCLKernel::Prepare() {
|
|||
kernel_name += "Align";
|
||||
}
|
||||
std::string source = space_to_depth_source;
|
||||
std::string program_name = "SpaceToDepth";
|
||||
const std::string program_name = "SpaceToDepth";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -63,28 +63,47 @@ int SpaceToDepthOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
void SpaceToDepthOpenCLKernel::SetConstArgs() {
|
||||
int SpaceToDepthOpenCLKernel::SetConstArgs() {
|
||||
cl_int4 cl_in_shape = {static_cast<cl_int>(in_shape_.N), static_cast<cl_int>(in_shape_.H),
|
||||
static_cast<cl_int>(in_shape_.W), static_cast<cl_int>(in_shape_.Slice)};
|
||||
cl_int4 cl_out_shape = {static_cast<cl_int>(out_shape_.N), static_cast<cl_int>(out_shape_.H),
|
||||
static_cast<cl_int>(out_shape_.W), static_cast<cl_int>(out_shape_.Slice)};
|
||||
auto param = reinterpret_cast<SpaceToDepthParameter *>(op_parameter_);
|
||||
int arg_idx = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->block_size_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_in_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, cl_out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->block_size_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (type() == PrimitiveType_DepthToSpace) {
|
||||
int co_size = out_shape_.C;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, co_size);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, co_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
int ci_size = in_shape_.C;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void SpaceToDepthOpenCLKernel::SetGlobalLocal() {
|
||||
local_size_ = {};
|
||||
|
@ -95,9 +114,18 @@ void SpaceToDepthOpenCLKernel::SetGlobalLocal() {
|
|||
int SpaceToDepthOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -37,7 +37,10 @@ int SparseToDenseOpenCLKernel::InitOutputToDefault() {
|
|||
cl_float4 fill_value = {};
|
||||
fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
|
||||
auto src_data = out_tensors_[0]->data_c();
|
||||
allocator_->GetImageSize(src_data, &img_size);
|
||||
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
|
||||
MS_LOG(ERROR) << "GetImageSize failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
|
||||
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
|
||||
|
@ -62,7 +65,14 @@ int SparseToDenseOpenCLKernel::InitWeights() {
|
|||
auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
|
||||
size_t weight_size = UP_ROUND(size, C4NUM) * sizeof_FLT;
|
||||
weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
|
||||
allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
|
||||
if (weight_vector_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(weight_vector_, 0x00, weight_size);
|
||||
if (weight_tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (enable_fp16_) {
|
||||
|
@ -85,7 +95,10 @@ int SparseToDenseOpenCLKernel::InitWeights() {
|
|||
memcpy(weight_vector_, weight_tensor->data_c(), size * sizeof_FLT);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(weight_vector_);
|
||||
if (allocator->UnmapBuffer(weight_vector_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -115,7 +128,7 @@ int SparseToDenseOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void SparseToDenseOpenCLKernel::SetConstArgs() {
|
||||
int SparseToDenseOpenCLKernel::SetConstArgs() {
|
||||
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
|
||||
GpuTensorInfo img_info(out_tensors_[0]);
|
||||
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
|
||||
|
@ -124,11 +137,27 @@ void SparseToDenseOpenCLKernel::SetConstArgs() {
|
|||
auto out_shape_temp = out_tensors_[0]->shape();
|
||||
cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)};
|
||||
int arg_cn = 3;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SparseToDenseOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -144,9 +173,9 @@ int SparseToDenseOpenCLKernel::Prepare() {
|
|||
input_dim_ = in_tensors_[0]->shape().size();
|
||||
inshapeindex1_dim = in_tensors_[0]->shape()[1];
|
||||
weight_scalar_ = in_tensors_[2]->IsScalar();
|
||||
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
|
||||
const std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
|
||||
std::string source = sparse_to_dense_source;
|
||||
std::string program_name = "SparseToDense";
|
||||
const std::string program_name = "SparseToDense";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -174,7 +203,10 @@ int SparseToDenseOpenCLKernel::Prepare() {
|
|||
InitWeights();
|
||||
InferShapeTo4D();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -212,14 +244,30 @@ int SparseToDenseOpenCLKernel::Run() {
|
|||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
InitOutputToDefault();
|
||||
int arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
if (!weight_scalar_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF);
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (!weight_scalar_) {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
int Run() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int CheckSpecs() override;
|
||||
|
||||
|
|
|
@ -41,7 +41,10 @@ int SplitOpenCLKernel::RunAxis0() {
|
|||
for (int i = 0; i < out_tensors_.size(); i++) {
|
||||
auto dst_data = out_tensors_[i]->data_c();
|
||||
ImageSize img_size;
|
||||
allocator_->GetImageSize(dst_data, &img_size);
|
||||
if (allocator_->GetImageSize(dst_data, &img_size) != RET_OK) {
|
||||
MS_LOG(ERROR) << "GetImageSize failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto dst_area = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
|
||||
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
|
||||
|
@ -93,23 +96,32 @@ int SplitOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) {
|
||||
int SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
int shape_dim = in_shape.at(param->split_dim_);
|
||||
if (num_split_ == 1) {
|
||||
size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]);
|
||||
split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int), lite::opencl::MemType::BUF));
|
||||
if (split_sizes_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
for (int i = 0; i < num_split - 1; ++i) {
|
||||
split_sizes_[i] = (i + 1) * param->split_sizes_[0];
|
||||
}
|
||||
} else {
|
||||
int sum = 0;
|
||||
split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int), lite::opencl::MemType::BUF));
|
||||
if (split_sizes_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
for (int i = 0; i < num_split_ - 1; ++i) {
|
||||
sum += param->split_sizes_[i];
|
||||
split_sizes_[i] = sum;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int SplitOpenCLKernel::Prepare() {
|
||||
|
@ -129,7 +141,10 @@ int SplitOpenCLKernel::Prepare() {
|
|||
}
|
||||
}
|
||||
}
|
||||
AlignSplitSizes(param, in_shape);
|
||||
if (AlignSplitSizes(param, in_shape) != RET_OK) {
|
||||
MS_LOG(ERROR) << "AlignSplitSizes failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::string kernel_name = "split_out";
|
||||
kernel_name += std::to_string(num_split_);
|
||||
kernel_name += "_axis" + std::to_string(split_dim_);
|
||||
|
@ -138,7 +153,7 @@ int SplitOpenCLKernel::Prepare() {
|
|||
}
|
||||
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
|
||||
std::string source = split_source;
|
||||
std::string program_name = "split";
|
||||
const std::string program_name = "split";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -151,12 +166,15 @@ int SplitOpenCLKernel::Prepare() {
|
|||
return ret;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SplitOpenCLKernel::SetConstArgs() {
|
||||
int SplitOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = out_tensors_.size() + 2;
|
||||
cl_int4 shape = {};
|
||||
for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
|
||||
|
@ -166,7 +184,10 @@ void SplitOpenCLKernel::SetConstArgs() {
|
|||
if (Align_) {
|
||||
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
for (int i = 0; i < out_tensors_.size(); ++i) {
|
||||
cl_int4 temp = {};
|
||||
|
@ -177,13 +198,21 @@ void SplitOpenCLKernel::SetConstArgs() {
|
|||
if (Align_) {
|
||||
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
GpuTensorInfo img_info(in_tensors_.at(0));
|
||||
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
|
||||
stride_w = img_info.RowPitch() / dtype;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
|
||||
return;
|
||||
if (!Align_) {
|
||||
GpuTensorInfo img_info(in_tensors_.at(0));
|
||||
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
|
||||
stride_w = img_info.RowPitch() / dtype;
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SplitOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -205,15 +234,31 @@ int SplitOpenCLKernel::Run() {
|
|||
}
|
||||
int arg_cn = 0;
|
||||
if (Align_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < out_tensors_.size(); ++i) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,12 +31,12 @@ class SplitOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
|
||||
private:
|
||||
void AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
|
||||
int AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
|
||||
int RunAxis0();
|
||||
|
||||
private:
|
||||
|
|
|
@ -36,7 +36,10 @@ int StackOpenCLKernel::RunAxis0() {
|
|||
cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
|
||||
for (int i = 0; i < in_tensors_.size(); i++) {
|
||||
auto src_data = in_tensors_[i]->data_c();
|
||||
allocator_->GetImageSize(src_data, &img_size);
|
||||
if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
|
||||
MS_LOG(ERROR) << "GetImageSize failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
|
||||
cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
|
||||
|
@ -95,7 +98,7 @@ int StackOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void StackOpenCLKernel::SetConstArgs() {
|
||||
int StackOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = in_tensors_.size() + 1;
|
||||
cl_int4 inshape_tmp = {}, outshape_tmp = {};
|
||||
for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
|
||||
|
@ -108,8 +111,14 @@ void StackOpenCLKernel::SetConstArgs() {
|
|||
Broadcast2GpuShape(out_shape_.s, outshape_tmp.s, out_tensors_[0]->shape().size(), 1);
|
||||
in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
|
||||
out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (buffer_button_) {
|
||||
GpuTensorInfo img_info_out(out_tensors_[0]);
|
||||
GpuTensorInfo img_info_in(in_tensors_[0]);
|
||||
|
@ -117,8 +126,12 @@ void StackOpenCLKernel::SetConstArgs() {
|
|||
stride_w_out = img_info_out.RowPitch() / dtype;
|
||||
stride_w_in = img_info_in.RowPitch() / dtype;
|
||||
cl_int2 stride_w = {stride_w_out, stride_w_in};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StackOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -162,7 +175,7 @@ int StackOpenCLKernel::Prepare() {
|
|||
|
||||
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
|
||||
std::string source = stack_source;
|
||||
std::string program_name = "stack";
|
||||
const std::string program_name = "stack";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -174,7 +187,10 @@ int StackOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
|
||||
return RET_OK;
|
||||
|
@ -188,16 +204,33 @@ int StackOpenCLKernel::Run() {
|
|||
int arg_cn = 0;
|
||||
if (buffer_button_) {
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF) !=
|
||||
CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
} else {
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLKernelCreator<StackOpenCLKernel>);
|
||||
|
|
|
@ -29,7 +29,7 @@ class StackOpenCLKernel : public OpenCLKernel {
|
|||
~StackOpenCLKernel() override{};
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
int Run() override;
|
||||
|
|
|
@ -27,9 +27,9 @@ using mindspore::lite::opencl::ImageSize;
|
|||
|
||||
namespace mindspore::kernel {
|
||||
int StrassenOpenCLKernel::Prepare() {
|
||||
std::string kernel_name = "MatMul_Strassen_NHWC4_2d";
|
||||
const std::string kernel_name = "MatMul_Strassen_NHWC4_2d";
|
||||
std::string source = strassen_source;
|
||||
std::string program_name = "MatMul";
|
||||
const std::string program_name = "MatMul";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -50,13 +50,16 @@ int StrassenOpenCLKernel::Prepare() {
|
|||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
|
||||
int StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
ImageSize img_size{static_cast<size_t>(UP_DIV(NumA, C4NUM)), static_cast<size_t>(NumA), img_dtype};
|
||||
|
@ -64,15 +67,52 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
|
|||
size_t memB = NumB * NumB * dtype_size;
|
||||
for (int depth = 0; depth < MAXDEPTH; depth++) {
|
||||
B_temp[depth] = allocator->Malloc(memB, lite::opencl::MemType::BUF);
|
||||
if (B_temp[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
A_temp[depth] = allocator->Malloc(img_size);
|
||||
if (A_temp[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M1[depth] = allocator->Malloc(img_size);
|
||||
if (M1[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M2[depth] = allocator->Malloc(img_size);
|
||||
if (M2[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M3[depth] = allocator->Malloc(img_size);
|
||||
if (M3[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M4[depth] = allocator->Malloc(img_size);
|
||||
if (M4[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M5[depth] = allocator->Malloc(img_size);
|
||||
if (M5[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M6[depth] = allocator->Malloc(img_size);
|
||||
if (M6[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
M7[depth] = allocator->Malloc(img_size);
|
||||
if (M7[depth] == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int StrassenOpenCLKernel::InitWeights() {
|
||||
|
@ -82,14 +122,25 @@ int StrassenOpenCLKernel::InitWeights() {
|
|||
int NumB = in_tensors_[1]->shape()[0];
|
||||
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
padWeight_ = allocator->Malloc(NumA * NumB * dtype_size, lite::opencl::MemType::BUF);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||
if (padWeight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
|
||||
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
|
||||
memset(padWeight_, 0x00, NumA * NumB * dtype_size);
|
||||
auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
|
||||
auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
|
||||
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
|
||||
AllocatorMemoryForStrassen(NumA / 2, NumB / 2);
|
||||
if (AllocatorMemoryForStrassen(NumA / 2, NumB / 2) != RET_OK) {
|
||||
MS_LOG(ERROR) << "AllocatorMemoryForStrassen failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
size_t size = NumA * NumB * dtype_size;
|
||||
if (isModelFp16) {
|
||||
if (enable_fp16_) {
|
||||
|
@ -108,7 +159,10 @@ int StrassenOpenCLKernel::InitWeights() {
|
|||
memcpy(padWeightFp32, originWeightFp32, size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(padWeight_);
|
||||
if (allocator->UnmapBuffer(padWeight_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -120,7 +174,7 @@ void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vect
|
|||
}
|
||||
|
||||
// 0 : global_size_, 1: global_size_add_sub
|
||||
void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
|
||||
int StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
|
||||
size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM);
|
||||
local_size_add_sub = {16, 1, 16};
|
||||
if (type_flag == 0) {
|
||||
|
@ -130,6 +184,7 @@ void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type
|
|||
global_size_add_sub = {strassen_size_C4, 1, strassen_size};
|
||||
AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -142,111 +197,188 @@ void StrassenOpenCLKernel::SetGlobalLocal() {
|
|||
StrassenSetGlobalLocal(strassen_size, 2); // set global_size_weights
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
|
||||
bool is_matmul_kernel) {
|
||||
int StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
|
||||
bool is_matmul_kernel) {
|
||||
cl_int4 shape;
|
||||
if (is_matmul_kernel) {
|
||||
shape = {1, 1, strassen_size, strassen_size};
|
||||
} else {
|
||||
shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)};
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(*kernel, index, shape);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, index, shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::SetConstArgs() {
|
||||
int arg_count = 2;
|
||||
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
|
||||
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
|
||||
cl_int4 shape_offset = {0, 0, 0, 0};
|
||||
int StrassenOpenCLKernel::SetConstArgs() {
|
||||
int strassen_size = inShape[3] / 2;
|
||||
out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2;
|
||||
out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2;
|
||||
StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false);
|
||||
StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
|
||||
cl_int2 offset, lite::opencl::MemType mem_type) {
|
||||
int StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
|
||||
cl_int2 offset, lite::opencl::MemType mem_type) {
|
||||
if (input == nullptr || output == nullptr) {
|
||||
MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr";
|
||||
return;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (mem_type == lite::opencl::MemType::IMG) {
|
||||
ocl_runtime_->SetKernelArg(*kernel, 0, input);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 1, output);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 0, input) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 1, output) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
StrassenSetConstArgs(kernel, 2, size, false);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 3, offset);
|
||||
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 3, offset) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
|
||||
int flag, lite::opencl::MemType mem_type) {
|
||||
int StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
|
||||
int flag, lite::opencl::MemType mem_type) {
|
||||
if (input == nullptr || output == nullptr) {
|
||||
MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr";
|
||||
return;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (mem_type == lite::opencl::MemType::IMG) {
|
||||
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
StrassenSetConstArgs(kernel, 2, size, false);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 3, offset);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 4, flag);
|
||||
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 3, offset) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 4, flag) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3,
|
||||
void *input4, void *input5, void *input6, void *input7, void *output,
|
||||
const int size) {
|
||||
int StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4,
|
||||
void *input5, void *input6, void *input7, void *output, const int size) {
|
||||
if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr ||
|
||||
input6 == nullptr || input7 == nullptr || output == nullptr) {
|
||||
MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr";
|
||||
return;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 0, input1) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 1, input2) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 2, input3) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 3, input4) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 4, input5) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 5, input6) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 6, input7) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(*kernel, 7, output) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(*kernel, 0, input1);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 1, input2);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 2, input3);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 3, input4);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 4, input5);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 5, input6);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 6, input7);
|
||||
ocl_runtime_->SetKernelArg(*kernel, 7, output);
|
||||
StrassenSetConstArgs(kernel, 8, size, false);
|
||||
ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
|
||||
if (ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
|
||||
int StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
|
||||
if (input == nullptr || weight == nullptr || output == nullptr) {
|
||||
MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr";
|
||||
return;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, input) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, output) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, input);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, output);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF);
|
||||
StrassenSetConstArgs(&kernel_, 3, size, true);
|
||||
StrassenSetConstArgs(&kernel_, 4, size, true);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
|
||||
const int threshold) {
|
||||
int StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
|
||||
const int threshold) {
|
||||
const int size_2 = size / 2;
|
||||
int C4 = UP_DIV(size_2, C4NUM);
|
||||
if (size <= threshold) {
|
||||
// run matmul;
|
||||
StrassenSetGlobalLocal(size, 0);
|
||||
StrassenRunMmatmul(data, weight, result, size);
|
||||
return;
|
||||
return RET_OK;
|
||||
}
|
||||
// flag = 0 : add otherwise flag = 1 : sub
|
||||
// M1 = A11 * ( B12- B22)
|
||||
|
@ -307,6 +439,7 @@ void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, co
|
|||
StrassenSetGlobalLocal(size_2, 1);
|
||||
StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1],
|
||||
M6[depth + 1], M7[depth + 1], result, size_2);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int StrassenOpenCLKernel::Run() {
|
||||
|
|
|
@ -33,22 +33,22 @@ class StrassenOpenCLKernel : public MatMulOpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
// strassen
|
||||
private:
|
||||
void AllocatorMemoryForStrassen(int NumA, int NumB);
|
||||
void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
|
||||
void StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
|
||||
void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
|
||||
void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
|
||||
lite::opencl::MemType mem_type);
|
||||
void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
|
||||
lite::opencl::MemType mem_type);
|
||||
void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
|
||||
void *input6, void *input7, void *output, const int size);
|
||||
void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
|
||||
int AllocatorMemoryForStrassen(int NumA, int NumB);
|
||||
int DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
|
||||
int StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
|
||||
int StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
|
||||
int StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
|
||||
lite::opencl::MemType mem_type);
|
||||
int StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
|
||||
lite::opencl::MemType mem_type);
|
||||
int StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
|
||||
void *input6, void *input7, void *output, const int size);
|
||||
int StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
|
||||
cl::Kernel kernel_IMG_add_sub_2;
|
||||
cl::Kernel MatMul_StrassenBUFFilled;
|
||||
cl::Kernel MatMul_StrassenIMGFilled;
|
||||
|
|
|
@ -85,7 +85,7 @@ int StridedSliceOpenCLKernel::CheckSpecs() {
|
|||
}
|
||||
|
||||
int StridedSliceOpenCLKernel::Prepare() {
|
||||
std::string program_name = "strided_slice";
|
||||
const std::string program_name = "strided_slice";
|
||||
if (!ocl_runtime_->LoadSource(program_name, strided_slice_source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -96,7 +96,10 @@ int StridedSliceOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -187,14 +190,33 @@ int StridedSliceOpenCLKernel::InitConstArgs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void StridedSliceOpenCLKernel::SetConstArgs() {
|
||||
int StridedSliceOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StridedSliceOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -214,9 +236,18 @@ void StridedSliceOpenCLKernel::SetGlobalLocal() {
|
|||
|
||||
int StridedSliceOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class StridedSliceOpenCLKernel : public OpenCLKernel {
|
|||
int CheckSpecs() override;
|
||||
|
||||
int Prepare() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
int Run() override;
|
||||
|
|
|
@ -42,11 +42,18 @@ int ToFormatOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void ToFormatOpenCLKernel::SetConstArgs() {
|
||||
int ToFormatOpenCLKernel::SetConstArgs() {
|
||||
cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
|
||||
cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1};
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 3, shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 2, gsize) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 3, shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ToFormatOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -70,7 +77,7 @@ int ToFormatOpenCLKernel::Prepare() {
|
|||
kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
|
||||
this->set_name(kernel_name);
|
||||
|
||||
std::string program_name = "to_format";
|
||||
const std::string program_name = "to_format";
|
||||
std::string source = to_format_source;
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
|
@ -89,7 +96,10 @@ int ToFormatOpenCLKernel::Prepare() {
|
|||
C_ = output.C;
|
||||
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -98,9 +108,18 @@ int ToFormatOpenCLKernel::Run() {
|
|||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
|
||||
auto dst_mem_type = out_mem_type_;
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
|
|||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int InferShape() override;
|
||||
|
||||
|
|
|
@ -101,7 +101,7 @@ int TransposeOpenCLKernel::Prepare() {
|
|||
kernel_name += "_NHWC4";
|
||||
|
||||
std::string source = transpose_source;
|
||||
std::string program_name = "transpose";
|
||||
const std::string program_name = "transpose";
|
||||
if (!ocl_runtime_->LoadSource(program_name, source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -113,32 +113,45 @@ int TransposeOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
SetConstArgs();
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void TransposeOpenCLKernel::SetConstArgs() {
|
||||
int TransposeOpenCLKernel::SetConstArgs() {
|
||||
size_t n = tensor_size_.N;
|
||||
size_t h = tensor_size_.H;
|
||||
size_t w = tensor_size_.W;
|
||||
size_t c = tensor_size_.C;
|
||||
int arg_idx = 2;
|
||||
cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (type_ == TransposeType::GENERAL) {
|
||||
int de_perm[4]; // output to input perm
|
||||
for (int i = 0; i < 4; i++) {
|
||||
de_perm[perm_4d_[i]] = i;
|
||||
}
|
||||
cl_int4 de_perm_cl = {de_perm[0], de_perm[1], de_perm[2], de_perm[3]};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
GpuTensorInfo in_shape = GpuTensorInfo(in_tensors_[0]);
|
||||
cl_int4 in_shape_int4 = {static_cast<cl_int>(in_shape.N), static_cast<cl_int>(in_shape.H),
|
||||
static_cast<cl_int>(in_shape.W), static_cast<cl_int>(in_shape.C)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_shape_int4);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_shape_int4) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void TransposeOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -161,9 +174,18 @@ void TransposeOpenCLKernel::SetGlobalLocal() {
|
|||
int TransposeOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
int Prepare() override;
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -78,7 +78,7 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
|
|||
} // namespace
|
||||
|
||||
int WinogradOpenCLKernel::BuildKernel() {
|
||||
std::string program_name = "winograd";
|
||||
const std::string program_name = "winograd";
|
||||
if (!ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source)) {
|
||||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -103,7 +103,7 @@ int WinogradOpenCLKernel::BuildKernel() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::InitFilter() {
|
||||
int WinogradOpenCLKernel::InitFilter() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// allocate opencl memory: buffer or image2d
|
||||
|
@ -115,9 +115,17 @@ void WinogradOpenCLKernel::InitFilter() {
|
|||
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
size = width * height * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc({width, height, dtype});
|
||||
if (packed_filter_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
size = UP_DIV(CO_SLICES_, Ogroup) * 6 * 6 * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size, MemType::BUF);
|
||||
if (packed_filter_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
// rearrange filter
|
||||
|
@ -128,6 +136,10 @@ void WinogradOpenCLKernel::InitFilter() {
|
|||
void *src_data = winograd_filter.data();
|
||||
#else
|
||||
auto winograd_filter = std::make_unique<float[]>(CO_ * 6 * 6 * CI_);
|
||||
if (winograd_filter == nullptr) {
|
||||
MS_LOG(ERROR) << "new winograd_filter failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
|
||||
reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
|
||||
|
||||
|
@ -147,53 +159,121 @@ void WinogradOpenCLKernel::InitFilter() {
|
|||
if (filter_type_ == MemType::IMG) {
|
||||
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
|
||||
} else {
|
||||
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
|
||||
if (allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(packed_filter_, tmp.data(), size);
|
||||
allocator->UnmapBuffer(packed_filter_);
|
||||
if (allocator->UnmapBuffer(packed_filter_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
FreeStoredData(stored_filter_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::AllocateMemory() {
|
||||
int WinogradOpenCLKernel::AllocateMemory() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
|
||||
size_t width = TILE_HW_;
|
||||
size_t height = CI_SLICES_ * 36;
|
||||
winograd_mem0_ = allocator->Malloc({width, height, img_dtype});
|
||||
if (winograd_mem0_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
width = TILE_HW_;
|
||||
height = CO_SLICES_ * 36;
|
||||
winograd_mem1_ = allocator->Malloc({width, height, img_dtype});
|
||||
if (winograd_mem1_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::SetConstArgs() {
|
||||
int WinogradOpenCLKernel::SetConstArgs() {
|
||||
AllocateMemory();
|
||||
|
||||
int arg_cn = 1;
|
||||
cl_int4 input_shape = {batch_size_, OH_, OW_, CI_SLICES_}; // maybe pad=0, so use OH/OW
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, param_->pad_u_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_l_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, param_->pad_u_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_l_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
arg_cn = 2;
|
||||
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::SetGlobalLocal() {
|
||||
|
@ -205,15 +285,30 @@ void WinogradOpenCLKernel::SetGlobalLocal() {
|
|||
int WinogradOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " winograd Running!";
|
||||
MS_LOG(DEBUG) << "winograd kernel0 Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel1 Running!";
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_);
|
||||
if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel2 Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_);
|
||||
if (ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_) != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
|
|||
|
||||
~WinogradOpenCLKernel() override = default;
|
||||
|
||||
void SetConstArgs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
|
||||
|
@ -42,8 +42,8 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
|
|||
|
||||
private:
|
||||
int BuildKernel() override;
|
||||
void InitFilter() override;
|
||||
void AllocateMemory();
|
||||
int InitFilter() override;
|
||||
int AllocateMemory();
|
||||
|
||||
cl::Kernel kernel_4x4to36_;
|
||||
cl::Kernel kernel_36to4x4_;
|
||||
|
|
|
@ -24,7 +24,7 @@ using mindspore::lite::RET_OK;
|
|||
using mindspore::lite::opencl::ImageSize;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
|
||||
void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
|
||||
std::vector<size_t> internal_global_ws = global;
|
||||
for (size_t i = 0; i < local.size(); ++i) {
|
||||
internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
|
||||
|
@ -50,16 +50,12 @@ int OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std:
|
|||
if (!local.empty()) {
|
||||
local_range_ = cl::NDRange(local.at(0), local.at(1));
|
||||
}
|
||||
} else if (global.size() == 3) {
|
||||
} else if (global.size() >= 3) {
|
||||
global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
|
||||
if (!local.empty()) {
|
||||
local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Not supported NDRange!";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
|
||||
|
@ -112,11 +108,17 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
|
|||
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
|
||||
auto runtime = runtime_wrapper.GetInstance();
|
||||
auto allocator = runtime->GetAllocator();
|
||||
runtime->SyncCommandQueue();
|
||||
if (!runtime->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
}
|
||||
if (mem_type == lite::opencl::MemType::BUF) {
|
||||
allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true);
|
||||
if (allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true) == nullptr) {
|
||||
MS_LOG(ERROR) << "Map Buffer failed.";
|
||||
}
|
||||
memcpy(data.data(), tensor->data_c(), img_info.OriginSize);
|
||||
allocator->UnmapBuffer(tensor->data_c());
|
||||
if (allocator->UnmapBuffer(tensor->data_c()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "UnmapBuffer failed.";
|
||||
}
|
||||
} else {
|
||||
runtime->ReadImage(tensor->data_c(), data.data());
|
||||
}
|
||||
|
|
|
@ -185,7 +185,7 @@ class OpenCLKernel : public InnerKernel {
|
|||
ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
|
||||
}
|
||||
~OpenCLKernel() override = default;
|
||||
int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
|
||||
void AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
|
||||
|
||||
int Prepare() override { return RET_OK; }
|
||||
int PreProcess() override;
|
||||
|
@ -194,7 +194,7 @@ class OpenCLKernel : public InnerKernel {
|
|||
|
||||
virtual int CheckSpecs();
|
||||
virtual int InitWeights() { return RET_OK; }
|
||||
virtual void SetConstArgs() {}
|
||||
virtual int SetConstArgs() { return RET_OK; }
|
||||
virtual void SetGlobalLocal() {}
|
||||
virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
|
||||
virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
|
||||
|
|
|
@ -420,6 +420,7 @@ int OpenCLSubGraph::Execute() {
|
|||
return ret;
|
||||
}
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
|
@ -449,6 +450,7 @@ int OpenCLSubGraph::Execute(const KernelCallBack &before, const KernelCallBack &
|
|||
return ret;
|
||||
}
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
MS_LOG(ERROR) << "SyncCommandQueue failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
|
|
Loading…
Reference in New Issue