forked from mindspore-Ecosystem/mindspore
!6380 fixed opencl program stuck for some device
Merge pull request !6380 from liuchao/master
This commit is contained in:
commit
2db8560a14
|
@ -55,8 +55,7 @@ int ActivationOpenClKernel::Init() {
|
|||
c = in_tensors_[0]->shape()[3];
|
||||
}
|
||||
nhwc_shape_ = {n, h, w, c};
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
if (in_size_ != 2 && in_size_ != 4) {
|
||||
MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_;
|
||||
|
@ -75,9 +74,9 @@ int ActivationOpenClKernel::Init() {
|
|||
|
||||
std::string source = activation_source;
|
||||
std::set<std::string> build_options;
|
||||
ocl_runtime->LoadSource(Program_Kernel[type_][0], source);
|
||||
ocl_runtime_->LoadSource(Program_Kernel[type_][0], source);
|
||||
std::string kernel_name = Program_Kernel[type_][1];
|
||||
ocl_runtime->BuildKernel(kernel_, Program_Kernel[type_][0], kernel_name, build_options);
|
||||
ocl_runtime_->BuildKernel(kernel_, Program_Kernel[type_][0], kernel_name, build_options);
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
in_tensors_[0]->SetFormat(op_format_);
|
||||
|
@ -89,17 +88,16 @@ int ActivationOpenClKernel::Init() {
|
|||
int ActivationOpenClKernel::Run() {
|
||||
MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!";
|
||||
cl_int4 img2d_shape = GetImg2dShape();
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, img2d_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, img2d_shape);
|
||||
if (type_ == ActivationType_LEAKY_RELU) {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
|
||||
}
|
||||
std::vector<size_t> local = {};
|
||||
std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])};
|
||||
auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail.";
|
||||
return RET_ERROR;
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/fp32/activation.h"
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ namespace mindspore::kernel {
|
|||
|
||||
ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {
|
||||
if (weight_ptr_ != nullptr) {
|
||||
auto allocator = runtime_->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
allocator->Free(weight_ptr_);
|
||||
weight_ptr_ = nullptr;
|
||||
}
|
||||
|
@ -106,7 +106,7 @@ int ArithmeticOpenCLKernel::InitBuffer() {
|
|||
const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
|
||||
if (!arithmetic_parameter->broadcasting_) {
|
||||
if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->data_c() != nullptr) {
|
||||
auto allocator = runtime_->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
std::vector<size_t> img_size;
|
||||
GetImageSize(0, &img_size);
|
||||
int pack_weight_size = in_tensors_[1]->ElementsC4Num();
|
||||
|
@ -194,7 +194,6 @@ int ArithmeticOpenCLKernel::InitBuffer() {
|
|||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::Init() {
|
||||
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::string kernel_name;
|
||||
|
||||
const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
|
||||
|
@ -265,7 +264,7 @@ int ArithmeticOpenCLKernel::Init() {
|
|||
|
||||
lite::STATUS error_code = RET_OK;
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = runtime_->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
if (out_mem_type_ == OpenCLMemType::IMG) {
|
||||
kernel_name += "_IMG";
|
||||
|
@ -275,8 +274,8 @@ int ArithmeticOpenCLKernel::Init() {
|
|||
std::string program_name = "Arithmetic";
|
||||
std::set<std::string> build_options;
|
||||
std::string source = arithmetic_source;
|
||||
runtime_->LoadSource(program_name, source);
|
||||
error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
if (error_code != RET_OK) {
|
||||
return error_code;
|
||||
|
@ -302,10 +301,10 @@ int ArithmeticOpenCLKernel::Run() {
|
|||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
|
||||
int arg_idx = 0;
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
if (element_flag_) {
|
||||
void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->data_c() : weight_ptr_;
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, weight);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
|
||||
} else {
|
||||
float weight = 0.f;
|
||||
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
|
||||
|
@ -316,9 +315,9 @@ int ArithmeticOpenCLKernel::Run() {
|
|||
MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type();
|
||||
return RET_ERROR;
|
||||
}
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, weight);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
|
||||
}
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
|
||||
int H = 0;
|
||||
int W = 0;
|
||||
|
@ -336,8 +335,8 @@ int ArithmeticOpenCLKernel::Run() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
cl_int2 output_shape{W, H};
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
#include <vector>
|
||||
#include "src/runtime/kernel/arm/fp32/arithmetic.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
@ -42,7 +41,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
|
|||
int InitBuffer();
|
||||
|
||||
cl::Kernel kernel_;
|
||||
lite::opencl::OpenCLRuntime *runtime_;
|
||||
bool element_flag_{true};
|
||||
void *weight_ptr_{nullptr};
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
|
||||
|
@ -51,8 +50,7 @@ int ArithmeticSelfOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *im
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
|
@ -136,9 +134,8 @@ int ArithmeticSelfOpenCLKernel::Init() {
|
|||
std::set<std::string> build_options;
|
||||
std::string source = arithmeticself_source;
|
||||
std::string program_name = "ArithmeticSelf";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -162,7 +159,6 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s
|
|||
int ArithmeticSelfOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
|
||||
|
||||
|
@ -170,17 +166,17 @@ int ArithmeticSelfOpenCLKernel::Run() {
|
|||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
|
||||
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
#include <string>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/arithmetic_self_parameter.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
#include <set>
|
||||
#include <string>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/batchnorm.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
|
||||
|
@ -40,8 +39,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
|
@ -72,9 +70,8 @@ int BatchNormOpenCLKernel::Init() {
|
|||
std::set<std::string> build_options;
|
||||
std::string source = batchnorm_source;
|
||||
std::string program_name = "Batch_normalization";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -98,7 +95,6 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
|
|||
int BatchNormOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto input0_shape = in_tensors_[0]->shape();
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)};
|
||||
|
@ -107,20 +103,20 @@ int BatchNormOpenCLKernel::Run() {
|
|||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
BatchNormGetWorkGroup(global, &local, max_global[0]);
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); // offest
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); // offest
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/fp32/batchnorm.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/biasadd.h"
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
@ -23,7 +24,6 @@
|
|||
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/biasadd.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/biasadd.cl.inc"
|
||||
|
||||
|
@ -38,7 +38,7 @@ namespace mindspore::kernel {
|
|||
void BiasAddOpenCLKernel::InitBuffer() {
|
||||
int C = in_tensors_[1]->shape()[0];
|
||||
int div_ci = UP_DIV(C, C4NUM);
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
|
@ -57,8 +57,7 @@ int BiasAddOpenCLKernel::Init() {
|
|||
for (int i = 0; i < in_size_; ++i) {
|
||||
input_shape_.s[i + 4 - in_size_] = in_tensors_[0]->shape()[i];
|
||||
}
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
if (in_size_ != 4 && in_size_ != 2) {
|
||||
MS_LOG(ERROR) << "BiasAdd only support dim=4 or 2, but your dim=" << in_size_;
|
||||
|
@ -75,8 +74,8 @@ int BiasAddOpenCLKernel::Init() {
|
|||
std::string source = biasadd_source;
|
||||
std::string program_name = "BiasAdd";
|
||||
std::string kernel_name = "BiasAdd";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -89,18 +88,17 @@ int BiasAddOpenCLKernel::Init() {
|
|||
int BiasAddOpenCLKernel::Run() {
|
||||
cl_int4 global_size = GetGlobalshape();
|
||||
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
int arg_idx = 0;
|
||||
std::map<schema::Format, int> data_type{
|
||||
{schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
|
||||
std::vector<size_t> local = {1, 1};
|
||||
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
|
||||
auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
|
||||
return RET_ERROR;
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
#include "src/tensor.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
|
|
@ -13,13 +13,13 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/concat.h"
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/concat.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
|
||||
|
||||
|
@ -40,8 +40,7 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
|
@ -52,8 +51,7 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
}
|
||||
|
||||
int ConcatOpenCLKernel::RunAxis0() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator_ = ocl_runtime->GetAllocator();
|
||||
auto allocator_ = ocl_runtime_->GetAllocator();
|
||||
std::vector<size_t> img_size;
|
||||
auto dst_data = out_tensors_[0]->data_c();
|
||||
auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
|
@ -64,7 +62,7 @@ int ConcatOpenCLKernel::RunAxis0() {
|
|||
auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
|
||||
cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
|
||||
ocl_runtime->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
|
||||
ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
|
||||
dst_origin[1] += region[1];
|
||||
}
|
||||
return RET_OK;
|
||||
|
@ -112,9 +110,8 @@ int ConcatOpenCLKernel::Init() {
|
|||
std::set<std::string> build_options;
|
||||
std::string source = concat_source;
|
||||
std::string program_name = "Concat";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -155,7 +152,6 @@ int ConcatOpenCLKernel::Run() {
|
|||
return RunAxis0();
|
||||
}
|
||||
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto input1_shape = in_tensors_[0]->shape();
|
||||
auto input2_shape = in_tensors_[1]->shape();
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
|
@ -168,7 +164,7 @@ int ConcatOpenCLKernel::Run() {
|
|||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
ConcatGetWorkGroup(global, &local, max_global[0]);
|
||||
|
@ -176,48 +172,48 @@ int ConcatOpenCLKernel::Run() {
|
|||
|
||||
int arg_cn = 0;
|
||||
if (in_tensors_.size() == 2) {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else if (in_tensors_.size() == 3) {
|
||||
auto input3_shape = in_tensors_[2]->shape();
|
||||
cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
|
||||
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else if (in_tensors_.size() == 4) {
|
||||
auto input3_shape = in_tensors_[2]->shape();
|
||||
auto input4_shape = in_tensors_[3]->shape();
|
||||
cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
|
||||
cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)};
|
||||
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape4_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape4_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else {
|
||||
MS_LOG(ERROR) << " input sizes must 2 or 3 or 4";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/arm/base/concat_base.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -14,12 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
|
||||
#endif
|
||||
|
@ -41,16 +40,15 @@ int Conv2dTransposeOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
std::string kernel_name = "conv2d_transpose2x2_" + std::string(EnumNameFormat(op_format_));
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::string source = conv2d_transpose2x2_source;
|
||||
std::set<std::string> build_options;
|
||||
std::string program_name = "conv2d_transpose2x2";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
PadWeight();
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
|
@ -71,7 +69,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
|
|||
int kw = param->kernel_w_;
|
||||
int div_ci = UP_DIV(ci, C4NUM);
|
||||
int div_co = UP_DIV(co, C4NUM);
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
|
||||
|
||||
// IHWO to OHWI4(I)4(O)(converter format is IHWO)
|
||||
|
@ -188,7 +186,6 @@ int Conv2dTransposeOpenCLKernel::Run() {
|
|||
int ow = out_tensors_[0]->shape()[2];
|
||||
int h = in_tensors_[0]->shape()[1];
|
||||
int w = in_tensors_[0]->shape()[2];
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
// local size should less than MAX_GROUP_SIZE
|
||||
std::vector<size_t> local = {16, 1, 16};
|
||||
std::vector<size_t> global = {UP_ROUND((size_t)UP_ROUND(oh / 2, 2), local[0]),
|
||||
|
@ -200,16 +197,16 @@ int Conv2dTransposeOpenCLKernel::Run() {
|
|||
cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
|
||||
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
|
||||
int arg_cnt = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "src/lite_kernel.h"
|
||||
#include "nnacl/conv_parameter.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -39,12 +39,11 @@ constexpr size_t CO_TILE = C4NUM;
|
|||
|
||||
int ConvolutionOpenCLKernel::Init() {
|
||||
static int init_count = 0;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
std::set<std::string> build_options;
|
||||
init_count++;
|
||||
use_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
use_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
|
||||
if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
|
||||
MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
|
||||
|
@ -76,21 +75,21 @@ int ConvolutionOpenCLKernel::Init() {
|
|||
MS_LOG(DEBUG) << "use winograd";
|
||||
std::string program_name;
|
||||
program_name = "Winograd4x4To36" + std::to_string(init_count);
|
||||
ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
|
||||
ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
|
||||
ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36());
|
||||
ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
|
||||
|
||||
program_name = "WinogradConvolution" + std::to_string(init_count);
|
||||
ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
|
||||
ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
|
||||
ocl_runtime_->LoadSource(program_name, CodeGenWinogradConvolution());
|
||||
ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
|
||||
|
||||
program_name = "Winograd36To4x4" + std::to_string(init_count);
|
||||
ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
|
||||
ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
|
||||
ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4());
|
||||
ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
|
||||
} else {
|
||||
std::string program_name = "convolution" + std::to_string(init_count);
|
||||
std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
|
||||
}
|
||||
|
||||
// allocate winograd memory
|
||||
|
@ -167,7 +166,7 @@ int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
|
|||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::InitWeight() {
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// allocate memory
|
||||
size_t packed_weight_size;
|
||||
|
@ -205,8 +204,7 @@ int ConvolutionOpenCLKernel::InitWeight() {
|
|||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::InitBias() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// align bias from C to C4
|
||||
auto bias_tensor = in_tensors_[2];
|
||||
|
@ -272,57 +270,56 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s
|
|||
|
||||
int ConvolutionOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
int arg_cn = 0;
|
||||
if (use_winograd_) {
|
||||
arg_cn = 0;
|
||||
cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
|
||||
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
|
||||
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
|
||||
|
||||
arg_cn = 0;
|
||||
cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
|
||||
cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
|
||||
|
||||
arg_cn = 0;
|
||||
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
|
||||
cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
|
||||
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
|
||||
} else {
|
||||
arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
if (op_format_ == Format_NC4HW4) {
|
||||
cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
|
||||
cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
|
||||
}
|
||||
}
|
||||
|
||||
if (use_winograd_) {
|
||||
ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
|
||||
ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
|
||||
ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
|
||||
} else {
|
||||
std::vector<size_t> global, local;
|
||||
SetGlobalLocalConv(&global, &local);
|
||||
ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_conv_, global, local, nullptr);
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
|
@ -819,10 +816,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
|
|||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
constexpr size_t work_group_size[] = {4, 4, 1};
|
||||
auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
|
||||
size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
|
||||
auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize();
|
||||
size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())());
|
||||
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
|
||||
|
||||
size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
|
||||
|
|
|
@ -22,7 +22,6 @@
|
|||
#include "src/tensor.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/conv_parameter.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
#include <map>
|
||||
#include <utility>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "nnacl/op_base.h"
|
||||
|
@ -42,7 +41,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
|
|||
namespace mindspore::kernel {
|
||||
|
||||
int DepthwiseConv2dOpenCLKernel::Init() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::string kernel_name = "DepthwiseConv2d";
|
||||
auto in_format = op_format_;
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
|
@ -69,13 +67,13 @@ int DepthwiseConv2dOpenCLKernel::Init() {
|
|||
kernel_name += "_1x1";
|
||||
}
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::string program_name = "DepthwiseConv2d";
|
||||
std::set<std::string> build_options;
|
||||
std::string source = depthwise_conv2d_source;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
this->InitBuffer();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
|
||||
|
@ -84,9 +82,8 @@ int DepthwiseConv2dOpenCLKernel::Init() {
|
|||
|
||||
int DepthwiseConv2dOpenCLKernel::InitBuffer() {
|
||||
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
bool is_fp16 = ocl_runtime->GetFp16Enable();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
bool is_fp16 = ocl_runtime_->GetFp16Enable();
|
||||
|
||||
// weight: o, h, w, i; o == group, i == 1
|
||||
void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
|
||||
|
@ -162,7 +159,7 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
|
||||
if (ocl_runtime_->GetFp16Enable()) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
img_size->clear();
|
||||
|
@ -189,7 +186,6 @@ int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size
|
|||
int DepthwiseConv2dOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
||||
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
|
||||
std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4};
|
||||
|
@ -207,19 +203,19 @@ int DepthwiseConv2dOpenCLKernel::Run() {
|
|||
(cl_int)out_tensors_[0]->Batch()};
|
||||
|
||||
int arg_cnt = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dilation);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/conv_parameter.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#include <set>
|
||||
#include <utility>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/gather.h"
|
||||
#include "src/runtime/kernel/opencl/cl/gather.cl.inc"
|
||||
|
||||
|
@ -49,9 +48,8 @@ int GatherOpenCLKernel::Init() {
|
|||
std::set<std::string> build_options;
|
||||
std::string source = gather_source;
|
||||
std::string program_name = "gather";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
// init indices_data_
|
||||
auto indices_tensor = in_tensors_.at(1);
|
||||
int indices_num = indices_tensor->ElementsNum();
|
||||
|
@ -104,8 +102,7 @@ int GatherOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
|
@ -117,7 +114,6 @@ int GatherOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
int GatherOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
if (InitBuffer() != RET_OK) {
|
||||
return RET_ERROR;
|
||||
|
@ -134,14 +130,14 @@ int GatherOpenCLKernel::Run() {
|
|||
std::vector<size_t> local = {1, 1, 1};
|
||||
std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4};
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, src_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, dst_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, indices_num);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_num);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/gather_parameter.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#include <map>
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/matmul.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
|
||||
|
@ -35,7 +34,6 @@ namespace mindspore::kernel {
|
|||
int MatMulOpenCLKernel::Init() {
|
||||
std::string kernel_name = "MatMul";
|
||||
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
|
||||
transposeA = param->a_transpose_;
|
||||
if (transposeA) {
|
||||
|
@ -43,7 +41,7 @@ int MatMulOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
transposeB = param->b_transpose_;
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() ||
|
||||
(in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) {
|
||||
MS_LOG(ERROR) << "matmul only support input shape size=2 or 4.";
|
||||
|
@ -57,13 +55,13 @@ int MatMulOpenCLKernel::Init() {
|
|||
std::map<int, std::string> dims2str = {{2, "_2d"}, {4, "_4d"}};
|
||||
kernel_name += dims2str[dims];
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
std::string source = matmul_source;
|
||||
std::string program_name = "MatMul";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
|
||||
PadWeight();
|
||||
|
@ -79,7 +77,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
|
|||
|
||||
void MatMulOpenCLKernel::PadWeight() {
|
||||
// ABMCI @ ABCICO = ABMCO
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
int ci = inShape[3];
|
||||
int ci4 = UP_DIV(ci, C4NUM);
|
||||
int co = outShape[3];
|
||||
|
@ -201,7 +199,6 @@ int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
|
||||
int MatMulOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
// local size should less than MAX_GROUP_SIZE
|
||||
std::vector<size_t> local = {32, 4, 1};
|
||||
std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
|
||||
|
@ -210,14 +207,14 @@ int MatMulOpenCLKernel::Run() {
|
|||
int arg_count = 0;
|
||||
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
|
||||
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/matmul_parameter.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
|
|
@ -20,8 +20,6 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/opencl/opencl_wrapper.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/image_format.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
|
||||
|
@ -59,10 +57,9 @@ int PoolingOpenCLKernel::Init() {
|
|||
MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!";
|
||||
return RET_INVALID_OP_NAME;
|
||||
}
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
|
||||
if (out_mem_type_ == OpenCLMemType::BUF) {
|
||||
|
@ -72,8 +69,8 @@ int PoolingOpenCLKernel::Init() {
|
|||
kernel_name += "_IMG";
|
||||
}
|
||||
std::set<std::string> build_options;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -124,7 +121,6 @@ int PoolingOpenCLKernel::ReSize() { return RET_OK; }
|
|||
|
||||
int PoolingOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
|
||||
|
@ -135,21 +131,21 @@ int PoolingOpenCLKernel::Run() {
|
|||
cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};
|
||||
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, stride);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, kernel_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, padding);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
|
||||
|
||||
std::vector<size_t> local_size;
|
||||
std::vector<size_t> global_size = InitGlobalSize();
|
||||
int max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
|
||||
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
|
||||
local_size = GetCommonLocalSize(global_size, max_work_group_size);
|
||||
global_size = GetCommonGlobalSize(local_size, global_size);
|
||||
|
||||
ocl_runtime->RunKernel(kernel_, global_size, local_size, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/fp32/pooling.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/prelu.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/prelu.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -36,7 +35,7 @@ using mindspore::schema::PrimitiveType_PReLU;
|
|||
namespace mindspore::kernel {
|
||||
|
||||
void PReluOpenCLKernel::InitBuffer() {
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
int elem_num = in_tensors_[0]->shape().size() == 2 ? in_tensors_[0]->shape()[1] : in_tensors_[0]->shape()[3];
|
||||
int elem_num_c4 = UP_DIV(elem_num, C4NUM);
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
|
@ -91,12 +90,11 @@ int PReluOpenCLKernel::Init() {
|
|||
std::string source = prelu_source;
|
||||
std::string program_name = "PRelu";
|
||||
std::string kernel_name = "PRelu";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||
InitBuffer();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
in_tensors_[0]->SetFormat(op_format_);
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -107,18 +105,17 @@ int PReluOpenCLKernel::Init() {
|
|||
|
||||
int PReluOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::map<schema::Format, int> data_type{{schema::Format::Format_NHWC4, 1}, {schema::Format::Format_NC4HW4, 2}};
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, PReluWeight_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, reinterpret_cast<int>(in_tensors_[1]->shape()[0]));
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, PReluWeight_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, reinterpret_cast<int>(in_tensors_[1]->shape()[0]));
|
||||
std::vector<size_t> local = {1, 1};
|
||||
std::vector<size_t> global = {static_cast<size_t>(global_shape_.s[1]), static_cast<size_t>(global_shape_.s[2])};
|
||||
auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
|
||||
return RET_ERROR;
|
||||
|
|
|
@ -22,7 +22,6 @@
|
|||
#include "src/tensor.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#include <map>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/reduce.h"
|
||||
#include "src/runtime/kernel/opencl/cl/reduce.cl.inc"
|
||||
|
||||
|
@ -59,8 +58,7 @@ int ReduceOpenCLKernel::Init() {
|
|||
}
|
||||
std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
|
||||
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
|
||||
if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
|
||||
MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
|
||||
|
@ -68,12 +66,12 @@ int ReduceOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
std::string source = reduce_source;
|
||||
ocl_runtime->LoadSource(kernel_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(kernel_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
|
||||
#endif
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -130,15 +128,14 @@ int ReduceOpenCLKernel::Run() {
|
|||
int w = shapex[2];
|
||||
int c = shapex[3];
|
||||
int c4 = UP_DIV(c, C4NUM);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> local = {};
|
||||
std::vector<size_t> global = {static_cast<size_t>(c4)};
|
||||
cl_int4 size = {h, w, c4, 1};
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/reduce_parameter.h"
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
#include <string>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/reshape.h"
|
||||
#include "src/runtime/kernel/opencl/cl/reshape.cl.inc"
|
||||
|
||||
|
@ -34,8 +33,7 @@ namespace mindspore::kernel {
|
|||
int ReshapeOpenCLKernel::Init() {
|
||||
std::string kernel_name = "reshape";
|
||||
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (out_tensors_[0]->shape().size() != 2 && out_tensors_[0]->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << "Reshape output size should in 2,4";
|
||||
return RET_ERROR;
|
||||
|
@ -46,13 +44,13 @@ int ReshapeOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
std::string source = reshape_source;
|
||||
std::string program_name = "reshape";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -112,17 +110,16 @@ int ReshapeOpenCLKernel::Run() {
|
|||
oh = out_tensors_[0]->shape()[1];
|
||||
ow = out_tensors_[0]->shape()[2];
|
||||
}
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> local = {};
|
||||
std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4};
|
||||
cl_int4 size = {h, w, c4, 1};
|
||||
cl_int4 size_out = {oh, ow, c4, 1};
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size_out);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size_out);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -245,7 +245,6 @@ int ScaleOpenCLKernel::InitBuffer() {
|
|||
}
|
||||
|
||||
int ScaleOpenCLKernel::Init() {
|
||||
ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::string kernel_name;
|
||||
|
||||
const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
#include <vector>
|
||||
#include "nnacl/scale.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
@ -42,7 +41,6 @@ class ScaleOpenCLKernel : public OpenCLKernel {
|
|||
int InitBuffer();
|
||||
|
||||
cl::Kernel kernel_;
|
||||
lite::opencl::OpenCLRuntime *ocl_runtime_;
|
||||
bool element_flag_{true};
|
||||
void *scale_ptr_{nullptr};
|
||||
void *offset_ptr_{nullptr};
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/slice.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
|
||||
|
@ -40,8 +39,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
|||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
|
@ -71,9 +69,8 @@ int SliceOpenCLKernel::Init() {
|
|||
std::set<std::string> build_options;
|
||||
std::string source = slice_source;
|
||||
std::string program_name = "slice";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -96,7 +93,6 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
|
|||
int SliceOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto input_shape = in_tensors_[0]->shape();
|
||||
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
|
||||
cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
|
||||
|
@ -105,18 +101,18 @@ int SliceOpenCLKernel::Run() {
|
|||
uint32_t OH = param->size_[1];
|
||||
uint32_t OW = param->size_[2];
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {1, OH, OW};
|
||||
SlcieGetWorkGroup(global, &local, max_global[0]);
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, size_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, begin_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/fp32/slice.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#include <set>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
|
||||
|
@ -51,7 +50,7 @@ int SoftmaxOpenCLKernel::InitGlobalSize() {
|
|||
int SoftmaxOpenCLKernel::SetWorkGroupSize() {
|
||||
// set work group size
|
||||
InitGlobalSize();
|
||||
int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
|
||||
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
|
||||
local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
|
||||
global_size_ = GetCommonGlobalSize(local_size_, global_size_);
|
||||
return lite::RET_OK;
|
||||
|
@ -101,8 +100,7 @@ int SoftmaxOpenCLKernel::Init() {
|
|||
std::string program_name = "SoftMax";
|
||||
|
||||
std::string source = softmax_source;
|
||||
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = runtime_->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
// framework not set this param yet! just use default.
|
||||
if (in_tensors_[0]->shape().size() == 4) {
|
||||
// support 4d tensor
|
||||
|
@ -133,8 +131,8 @@ int SoftmaxOpenCLKernel::Init() {
|
|||
program_name += "_IMG";
|
||||
}
|
||||
std::set<std::string> build_options;
|
||||
runtime_->LoadSource(program_name, source);
|
||||
runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
|
@ -158,32 +156,32 @@ int SoftmaxOpenCLKernel::Run() {
|
|||
auto mask_ = GetMaskForLastChannel(channel_size);
|
||||
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
|
||||
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
if (is_image_out_) {
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
} else {
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
}
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, mask);
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, slices);
|
||||
runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, slices);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
|
||||
SetWorkGroupSize1x1();
|
||||
} else {
|
||||
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
|
||||
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
if (is_image_out_) {
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
} else {
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
}
|
||||
runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
|
||||
SetWorkGroupSize();
|
||||
}
|
||||
|
||||
// run opengl kernel
|
||||
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "nnacl/fp32/softmax.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
@ -46,7 +45,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
|
|||
private:
|
||||
cl::Kernel kernel_;
|
||||
SoftmaxParameter *parameter_;
|
||||
lite::opencl::OpenCLRuntime *runtime_;
|
||||
|
||||
bool onexone_flag_{false};
|
||||
std::vector<size_t> local_size_;
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
#include <utility>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/to_format.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -33,7 +32,6 @@ using mindspore::schema::PrimitiveType_ToFormat;
|
|||
namespace mindspore::kernel {
|
||||
|
||||
int ToFormatOpenCLKernel::Init() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
|
||||
out_mem_type_ = parameter->out_mem_type;
|
||||
std::string program_name = "to_format";
|
||||
|
@ -53,12 +51,12 @@ int ToFormatOpenCLKernel::Init() {
|
|||
|
||||
this->set_name(kernel_name);
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
std::string source = to_format_source;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
InitNHWCShape();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
|
@ -147,7 +145,7 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
|||
return RET_ERROR;
|
||||
}
|
||||
img_size->clear();
|
||||
auto enable_fp16_ = lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable();
|
||||
auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
|
@ -158,7 +156,6 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
|||
}
|
||||
int ToFormatOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> local = {};
|
||||
std::vector<size_t> global;
|
||||
GetGlobalSize(0, &global);
|
||||
|
@ -167,11 +164,11 @@ int ToFormatOpenCLKernel::Run() {
|
|||
cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
|
||||
auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
|
||||
auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
|
||||
ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
|
||||
ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, gsize);
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, shape);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
|
||||
ocl_runtime_->SetKernelArg(kernel_, 3, shape);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include <vector>
|
||||
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
#include <string>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/transpose.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
|
||||
|
@ -34,8 +33,7 @@ namespace mindspore::kernel {
|
|||
|
||||
int TransposeOpenCLKernel::Init() {
|
||||
std::string kernel_name = "transpose";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
enable_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
|
||||
if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
|
||||
param->perm_[3] == 2) {
|
||||
|
@ -52,13 +50,13 @@ int TransposeOpenCLKernel::Init() {
|
|||
kernel_name += "_IMG";
|
||||
}
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
std::string source = transpose_source;
|
||||
std::string program_name = "transpose";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) {
|
||||
MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
|
||||
|
@ -114,24 +112,23 @@ int TransposeOpenCLKernel::Run() {
|
|||
int c = shapex[3];
|
||||
int c4 = UP_DIV(c, 4);
|
||||
int hw4 = UP_DIV(h * w, 4);
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> local = {16, 16};
|
||||
std::vector<size_t> global = {UP_ROUND(hw4, local[0]), UP_ROUND(c4, local[1])};
|
||||
|
||||
cl_int2 HW = {h * w, hw4};
|
||||
cl_int2 C = {c, c4};
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
if (out_mem_type_ == OpenCLMemType::BUF) {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
|
||||
} else {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
}
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, C);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, w);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, h);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, HW);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, C);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, w);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, h);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "src/lite_kernel.h"
|
||||
#include "nnacl/transpose.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
@ -36,7 +37,16 @@ class OpenCLKernel : public LiteKernel {
|
|||
public:
|
||||
explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs)
|
||||
: LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {}
|
||||
: LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {
|
||||
ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
|
||||
~OpenCLKernel() {
|
||||
if (ocl_runtime_ != nullptr) {
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
ocl_runtime_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
virtual int Init() { return RET_ERROR; }
|
||||
virtual int Prepare() { return RET_ERROR; }
|
||||
|
@ -59,6 +69,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
schema::Format in_ori_format_{schema::Format::Format_NHWC};
|
||||
schema::Format out_ori_format_{schema::Format::Format_NHWC4};
|
||||
schema::Format op_format_{schema::Format::Format_NHWC4};
|
||||
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
|
|||
|
||||
out_tensors->emplace_back(new_tensor);
|
||||
KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
|
||||
if (mem_type == OpenCLMemType::IMG && lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
|
||||
if (mem_type == OpenCLMemType::IMG && ocl_runtime_->GetFp16Enable()) {
|
||||
desc.data_type = kNumberTypeFloat16;
|
||||
new_tensor->set_data_type(kNumberTypeFloat16);
|
||||
}
|
||||
|
@ -160,7 +160,8 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
|
|||
}
|
||||
|
||||
int SubGraphOpenCLKernel::Init() {
|
||||
allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
allocator_ = ocl_runtime_->GetAllocator();
|
||||
MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
|
||||
for (const auto tensor : in_tensors_) {
|
||||
tensor->set_allocator(allocator_);
|
||||
|
@ -195,8 +196,7 @@ int SubGraphOpenCLKernel::Init() {
|
|||
}
|
||||
|
||||
int SubGraphOpenCLKernel::UpdateTensorDataType() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
bool is_fp16 = ocl_runtime->GetFp16Enable();
|
||||
bool is_fp16 = ocl_runtime_->GetFp16Enable();
|
||||
if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
|
||||
std::set<lite::Tensor *> out_set;
|
||||
out_set.insert(in_tensors_.begin(), in_tensors_.end());
|
||||
|
@ -292,16 +292,25 @@ int SubGraphOpenCLKernel::UnInit() {
|
|||
delete tensor;
|
||||
}
|
||||
}
|
||||
in_convert_tensors_.clear();
|
||||
for (const auto &tensor : out_convert_tensors_) {
|
||||
if (tensor != nullptr) {
|
||||
delete tensor;
|
||||
}
|
||||
}
|
||||
for (const auto &op : in_convert_ops_) {
|
||||
out_convert_tensors_.clear();
|
||||
for (const auto &op : nodes_) {
|
||||
if (op != nullptr) {
|
||||
delete op;
|
||||
}
|
||||
}
|
||||
nodes_.clear();
|
||||
in_convert_ops_.clear();
|
||||
out_convert_ops_.clear();
|
||||
if (ocl_runtime_ != nullptr) {
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
ocl_runtime_ = nullptr;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -310,14 +319,13 @@ int SubGraphOpenCLKernel::InferShape() { return RET_OK; }
|
|||
int SubGraphOpenCLKernel::ReSize() { return RET_OK; }
|
||||
|
||||
int SubGraphOpenCLKernel::Run() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
for (auto &tensor : in_tensors_) {
|
||||
allocator_->UnmapBuffer(tensor->data_c());
|
||||
}
|
||||
|
||||
lite::opencl::OpenCLExecutor executor;
|
||||
executor.Run(in_tensors_, out_tensors_, nodes_, allocator_);
|
||||
ocl_runtime->SyncCommandQueue();
|
||||
ocl_runtime_->SyncCommandQueue();
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -64,6 +64,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
|
|||
std::vector<OpenCLToFormatParameter *> out_parameters_;
|
||||
std::vector<LiteKernel *> in_convert_ops_;
|
||||
std::vector<LiteKernel *> out_convert_ops_;
|
||||
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -23,8 +23,6 @@
|
|||
|
||||
namespace mindspore::lite::opencl {
|
||||
|
||||
OpenCLAllocator::OpenCLAllocator() {}
|
||||
|
||||
OpenCLAllocator::OpenCLAllocator(OpenCLRuntime *ocl_runtime) : ocl_runtime_(ocl_runtime) {}
|
||||
|
||||
OpenCLAllocator::~OpenCLAllocator() { Clear(); }
|
||||
|
@ -49,9 +47,6 @@ void OpenCLAllocator::UnLock() {
|
|||
void *OpenCLAllocator::Malloc(size_t size) { return Malloc(size, std::vector<size_t>{}); }
|
||||
|
||||
void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) {
|
||||
if (ocl_runtime_ == nullptr) {
|
||||
ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
|
||||
|
||||
size_t img_pitch = 0;
|
||||
|
@ -144,9 +139,6 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
|
|||
MS_LOG(ERROR) << "MallocData out of max_size, size: " << size;
|
||||
return nullptr;
|
||||
}
|
||||
if (ocl_runtime_ == nullptr) {
|
||||
ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
Lock();
|
||||
auto iter = free_list_.lower_bound(size);
|
||||
while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
|
||||
|
@ -258,9 +250,6 @@ void *OpenCLAllocator::GetBuffer(void *buffer) {
|
|||
|
||||
void OpenCLAllocator::Clear() {
|
||||
Lock();
|
||||
if (ocl_runtime_ == nullptr) {
|
||||
ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
|
||||
for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
|
||||
if (svm_capabilities) {
|
||||
|
@ -306,9 +295,6 @@ void OpenCLAllocator::Clear() {
|
|||
}
|
||||
|
||||
void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) {
|
||||
if (ocl_runtime_ == nullptr) {
|
||||
ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
|
||||
if (svm_capabilities) {
|
||||
if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
|
||||
|
@ -362,9 +348,6 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
|
|||
}
|
||||
|
||||
int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) {
|
||||
if (ocl_runtime_ == nullptr) {
|
||||
ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
|
||||
}
|
||||
auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
|
||||
if (svm_capabilities) {
|
||||
if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
|
||||
|
|
|
@ -45,7 +45,6 @@ enum class MemType : char { SVM, BUF, IMG };
|
|||
|
||||
class OpenCLAllocator : public Allocator {
|
||||
public:
|
||||
OpenCLAllocator();
|
||||
explicit OpenCLAllocator(OpenCLRuntime *ocl_runtime);
|
||||
~OpenCLAllocator() override;
|
||||
void SetContext(const AllocatorContext &ctx) override;
|
||||
|
|
|
@ -27,7 +27,11 @@
|
|||
namespace mindspore::lite::opencl {
|
||||
class OpenCLExecutor : Executor {
|
||||
public:
|
||||
OpenCLExecutor() : Executor() { allocator_ = OpenCLRuntime::GetInstance()->GetAllocator(); }
|
||||
OpenCLExecutor() : Executor() {
|
||||
auto ocl_runtime = OpenCLRuntime::GetInstance();
|
||||
allocator_ = ocl_runtime->GetAllocator();
|
||||
OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
||||
int Prepare(const std::vector<kernel::LiteKernel *> &kernels);
|
||||
|
||||
|
|
|
@ -244,7 +244,7 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<Tensor *> &in_tens
|
|||
TypeId data_type = GetFirstFp32Fp16OrInt8Type(in_tensors);
|
||||
kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, static_cast<schema::PrimitiveType>(primitive->Type())};
|
||||
#if SUPPORT_GPU
|
||||
if (context_->device_type_ == DT_GPU && lite::opencl::OpenCLRuntime::GetInstance()->IsInitOK()) {
|
||||
if (context_->device_type_ == DT_GPU) {
|
||||
desc.arch = kernel::KERNEL_ARCH::kGPU;
|
||||
auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, primitive, context_, desc);
|
||||
if (kernel != nullptr) {
|
||||
|
|
|
@ -157,7 +157,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
|
|||
ret = sub_graph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init sub_graph error.";
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -167,7 +166,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
|
|||
MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
|
||||
ret = sub_graph->Run();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -182,7 +180,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
|
|||
printf_tensor<float>("ReluFp32--output data--", outputs[0]);
|
||||
CompareRes<float>(output_tensor, out_file);
|
||||
}
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -271,7 +268,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
|
|||
ret = sub_graph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init sub_graph error.";
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -281,7 +277,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
|
|||
MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
|
||||
ret = sub_graph->Run();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -297,7 +292,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
|
|||
printf_tensor<float>("Relu6:FP32--output data---", outputs[0]);
|
||||
CompareRes<float>(output_tensor, out_file);
|
||||
}
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -386,7 +380,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
|
|||
ret = sub_graph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init sub_graph error.";
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -396,7 +389,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
|
|||
MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
|
||||
ret = sub_graph->Run();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -412,7 +404,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
|
|||
printf_tensor<float>("Sigmoid:FP32--output data---", outputs[0]);
|
||||
CompareRes<float>(output_tensor, out_file);
|
||||
}
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -502,7 +493,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
|
|||
ret = sub_graph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init sub_graph error.";
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -512,7 +502,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
|
|||
MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
|
||||
ret = sub_graph->Run();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -527,7 +516,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
|
|||
printf_tensor<float>("Leaky Relu:FP32--output data---", outputs[0]);
|
||||
CompareRes<float>(output_tensor, out_file);
|
||||
}
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -616,7 +604,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
|
|||
ret = sub_graph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init sub_graph error.";
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -626,7 +613,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
|
|||
MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
|
||||
ret = sub_graph->Run();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
@ -642,7 +628,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
|
|||
printf_tensor<float>("Tanh:FP32--output data---", outputs[0]);
|
||||
CompareRes<float>(output_tensor, out_file);
|
||||
}
|
||||
delete kernel;
|
||||
delete param;
|
||||
delete input_tensor;
|
||||
delete output_tensor;
|
||||
|
|
|
@ -127,7 +127,6 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
|
|||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete arithmeticself_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -203,7 +203,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh
|
|||
delete[] data_c_ocl;
|
||||
|
||||
delete kernel;
|
||||
delete arith_kernel;
|
||||
delete param;
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
|
|
|
@ -147,7 +147,6 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
|
|||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete batchnorm_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
|
||||
|
|
|
@ -174,7 +174,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
|
|||
delete weight_tensor;
|
||||
delete sub_graph;
|
||||
delete param;
|
||||
delete biasadd_kernel;
|
||||
return;
|
||||
}
|
||||
MS_LOG(INFO) << "Sub graph begin running!";
|
||||
|
@ -186,7 +185,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
|
|||
delete weight_tensor;
|
||||
delete sub_graph;
|
||||
delete param;
|
||||
delete biasadd_kernel;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -202,7 +200,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
|
|||
delete output_tensor;
|
||||
delete sub_graph;
|
||||
delete param;
|
||||
delete biasadd_kernel;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -164,7 +164,6 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete concat_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
|
||||
|
@ -284,7 +283,6 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete concat_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -78,7 +78,6 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<in
|
|||
std::cout << "==================output data================" << std::endl;
|
||||
auto *output_data = reinterpret_cast<T *>(outputs[0]->data_c());
|
||||
CommonTest::CompareOutputData<T>(output_data, static_cast<T*>(correct_data), outputs[0]->ElementsNum(), 0.0001);
|
||||
delete pkernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
TEST_F(TestGatherOpenCL, Axis1Fp32) {
|
||||
|
|
|
@ -167,7 +167,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
|
|||
delete output_tensor;
|
||||
delete weight_tensor;
|
||||
delete param;
|
||||
delete prelu_kernel;
|
||||
delete sub_graph;
|
||||
return;
|
||||
}
|
||||
|
@ -179,7 +178,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
|
|||
delete output_tensor;
|
||||
delete weight_tensor;
|
||||
delete param;
|
||||
delete prelu_kernel;
|
||||
delete sub_graph;
|
||||
return;
|
||||
}
|
||||
|
@ -195,7 +193,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
|
|||
delete output_tensor;
|
||||
delete weight_tensor;
|
||||
delete param;
|
||||
delete prelu_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
|
|
@ -223,7 +223,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh
|
|||
delete[] data_out_ocl;
|
||||
|
||||
delete kernel;
|
||||
delete scale_kernel;
|
||||
delete param;
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
|
|
|
@ -143,7 +143,6 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) {
|
|||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
delete slice_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
|
||||
|
@ -251,7 +250,6 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
|
|||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
delete slice_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue