!9215 fix opencl depthwise 1x1
From: @ddwsky Reviewed-by: @zhanghaibo5,@HilbertDavid Signed-off-by: @HilbertDavid
This commit is contained in:
commit
339f34a08c
|
@ -182,37 +182,55 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __
|
|||
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
|
||||
}
|
||||
}
|
||||
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data,
|
||||
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
|
||||
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
|
||||
float relu_clip_min, float relu_clip_max) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2) * 2;
|
||||
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data,
|
||||
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size,
|
||||
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
|
||||
float relu_clip_min, float relu_clip_max) {
|
||||
int X = get_global_id(1) * 2;
|
||||
int Y = get_global_id(2) * 2;
|
||||
int Z = get_global_id(0);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
FLT4 r[2] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
|
||||
FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int fx_c = Z;
|
||||
int f_len = kernel_size.x * kernel_size.y;
|
||||
int fx_c = Z * f_len;
|
||||
bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
|
||||
bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
|
||||
int y_c = y_offset;
|
||||
bool outside_y = y_c < 0 || y_c >= src_size.y;
|
||||
int y_c_a1 = y_c + stride.y;
|
||||
int x_c = x_offset;
|
||||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 flt_p0 = filter[fx_c];
|
||||
FLT4 flt_p1 = filter[fx_c + 1];
|
||||
FLT4 src_p0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
|
||||
FLT4 src_p1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c));
|
||||
r[0] += TO_FLT4(src_p0 * flt_p0);
|
||||
r[1] += TO_FLT4(src_p1 * flt_p1);
|
||||
}
|
||||
int x_c_a1 = x_c + stride.x;
|
||||
int x_sign = x_c < 0 ? -1 : 1;
|
||||
int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
|
||||
FLT4 flt_p0 = filter[fx_c];
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
|
||||
r[0] += TO_FLT4(src_p00_c0 * flt_p0);
|
||||
FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
|
||||
r[1] += TO_FLT4(src_p01_c0 * flt_p0);
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
|
||||
r[2] += TO_FLT4(src_p10_c0 * flt_p0);
|
||||
FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
|
||||
r[3] += TO_FLT4(src_p11_c0 * flt_p0);
|
||||
|
||||
r[0] += bias[Z];
|
||||
r[1] += bias[Z];
|
||||
r[2] += bias[Z];
|
||||
r[3] += bias[Z];
|
||||
r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
|
||||
r[1] += bias[Z + 1];
|
||||
r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
|
||||
r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
|
||||
r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
|
||||
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]);
|
||||
if ((dst_size.z & 0x1) == 0) {
|
||||
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]);
|
||||
if (!last_x) {
|
||||
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[1]);
|
||||
}
|
||||
if (!last_y) {
|
||||
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[2]);
|
||||
}
|
||||
if (!last_y && !last_x) {
|
||||
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
|
||||
}
|
||||
}
|
||||
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,
|
||||
|
|
|
@ -35,6 +35,35 @@ using mindspore::schema::PrimitiveType_BiasAdd;
|
|||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int BiasAddOpenCLKernel::CheckSpecs() {
|
||||
if (in_tensors_.size() == 0) {
|
||||
MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << in_tensors_.size();
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (in_tensors_[0]->shape()[0] > 1) {
|
||||
MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void BiasAddOpenCLKernel::SetConstArgs() {
|
||||
int arg_idx = 2;
|
||||
std::map<schema::Format, int> data_type{
|
||||
{schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
|
||||
}
|
||||
|
||||
void BiasAddOpenCLKernel::SetGlobalLocal() {
|
||||
cl_int4 global_size = input_shape_;
|
||||
global_size.s[2] = UP_DIV(global_size.s[3], C4NUM) * global_size.s[2];
|
||||
std::vector<size_t> local = {1, 1};
|
||||
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
|
||||
OpenCLKernel::AlignGlobalLocal(global, local);
|
||||
}
|
||||
|
||||
int BiasAddOpenCLKernel::InitWeights() {
|
||||
int C = in_tensors_[1]->shape()[0];
|
||||
int div_ci = UP_DIV(C, C4NUM);
|
||||
|
@ -52,7 +81,7 @@ int BiasAddOpenCLKernel::InitWeights() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int BiasAddOpenCLKernel::Init() {
|
||||
int BiasAddOpenCLKernel::Prepare() {
|
||||
in_size_ = in_tensors_[0]->shape().size();
|
||||
out_size_ = out_tensors_[0]->shape().size();
|
||||
for (int i = 0; i < in_size_; ++i) {
|
||||
|
@ -77,67 +106,27 @@ int BiasAddOpenCLKernel::Init() {
|
|||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||
|
||||
auto ret = InitWeights();
|
||||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
MS_LOG(DEBUG) << program_name << " Init Done!";
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
int BiasAddOpenCLKernel::Run() {
|
||||
cl_int4 global_size = GetGlobalshape();
|
||||
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
|
||||
int arg_idx = 0;
|
||||
std::map<schema::Format, int> data_type{
|
||||
{schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
|
||||
std::vector<size_t> local = {1, 1};
|
||||
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
|
||||
if (ret != mindspore::lite::RET_OK) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
|
||||
return mindspore::lite::RET_ERROR;
|
||||
return RET_ERROR;
|
||||
}
|
||||
return mindspore::lite::RET_OK;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
cl_int4 BiasAddOpenCLKernel::GetGlobalshape() {
|
||||
cl_int4 global_shape = input_shape_;
|
||||
global_shape.s[2] = UP_DIV(global_shape.s[3], C4NUM) * global_shape.s[2];
|
||||
return global_shape;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
const lite::PrimitiveC *primitive) {
|
||||
if (inputs.size() == 0) {
|
||||
MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
|
||||
free(opParameter);
|
||||
return nullptr;
|
||||
}
|
||||
if (inputs[0]->shape()[0] > 1) {
|
||||
MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
|
||||
free(opParameter);
|
||||
return nullptr;
|
||||
}
|
||||
auto *kernel = new (std::nothrow) BiasAddOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << "is nullptr.";
|
||||
free(opParameter);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto ret = kernel->Init();
|
||||
if (ret != mindspore::lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Init BiasAdd kernel failed!";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -33,13 +33,15 @@ class BiasAddOpenCLKernel : public OpenCLKernel {
|
|||
: OpenCLKernel(parameter, inputs, outputs) {}
|
||||
~BiasAddOpenCLKernel() override = default;
|
||||
|
||||
int Init() override;
|
||||
int Run() override;
|
||||
int Prepare() override;
|
||||
|
||||
int CheckSpecs() override;
|
||||
void SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int InitWeights() override;
|
||||
int Run() override;
|
||||
|
||||
private:
|
||||
cl_int4 GetGlobalshape();
|
||||
|
||||
void *BiasAdd_{nullptr};
|
||||
int in_size_{};
|
||||
int out_size_{};
|
||||
|
|
|
@ -131,8 +131,8 @@ int PowerOpenCLKernel::Run() {
|
|||
cl_float4 parameter = {power_, shift_, scale_, 1};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
|
||||
}
|
||||
|
||||
ocl_runtime_->RunKernel(kernel_, global, local);
|
||||
AlignGlobalLocal(global, local);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -132,7 +132,8 @@ int PReluOpenCLKernel::Run() {
|
|||
|
||||
std::vector<size_t> local = {4, 4, 1};
|
||||
std::vector<size_t> global = {static_cast<size_t>(H_), static_cast<size_t>(W_), static_cast<size_t>(CO_SLICES_)};
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
|
||||
AlignGlobalLocal(global, local);
|
||||
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
|
||||
if (ret != mindspore::lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
|
||||
return mindspore::lite::RET_ERROR;
|
||||
|
|
|
@ -60,6 +60,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
|
|||
local_size_ = {16, 16};
|
||||
auto image2d_info = GpuTensorInfo(out_tensors_[0]);
|
||||
global_size_ = {image2d_info.width, image2d_info.height};
|
||||
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
int ScaleOpenCLKernel::InitWeights() {
|
||||
|
@ -245,7 +246,7 @@ int ScaleOpenCLKernel::Run() {
|
|||
}
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -195,7 +195,6 @@ class OpenCLKernel : public LiteKernel {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init() override { return RET_ERROR; } // !!!To be deleted
|
||||
int Prepare() override { return RET_OK; }
|
||||
int PreProcess() override { return RET_ERROR; }
|
||||
int ReSize() override { return RET_ERROR; }
|
||||
|
@ -235,7 +234,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
tuning_params.push_back(default_tuning_param);
|
||||
std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
|
||||
size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
|
||||
size_t MIN_WORKGROUP_SIZE = 8;
|
||||
const size_t MIN_WORKGROUP_SIZE = 8;
|
||||
std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
|
||||
std::set<size_t> candidate_y = {1};
|
||||
std::set<size_t> candidate_z = {1};
|
||||
|
|
|
@ -229,16 +229,11 @@ int OpenCLSubGraph::Init() {
|
|||
return ret;
|
||||
}
|
||||
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
|
||||
ocl_runtime_->SetProfiling(true);
|
||||
ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
|
||||
return ret;
|
||||
}
|
||||
ocl_runtime_->SetProfiling(false);
|
||||
#ifdef Debug
|
||||
ocl_runtime_->SetProfiling(true);
|
||||
#endif
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -262,50 +257,6 @@ void OpenCLSubGraph::UpdateTensorDataType() {
|
|||
}
|
||||
}
|
||||
|
||||
int OpenCLSubGraph::MallocTensorWithReuse() {
|
||||
int ret;
|
||||
kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
|
||||
for (auto *kernel : nodes_) {
|
||||
MS_ASSERT(kernel);
|
||||
auto *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
|
||||
auto outputs = kernel->out_tensors();
|
||||
for (auto i = 0; i < outputs.size(); ++i) {
|
||||
auto *output = outputs.at(i);
|
||||
MS_ASSERT(output);
|
||||
if (op_kernel->GetMemType() == MemType::IMG) {
|
||||
std::vector<size_t> img_size;
|
||||
ret = op_kernel->GetImageSize(i, &img_size);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "GetImageSize failed";
|
||||
}
|
||||
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
|
||||
output->set_data(data_ptr);
|
||||
} else {
|
||||
ret = output->MallocData(allocator_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "MallocData failed";
|
||||
}
|
||||
}
|
||||
output->set_allocator(allocator_);
|
||||
}
|
||||
for (auto input_kernel : kernel->in_kernels()) {
|
||||
MS_ASSERT(input_kernel);
|
||||
ret = input_kernel->DecOutTensorRefCount();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto kernel : out_kernels_) {
|
||||
MS_ASSERT(kernel);
|
||||
ret = kernel->DecOutTensorRefCount();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void OpenCLSubGraph::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
|
||||
const std::vector<kernel::LiteKernel *> &in_kernels,
|
||||
std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from) {
|
||||
|
@ -379,8 +330,6 @@ void OpenCLSubGraph::UnInit() {
|
|||
delete this->executor_;
|
||||
}
|
||||
|
||||
int OpenCLSubGraph::InferShape() { return RET_OK; }
|
||||
|
||||
int OpenCLSubGraph::ReSize() { return RET_OK; }
|
||||
|
||||
int OpenCLSubGraph::Run() {
|
||||
|
@ -395,7 +344,6 @@ int OpenCLSubGraph::Run() {
|
|||
MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
|
||||
return RET_ERROR;
|
||||
}
|
||||
allocator_->UnmapBuffer(tensor->data_c());
|
||||
ret = allocator_->UnmapBuffer(tensor->data_c());
|
||||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
|
|
|
@ -43,7 +43,6 @@ class OpenCLSubGraph : public SubGraphKernel {
|
|||
int PostProcess() override { return mindspore::lite::RET_OK; }
|
||||
int Prepare() override;
|
||||
int Init() override;
|
||||
int InferShape();
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
|
||||
|
@ -51,7 +50,6 @@ class OpenCLSubGraph : public SubGraphKernel {
|
|||
private:
|
||||
void UnInit();
|
||||
void UpdateTensorDataType();
|
||||
int MallocTensorWithReuse();
|
||||
void ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
|
||||
const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
|
||||
lite::opencl::MemType mem_type);
|
||||
|
@ -66,6 +64,9 @@ class OpenCLSubGraph : public SubGraphKernel {
|
|||
void GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
|
||||
const std::vector<kernel::LiteKernel *> &in_kernels,
|
||||
std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
|
||||
void Fusion();
|
||||
|
||||
private:
|
||||
lite::opencl::OpenCLAllocator *allocator_{nullptr};
|
||||
std::vector<lite::Tensor *> in_convert_tensors_;
|
||||
std::vector<lite::Tensor *> out_convert_tensors_;
|
||||
|
@ -78,9 +79,6 @@ class OpenCLSubGraph : public SubGraphKernel {
|
|||
std::set<LiteKernel *> nodes_set_;
|
||||
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
|
||||
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
|
||||
|
||||
private:
|
||||
void Fusion();
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -30,7 +30,9 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
|
|||
int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
|
||||
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
|
||||
const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
|
||||
int ret;
|
||||
int ret{RET_OK};
|
||||
ocl_runtime.GetInstance()->SetProfiling(is_tune);
|
||||
|
||||
kernel::LiteKernelUtil::InitTensorRefCount(kernels);
|
||||
for (auto *kernel : kernels) {
|
||||
MS_ASSERT(kernel);
|
||||
|
@ -77,11 +79,6 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
|
|||
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
|
||||
return ret;
|
||||
}
|
||||
#ifdef Debug
|
||||
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
|
||||
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
|
||||
|
||||
#endif
|
||||
}
|
||||
if (after != nullptr) {
|
||||
if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
|
||||
|
@ -95,7 +92,16 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
|
|||
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
|
||||
}
|
||||
}
|
||||
#ifdef Debug
|
||||
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
|
||||
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
|
||||
#endif
|
||||
}
|
||||
return RET_OK;
|
||||
#ifdef Debug
|
||||
ocl_runtime.GetInstance()->SetProfiling(true);
|
||||
#else
|
||||
ocl_runtime.GetInstance()->SetProfiling(false);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
} // namespace mindspore::lite::opencl
|
||||
|
|
|
@ -406,76 +406,6 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
|
||||
int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global,
|
||||
const std::vector<size_t> &local, cl::CommandQueue *command_queue) {
|
||||
if (command_queue == nullptr) {
|
||||
command_queue = default_command_queue_;
|
||||
}
|
||||
MS_ASSERT(local.size() == 0 || local.size() == global.size());
|
||||
std::vector<size_t> internal_global_ws = global;
|
||||
for (size_t i = 0; i < local.size(); ++i) {
|
||||
internal_global_ws[i] = ROUND_UP(global[i], local[i]);
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
|
||||
for (size_t i = 0; i < global.size(); i++) {
|
||||
MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
|
||||
}
|
||||
for (size_t i = 0; i < local.size(); i++) {
|
||||
MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
|
||||
}
|
||||
|
||||
cl::NDRange global_range = cl::NullRange;
|
||||
cl::NDRange local_range = cl::NullRange;
|
||||
if (global.size() == 1) {
|
||||
global_range = cl::NDRange(internal_global_ws[0]);
|
||||
if (!local.empty()) {
|
||||
local_range = cl::NDRange(local[0]);
|
||||
}
|
||||
} else if (global.size() == 2) {
|
||||
global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
|
||||
if (!local.empty()) {
|
||||
local_range = cl::NDRange(local[0], local[1]);
|
||||
}
|
||||
} else if (global.size() == 3) {
|
||||
global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
|
||||
if (!local.empty()) {
|
||||
local_range = cl::NDRange(local[0], local[1], local[2]);
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Not supported NDRange!";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
cl::Event event;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event);
|
||||
if (ret != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
|
||||
return RET_ERROR;
|
||||
}
|
||||
static int cnt = 0;
|
||||
const int flush_period = 10;
|
||||
if (cnt % flush_period == 0) {
|
||||
auto flush_ret = command_queue->flush();
|
||||
if (flush_ret != CL_SUCCESS) {
|
||||
MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
|
||||
}
|
||||
}
|
||||
cnt++;
|
||||
MS_LOG(DEBUG) << "RunKernel success!";
|
||||
#ifdef Debug
|
||||
event.wait();
|
||||
cl_ulong time_start;
|
||||
cl_ulong time_end;
|
||||
event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
|
||||
event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
|
||||
double nanoSeconds = time_end - time_start;
|
||||
MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms";
|
||||
#endif
|
||||
return RET_OK;
|
||||
}
|
||||
// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
|
||||
int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
|
||||
cl::CommandQueue *command_queue, cl::Event *event) {
|
||||
|
|
|
@ -115,8 +115,6 @@ class OpenCLRuntime {
|
|||
bool LoadSource(const std::string &program_name, const std::string &source);
|
||||
int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
|
||||
const std::set<std::string> &build_options = {});
|
||||
int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
|
||||
cl::CommandQueue *command_queue = nullptr); // !!!To be deleted
|
||||
int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
|
||||
cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
|
||||
bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
|
||||
|
|
Loading…
Reference in New Issue