From 53d0e9c22a67a3ba3cbd3e0da575e8e32941b23f Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Wed, 28 Oct 2020 10:52:11 +0800 Subject: [PATCH] optimize arithmetic and pooling --- .../runtime/kernel/opencl/cl/avg_pool2d.cl | 68 +----------------- .../runtime/kernel/opencl/cl/max_pool2d.cl | 71 ++----------------- .../kernel/opencl/kernel/arithmetic.cc | 71 ++++++++++--------- .../runtime/kernel/opencl/kernel/arithmetic.h | 1 + .../runtime/kernel/opencl/kernel/pooling2d.cc | 17 ++--- .../runtime/kernel/opencl/kernel/pooling2d.h | 4 +- 6 files changed, 56 insertions(+), 176 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl index 0df90fe08d8..30bde691805 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl @@ -2,46 +2,14 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif #define divide_no_check(a, b) (a / b) -__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { - // axis to dst tensor coordinate - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - - // boundary check - if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { - return; - } - - FLT4 r = (FLT4)(0.0f); - FLT window_size = 0.0f; - int xs = X * stride.x - padding.x; - int ys = Y * stride.y - padding.y; - - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - bool outside_x = x_c < 0 || x_c >= input_shape.x; - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; - r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f); - window_size += !outside ? 1.0f : 0.0f; - } - } - FLT4 result = TO_FLT4(r / window_size); - output[(output_shape.y * X + Y) * output_shape.w + Z] = result; -} - __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate - int X = get_global_id(0); + int X = get_global_id(2); int Y = get_global_id(1); - int Z = get_global_id(2); + int Z = get_global_id(0); // boundary check if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { @@ -66,35 +34,3 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i FLT4 result = TO_FLT4(divide_no_check(r, window_size)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); } - -__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, const int4 output_shape, const int2 stride, - const int2 kernel_size, const int2 padding) { - // axis to dst tensor coordinate - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - - // boundary check - if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { - return; - } - - FLT4 r = (FLT4)(0.0f); - FLT window_size = 0.0f; - int xs = X * stride.x - padding.x; - int ys = Y * stride.y - padding.y; - - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - bool outside_y = y_c < 0 || y_c >= input_shape.y; - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; - r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c, Z * input_shape.x + x_c)) : (FLT4)(0.0f); - window_size += !outside ? 1.0f : 0.0f; - } - } - FLT4 result = TO_FLT4(divide_no_check(r, window_size)); - WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl index 596f6f4367a..e5f7b54fa23 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl @@ -1,48 +1,14 @@ #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { - // axis to dst tensor coordinate - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - - // boundary check - if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { - return; - } - - FLT4 maximum = (FLT4)(-10000.0f); - int xs = X * stride.x - padding.x; - int ys = Y * stride.y - padding.y; - - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - if (x_c < 0 || x_c >= input_shape.x) { - continue; - } - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - if (y_c < 0 || y_c >= input_shape.y) { - continue; - } - FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; - maximum = max(src, maximum); - } - } - output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; -} - __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; - __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate - int X = get_global_id(0); + int X = get_global_id(2); int Y = get_global_id(1); - int Z = get_global_id(2); + int Z = get_global_id(0); // boundary check if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { @@ -69,9 +35,9 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate - int X = get_global_id(0); + int X = get_global_id(2); int Y = get_global_id(1); - int Z = get_global_id(2); + int Z = get_global_id(0); // boundary check if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { @@ -93,32 +59,3 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o } WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); } - -__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, const int4 output_shape, const int2 stride, - const int2 kernel_size, const int2 padding) { - // axis to dst tensor coordinate - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - - // boundary check - if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { - return; - } - - FLT4 maximum = (FLT4)(-10000.0f); - int xs = X * stride.x - padding.x; - int ys = Y * stride.y - padding.y; - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - if (y_c < 0 || y_c >= input_shape.y) continue; - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - if (x_c < 0 || x_c >= input_shape.x) continue; - FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c)); - maximum = max(src, maximum); - } - } - WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index a1c82b8c68c..659b6bb4d2b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -52,16 +52,23 @@ std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { } void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { - local_size_ = {16, 16}; - auto out_shape = out_tensors_[0]->shape(); - if (out_shape.size() == 2) { - size_t H = out_shape[0]; - size_t W = UP_DIV(out_shape[1], C4NUM); - global_size_ = {W, H}; + if (element_flag_) { + local_size_ = {16, 16}; + auto out_shape = out_tensors_[0]->shape(); + if (out_shape.size() == 2) { + size_t H = out_shape[0]; + size_t W = UP_DIV(out_shape[1], C4NUM); + global_size_ = {W, H}; + } else { + size_t H = out_shape[0] * out_shape[1]; + size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); + global_size_ = {W, H}; + } } else { - size_t H = out_shape[0] * out_shape[1]; - size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); - global_size_ = {W, H}; + local_size_ = {}; + auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); + global_size_ = {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), + static_cast(out_shape[1] * out_shape[0])}; } } @@ -129,6 +136,27 @@ int ArithmeticOpenCLKernel::InitBuffer() { return RET_OK; } +int ArithmeticOpenCLKernel::SetArgs() { + int arg_idx = 3; + if (!element_flag_) { + cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], + UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); + cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], + UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); + auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); + cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + } else { + cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + } + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); + return RET_OK; +} + int ArithmeticOpenCLKernel::Init() { std::string kernel_name; auto *arithmetic_parameter = reinterpret_cast(op_parameter_); @@ -237,6 +265,7 @@ int ArithmeticOpenCLKernel::Init() { Image2dGetWorkGroupSize(); InitBuffer(); + SetArgs(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -250,29 +279,7 @@ int ArithmeticOpenCLKernel::Run() { auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - if (!element_flag_) { - cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], - UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); - cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], - UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); - auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); - cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); - ocl_runtime_->RunKernel(kernel_, - {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), - static_cast(out_shape[1] * out_shape[0])}, - {}, nullptr); - } else { - cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); - } + ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 29142f0efbf..c41b92afe0c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -33,6 +33,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; int InitBuffer() override; + int SetArgs(); private: std::vector InitGlobalSize() const; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index cc071ef0866..254b4c6df78 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -83,17 +83,20 @@ int PoolingOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif + InitGlobalSize(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return mindspore::lite::RET_OK; } -std::vector PoolingOpenCLKernel::InitGlobalSize() const { +void PoolingOpenCLKernel::InitGlobalSize() { const size_t global_x = out_tensors_[0]->shape()[1]; const size_t global_y = out_tensors_[0]->shape()[2]; const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); - std::vector global = {global_x, global_y, global_z}; - return global; + global_size_ = {global_z, global_y, global_x}; + int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); + local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); + global_size_ = GetCommonGlobalSize(local_size_, global_size_); } int PoolingOpenCLKernel::Run() { @@ -116,13 +119,7 @@ int PoolingOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); - std::vector local_size; - std::vector global_size = InitGlobalSize(); - int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); - local_size = GetCommonLocalSize(global_size, max_work_group_size); - global_size = GetCommonGlobalSize(local_size, global_size); - - ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr); + ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h index dbc735a4024..17b73312971 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h @@ -35,10 +35,12 @@ class PoolingOpenCLKernel : public OpenCLKernel { int Run() override; private: - std::vector InitGlobalSize() const; + void InitGlobalSize(); PoolingParameter *parameter_; cl::Kernel kernel_; bool enable_fp16_{false}; + std::vector local_size_; + std::vector global_size_; }; } // namespace mindspore::kernel