From fa3ca21910ed055ae2c32ebc014c3635708eee87 Mon Sep 17 00:00:00 2001
From: yeyunpeng2020
Date: Mon, 9 Aug 2021 14:35:31 +0800
Subject: [PATCH] support global average pooling

---
 .../src/runtime/kernel/opencl/cl/pooling2d.cl | 97 +++++++------------
 .../runtime/kernel/opencl/kernel/pooling2d.cc | 72 +++++++++++---
 .../runtime/kernel/opencl/kernel/pooling2d.h  |  9 ++
 3 files changed, 106 insertions(+), 72 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl
index 130e296409f..bbc8a9852f6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl
@@ -1,6 +1,7 @@
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
+#define LOCAL_CACHE_THREAD 16
 #define divide_no_check(a, b) (a / b)
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@@ -34,41 +35,11 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
     }
   }
   FLT4 result = TO_FLT4(divide_no_check(r, window_size));
-  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
-}
-
-__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
-                                          const int4 input_shape, const int4 output_shape, const int2 stride,
-                                          const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(2);  // N*H
-  int Y = get_global_id(1);  // W
-  int Z = get_global_id(0);  // C4
-  int N = X / output_shape.y;
-  X = X % output_shape.y;
-  // boundary check
-  if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 r = (FLT4)(0.0f);
-  FLT window_size = 0.0f;
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    bool outside_y = y_c < 0 || y_c >= input_shape.z;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      bool outside = outside_y || x_c < 0 || x_c >= input_shape.y;
-      r +=
-        !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f);
-      window_size += !outside ? 1.0f : 0.0f;
-    }
-  }
-  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
+#ifdef RELU
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f)));
+#else
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
+#endif
 }
 
 __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@@ -98,35 +69,41 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
       maximum = max(src, maximum);
     }
   }
+#ifdef RELU
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
+#else
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum);
+#endif
 }
 
-__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
-                                          const int4 input_shape, const int4 output_shape, const int2 stride,
-                                          const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(2);  // N*H
-  int Y = get_global_id(1);  // W
-  int Z = get_global_id(0);  // C4
-  int N = X / output_shape.y;
-  X = X % output_shape.y;
-  // boundary check
-  if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 maximum = (FLT4)(-10000.0f);
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    if (y_c < 0 || y_c >= input_shape.z) continue;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      if (x_c < 0 || x_c >= input_shape.y) continue;
-      FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
-      maximum = max(src, maximum);
+__kernel void AvgPooling2d_global_NHWC4_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data,
+                                            int4 size) {
+  int X = get_global_id(0);  // C4
+  int localy = get_local_id(1);
+  int localz = get_local_id(2);
+  if (X >= size.z) return;
+  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
+  temp[localy][localz] = (float4)0.f;
+  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
+    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
+      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
     }
   }
-  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (localz == 0) {
+    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+      temp[localy][0] += temp[localy][i];
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  float4 result = temp[0][0];
+  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+    result += temp[i][0];
+  }
+  result /= size.x * size.y;
+#ifdef RELU
+  WRITE_IMAGE(dst_data, (int2)(X, 0), max(TO_FLT4(result), (FLT4)(0.f)));
+#else
+  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
+#endif
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index 01a90630d5d..9f1fd5c8763 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -53,18 +53,25 @@ int PoolingOpenCLKernel::CheckSpecs() {
   return RET_OK;
 }
 
-int PoolingOpenCLKernel::Prepare() {
+int PoolingOpenCLKernel::BuildKernel() {
   std::string kernel_name;
   if (parameter_->pool_mode_ == PoolMode_MaxPool) {
     kernel_name = "MaxPooling2d";
   } else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
     kernel_name = "AvgPooling2d";
   }
+
+  if (parameter_->global_ &&
+      (parameter_->window_h_ >= LOCAL_CACHE_THREAD || parameter_->window_w_ >= LOCAL_CACHE_THREAD)) {
+    kernel_name += "_global";
+    is_use_local_ = true;
+  }
+
+  auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
   switch (parameter_->act_type_) {
     case ActType_No:
       break;
     case ActType_Relu:
-      kernel_name += "_ReLU";
+      build_options_ext.emplace_back("-DRELU");
       break;
     default:
       MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
@@ -78,32 +85,44 @@
     MS_LOG(ERROR) << "Load source failed.";
     return RET_ERROR;
   }
-  auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
   auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Build kernel failed.";
     return ret;
   }
+  return RET_OK;
+}
+
+int PoolingOpenCLKernel::Prepare() {
+  input_tensor_ = GpuTensorInfo(in_tensors_[0]);
+  if (BuildKernel() != RET_OK) {
+    MS_LOG(ERROR) << "BuildKernel failed.";
+    return RET_ERROR;
+  }
   if (SetConstArgs() != RET_OK) {
     MS_LOG(ERROR) << "SeConstArgs failed.";
     return RET_ERROR;
   }
   SetGlobalLocal();
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
-
   return RET_OK;
 }
 
 void PoolingOpenCLKernel::SetGlobalLocal() {
-  const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
-  const size_t global_y = out_tensors_[0]->shape()[2];
-  const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
-  global_size_ = {global_z, global_y, global_x};
-  local_size_ = {};
-  AlignGlobalLocal(global_size_, local_size_);
+  if (is_use_local_) {
+    local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
+    global_size_ = {static_cast<size_t>(input_tensor_.Slice), 1, 1};
+    AlignGlobalLocal(global_size_, local_size_);
+  } else {
+    const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
+    const size_t global_y = out_tensors_[0]->shape()[2];
+    const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
+    global_size_ = {global_z, global_y, global_x};
+    local_size_ = {};
+    AlignGlobalLocal(global_size_, local_size_);
+  }
 }
 
-int PoolingOpenCLKernel::SetConstArgs() {
+int PoolingOpenCLKernel::SetGlobalConstArgs() {
   int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
   cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
   cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
@@ -135,6 +154,35 @@
   return RET_OK;
 }
 
+int PoolingOpenCLKernel::SetLocalConstArgs() {
+  int h = input_tensor_.H;
+  int w = input_tensor_.W;
+  int c = input_tensor_.C;
+  int c4 = UP_DIV(c, C4NUM);
+  cl_int4 size = {h, w, c4, c};
+  int arg_idx = 2;
+  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "SetKernelArg failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int PoolingOpenCLKernel::SetConstArgs() {
+  if (is_use_local_) {
+    return SetLocalConstArgs();
+  } else {
+    return SetGlobalConstArgs();
+  }
+}
+
+int PoolingOpenCLKernel::Tune() {
+  if (is_use_local_) {
+    return RET_OK;
+  }
+  return OpenCLKernel::Tune();
+}
+
 int PoolingOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   int arg_idx = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
index 68a67f7fa56..1bc0cb86440 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -34,9 +34,18 @@ class PoolingOpenCLKernel : public OpenCLKernel {
   int CheckSpecs() override;
   int SetConstArgs() override;
   void SetGlobalLocal() override;
+  int Tune() override;
+
+ private:
+  int BuildKernel();
+  int SetGlobalConstArgs();
+  int SetLocalConstArgs();
 
  private:
   PoolingParameter *parameter_;
+  bool is_use_local_ = false;
+  static const size_t LOCAL_CACHE_THREAD{16};
+  GpuTensorInfo input_tensor_;
 };
 }  // namespace mindspore::kernel
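
For reference, the new AvgPooling2d_global_NHWC4_IMG kernel averages each channel slice over the whole H x W plane (size.x = H, size.y = W, size.z = C4) and divides by H * W, with ReLU applied only when the kernel is built with -DRELU. The sketch below is a hypothetical host-side C++ reference, not part of this patch; the function name GlobalAvgPoolNHWC and the plain NHWC float buffer layout are assumptions for illustration, e.g. for checking the kernel's output in a unit test.

// Hypothetical host-side reference (illustration only, not part of the patch):
// global average pooling over an NHWC float buffer with optional ReLU,
// matching what AvgPooling2d_global_NHWC4_IMG computes per C4 slice.
#include <algorithm>
#include <vector>

std::vector<float> GlobalAvgPoolNHWC(const std::vector<float> &input, int n, int h, int w, int c, bool relu) {
  std::vector<float> output(static_cast<size_t>(n) * c, 0.0f);
  for (int b = 0; b < n; ++b) {
    // accumulate the sum over every spatial position, per channel
    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; ++x) {
        for (int ch = 0; ch < c; ++ch) {
          output[b * c + ch] += input[((b * h + y) * w + x) * c + ch];
        }
      }
    }
    // divide by H * W (the kernel does result /= size.x * size.y), then optional ReLU
    for (int ch = 0; ch < c; ++ch) {
      float avg = output[b * c + ch] / (static_cast<float>(h) * static_cast<float>(w));
      output[b * c + ch] = relu ? std::max(avg, 0.0f) : avg;
    }
  }
  return output;
}

On the device, the kernel reaches the same sum in two stages: each work-item of a 16 x 16 (LOCAL_CACHE_THREAD) work-group strides over the rows and columns it owns and accumulates into __local memory, and the partial sums are then folded along each dimension between barriers before the averaged value is written for the C4 slice.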