!21537 support global average pooling

Merge pull request !21537 from yeyunpeng2020/gpu
This commit is contained in:
i-robot 2021-08-09 08:28:39 +00:00 committed by Gitee
commit 91bc3c0926
3 changed files with 106 additions and 72 deletions

View File

@ -1,6 +1,7 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define LOCAL_CACHE_THREAD 16
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -34,41 +35,11 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
}
// Average pooling over an NHWC4 image2d tensor, with optional ReLU on the result.
// Layout convention (matches the READ/WRITE coordinates below):
//   image x = w * C4 + c4-slice, image y = n * H + h.
// input_shape/output_shape are (N, H, W, C4); stride/kernel_size/padding are (x=H-dir, y=W-dir).
// NOTE(review): this kernel carries both a `_ReLU` name and an `#ifdef RELU` guard —
// in this merge the separate ReLU variants were being folded into the base kernels
// via -DRELU; confirm which mechanism the host code actually selects.
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
// Split the fused N*H global id back into batch index and output row.
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
// Running sum of the window and the count of in-bounds taps; dividing by the
// actual count (not kernel_size.x * kernel_size.y) excludes padding from the mean.
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
// Top-left corner of the pooling window in input coordinates (may be negative).
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.z;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.y;
// Branchless accumulate: out-of-bounds taps contribute 0 and are not counted.
r +=
!outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
#ifdef RELU
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f)));
#else
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
#endif
}
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -98,35 +69,41 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
maximum = max(src, maximum);
}
}
#ifdef RELU
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
#else
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum);
#endif
}
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.z) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.y) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
maximum = max(src, maximum);
// Global average pooling: reduces the entire H*W plane of one C4 slice to a
// single pixel at (X, 0) of dst_data.
// size = (H, W, C4, C) as set by the host (SetLocalConstArgs).
// Expected launch shape (set by the host SetGlobalLocal): local = (1, 16, 16),
// global x = number of C4 slices — so every 16x16 work-group shares a single X.
// NOTE(review): the early return below happens before the barriers; this is only
// barrier-safe because local x-size is 1, making `X >= size.z` uniform across
// the whole work-group — confirm the host never launches with local x > 1.
__kernel void AvgPooling2d_global_NHWC4_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data,
int4 size) {
int X = get_global_id(0); // C4
int localy = get_local_id(1);
int localz = get_local_id(2);
if (X >= size.z) return;
__local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
// Phase 1: each of the 16x16 threads strides over the H*W plane and keeps a
// private partial sum in its own local-memory cell (float accumulation even
// when FLT is half, to limit precision loss).
temp[localy][localz] = (float4)0.f;
for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 2: one thread per row collapses its row into column 0.
if (localz == 0) {
for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
temp[localy][0] += temp[localy][i];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: every thread redundantly folds column 0 and writes the same mean
// to the same pixel (benign duplicate writes of identical values).
float4 result = temp[0][0];
for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
result += temp[i][0];
}
result /= size.x * size.y;
#ifdef RELU
WRITE_IMAGE(dst_data, (int2)(X, 0), max(TO_FLT4(result), (FLT4)(0.f)));
#else
WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
#endif
}

View File

@ -53,18 +53,25 @@ int PoolingOpenCLKernel::CheckSpecs() {
return RET_OK;
}
int PoolingOpenCLKernel::Prepare() {
int PoolingOpenCLKernel::BuildKernel() {
std::string kernel_name;
if (parameter_->pool_mode_ == PoolMode_MaxPool) {
kernel_name = "MaxPooling2d";
} else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
kernel_name = "AvgPooling2d";
}
if (parameter_->global_ &&
(parameter_->window_h_ >= LOCAL_CACHE_THREAD || parameter_->window_w_ >= LOCAL_CACHE_THREAD)) {
kernel_name += "_global";
is_use_local_ = true;
}
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
switch (parameter_->act_type_) {
case ActType_No:
break;
case ActType_Relu:
kernel_name += "_ReLU";
build_options_ext.emplace_back("-DRELU");
break;
default:
MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
@ -78,32 +85,44 @@ int PoolingOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
}
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
return RET_OK;
}
int PoolingOpenCLKernel::Prepare() {
input_tensor_ = GpuTensorInfo(in_tensors_[0]);
if (BuildKernel() != RET_OK) {
MS_LOG(ERROR) << "BuildKernel failed.";
return RET_ERROR;
}
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Computes global_size_/local_size_ for the kernel launch.
// Fix: the merged body ran the tiled-shape computation unconditionally *before*
// the is_use_local_ branch and then recomputed it in the else arm — the
// duplicated prefix (dead work, clobbered immediately) is removed.
void PoolingOpenCLKernel::SetGlobalLocal() {
  if (is_use_local_) {
    // Global-pooling kernel: one 16x16 work-group per C4 slice; the work-group
    // cooperatively reduces the whole H*W plane in local memory.
    local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
    global_size_ = {static_cast<size_t>(input_tensor_.Slice), 1, 1};
    AlignGlobalLocal(global_size_, local_size_);
  } else {
    // Tiled kernel: one work-item per output element, laid out (C4, W, N*H).
    const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
    const size_t global_y = out_tensors_[0]->shape()[2];
    const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
    global_size_ = {global_z, global_y, global_x};
    local_size_ = {};  // let the runtime pick the local size
    AlignGlobalLocal(global_size_, local_size_);
  }
}
int PoolingOpenCLKernel::SetConstArgs() {
int PoolingOpenCLKernel::SetGlobalConstArgs() {
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
@ -135,6 +154,35 @@ int PoolingOpenCLKernel::SetConstArgs() {
return RET_OK;
}
// Binds the constant argument of the global-pooling ("_global") kernel:
// a cl_int4 (H, W, C4, C) describing the input plane being reduced.
// Arg index 2 is fixed — indices 0 and 1 (the image objects) are bound in Run().
int PoolingOpenCLKernel::SetLocalConstArgs() {
  const int height = input_tensor_.H;
  const int width = input_tensor_.W;
  const int channels = input_tensor_.C;
  const int slices = UP_DIV(channels, C4NUM);
  cl_int4 size = {height, width, slices, channels};
  const int kSizeArgIndex = 2;
  if (ocl_runtime_->SetKernelArg(kernel_, kSizeArgIndex, size) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
// Dispatches constant-argument binding to the variant-specific helper:
// the local-memory global-pooling kernel vs. the standard tiled kernel.
int PoolingOpenCLKernel::SetConstArgs() {
  return is_use_local_ ? SetLocalConstArgs() : SetGlobalConstArgs();
}
// Work-group tuning entry point. The global-pooling variant has a fixed
// (1, 16, 16) work-group shape required by its local-memory reduction, so
// tuning is skipped for it; otherwise defer to the generic tuner.
int PoolingOpenCLKernel::Tune() {
  return is_use_local_ ? RET_OK : OpenCLKernel::Tune();
}
int PoolingOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;

View File

@ -34,9 +34,18 @@ class PoolingOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override;
private:
int BuildKernel();
int SetGlobalConstArgs();
int SetLocalConstArgs();
private:
PoolingParameter *parameter_;
bool is_use_local_ = false;
static const size_t LOCAL_CACHE_THREAD{16};
GpuTensorInfo input_tensor_;
};
} // namespace mindspore::kernel