forked from mindspore-Ecosystem/mindspore
!21537 support global average pooling
Merge pull request !21537 from yeyunpeng2020/gpu
This commit is contained in:
commit
91bc3c0926
|
@ -1,6 +1,7 @@
|
|||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
#define LOCAL_CACHE_THREAD 16
|
||||
#define divide_no_check(a, b) (a / b)
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
|
@ -34,41 +35,11 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
|
|||
}
|
||||
}
|
||||
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
|
||||
}
|
||||
|
||||
// Windowed average pooling over NHWC4 image data, one work-item per output pixel.
//   get_global_id(0) -> C4 slice, get_global_id(1) -> W, get_global_id(2) -> fused N*H.
// Out-of-bounds taps are excluded from both the running sum and the divisor, so
// border windows average only the valid pixels. When built with -DRELU the result
// is clamped at zero before the store.
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                          const int4 input_shape, const int4 output_shape, const int2 stride,
                                          const int2 kernel_size, const int2 padding) {
  // Decompose destination coordinates from the global ids.
  int X = get_global_id(2);  // fused N*H axis
  int Y = get_global_id(1);  // W
  int Z = get_global_id(0);  // C4
  int N = X / output_shape.y;
  X = X % output_shape.y;
  // Discard work-items launched past the output extent.
  if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }

  FLT4 acc = (FLT4)(0.0f);
  FLT valid_count = 0.0f;  // number of in-bounds taps in this window
  const int x_begin = X * stride.x - padding.x;
  const int y_begin = Y * stride.y - padding.y;

  for (int ky = 0; ky < kernel_size.y; ++ky) {
    const int y_c = y_begin + ky;
    if (y_c < 0 || y_c >= input_shape.z) {
      continue;  // entire row of the window lies outside the input
    }
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      const int x_c = x_begin + kx;
      if (x_c < 0 || x_c >= input_shape.y) {
        continue;
      }
      acc += READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
      valid_count += 1.0f;
    }
  }
  FLT4 result = TO_FLT4(divide_no_check(acc, valid_count));
#ifdef RELU
  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f)));
#else
  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
#endif
}
|
||||
|
||||
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
|
@ -98,35 +69,41 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
|
|||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
#ifdef RELU
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
|
||||
#else
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum);
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
|
||||
const int4 input_shape, const int4 output_shape, const int2 stride,
|
||||
const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(0); // C4
|
||||
int N = X / output_shape.y;
|
||||
X = X % output_shape.y;
|
||||
// boundary check
|
||||
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.z) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.y) continue;
|
||||
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
|
||||
maximum = max(src, maximum);
|
||||
// Global average pooling: reduce the whole H*W plane of one C4 slice to a single
// FLT4 pixel, using a LOCAL_CACHE_THREAD x LOCAL_CACHE_THREAD local-memory cache.
// Expected launch (per host SetGlobalLocal): local = (1, 16, 16), so every
// work-item in a group shares the same X; the early return below is therefore
// group-uniform and the barriers that follow it are safe.
// size = (H, W, C4, C) -- matches the host-side SetLocalConstArgs.
// Accumulation is done in float (not FLT) to avoid fp16 precision loss on
// large planes.
__kernel void AvgPooling2d_global_NHWC4_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data,
                                            int4 size) {
  int X = get_global_id(0);  // C4 slice index
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.z) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  // Each work-item sums a strided (h, w) subset of the plane into its own cell.
  temp[localy][localz] = (float4)0.f;
  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // First reduction: fold each row of the cache into column 0.
  if (localz == 0) {
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      temp[localy][0] += temp[localy][i];
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Second reduction + store by a single work-item. Previously every work-item
  // recomputed the same sum and issued a redundant duplicate store to (X, 0);
  // a stray line from an unrelated kernel had also been spliced in here.
  if (localy == 0 && localz == 0) {
    float4 result = temp[0][0];
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      result += temp[i][0];
    }
    result /= size.x * size.y;
#ifdef RELU
    WRITE_IMAGE(dst_data, (int2)(X, 0), max(TO_FLT4(result), (FLT4)(0.f)));
#else
    WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
#endif
  }
}
|
||||
|
|
|
@ -53,18 +53,25 @@ int PoolingOpenCLKernel::CheckSpecs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Prepare() {
|
||||
int PoolingOpenCLKernel::BuildKernel() {
|
||||
std::string kernel_name;
|
||||
if (parameter_->pool_mode_ == PoolMode_MaxPool) {
|
||||
kernel_name = "MaxPooling2d";
|
||||
} else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
|
||||
kernel_name = "AvgPooling2d";
|
||||
}
|
||||
|
||||
if (parameter_->global_ &&
|
||||
(parameter_->window_h_ >= LOCAL_CACHE_THREAD || parameter_->window_w_ >= LOCAL_CACHE_THREAD)) {
|
||||
kernel_name += "_global";
|
||||
is_use_local_ = true;
|
||||
}
|
||||
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
|
||||
switch (parameter_->act_type_) {
|
||||
case ActType_No:
|
||||
break;
|
||||
case ActType_Relu:
|
||||
kernel_name += "_ReLU";
|
||||
build_options_ext.emplace_back("-DRELU");
|
||||
break;
|
||||
default:
|
||||
MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
|
||||
|
@ -78,32 +85,44 @@ int PoolingOpenCLKernel::Prepare() {
|
|||
MS_LOG(ERROR) << "Load source failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
|
||||
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Build kernel failed.";
|
||||
return ret;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Prepare() {
|
||||
input_tensor_ = GpuTensorInfo(in_tensors_[0]);
|
||||
if (BuildKernel() != RET_OK) {
|
||||
MS_LOG(ERROR) << "BuildKernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (SetConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "SeConstArgs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
SetGlobalLocal();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void PoolingOpenCLKernel::SetGlobalLocal() {
|
||||
const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
|
||||
const size_t global_y = out_tensors_[0]->shape()[2];
|
||||
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
global_size_ = {global_z, global_y, global_x};
|
||||
local_size_ = {};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
if (is_use_local_) {
|
||||
local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
|
||||
global_size_ = {static_cast<size_t>(input_tensor_.Slice), 1, 1};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
} else {
|
||||
const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
|
||||
const size_t global_y = out_tensors_[0]->shape()[2];
|
||||
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
global_size_ = {global_z, global_y, global_x};
|
||||
local_size_ = {};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::SetConstArgs() {
|
||||
int PoolingOpenCLKernel::SetGlobalConstArgs() {
|
||||
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
|
||||
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
|
||||
|
@ -135,6 +154,35 @@ int PoolingOpenCLKernel::SetConstArgs() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::SetLocalConstArgs() {
|
||||
int h = input_tensor_.H;
|
||||
int w = input_tensor_.W;
|
||||
int c = input_tensor_.C;
|
||||
int c4 = UP_DIV(c, C4NUM);
|
||||
cl_int4 size = {h, w, c4, c};
|
||||
int arg_idx = 2;
|
||||
if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "SetKernelArg failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::SetConstArgs() {
|
||||
if (is_use_local_) {
|
||||
return SetLocalConstArgs();
|
||||
} else {
|
||||
return SetGlobalConstArgs();
|
||||
}
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Tune() {
|
||||
if (is_use_local_) {
|
||||
return RET_OK;
|
||||
}
|
||||
return OpenCLKernel::Tune();
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
|
|
|
@ -34,9 +34,18 @@ class PoolingOpenCLKernel : public OpenCLKernel {
|
|||
int CheckSpecs() override;
|
||||
int SetConstArgs() override;
|
||||
void SetGlobalLocal() override;
|
||||
int Tune() override;
|
||||
|
||||
private:
|
||||
int BuildKernel();
|
||||
int SetGlobalConstArgs();
|
||||
int SetLocalConstArgs();
|
||||
|
||||
private:
|
||||
PoolingParameter *parameter_;
|
||||
bool is_use_local_ = false;
|
||||
static const size_t LOCAL_CACHE_THREAD{16};
|
||||
GpuTensorInfo input_tensor_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
|
Loading…
Reference in New Issue