!21537 support global average pooling

Merge pull request !21537 from yeyunpeng2020/gpu
This commit is contained in:
i-robot 2021-08-09 08:28:39 +00:00 committed by Gitee
commit 91bc3c0926
3 changed files with 106 additions and 72 deletions

View File

@ -1,6 +1,7 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define LOCAL_CACHE_THREAD 16
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -34,41 +35,11 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
}
// Average pooling over an NHWC4 image2d tensor, with optional ReLU on the result.
// Layout convention (matches the READ/WRITE coordinates below):
//   image x = w * C4 + c4-slice, image y = n * H + h.
// input_shape/output_shape are (N, H, W, C4); stride/kernel_size/padding are (x=H-dir, y=W-dir).
// NOTE(review): this kernel carries both a `_ReLU` name and an `#ifdef RELU` guard —
// in this merge the separate ReLU variants were being folded into the base kernels
// via -DRELU; confirm which mechanism the host code actually selects.
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
// Split the fused N*H global id back into batch index and output row.
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
// Running sum of the window and the count of in-bounds taps; dividing by the
// actual count (not kernel_size.x * kernel_size.y) excludes padding from the mean.
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
// Top-left corner of the pooling window in input coordinates (may be negative).
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.z;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.y;
// Branchless accumulate: out-of-bounds taps contribute 0 and are not counted.
r +=
!outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
#ifdef RELU
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f)));
#else
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
#endif
}
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -98,35 +69,41 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
maximum = max(src, maximum);
}
}
#ifdef RELU
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
#else
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum);
#endif
}
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.z) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.y) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
maximum = max(src, maximum);
// Global average pooling: reduces the entire H*W plane of one C4 slice to a
// single pixel at (X, 0) of dst_data.
// size = (H, W, C4, C) as set by the host (SetLocalConstArgs).
// Expected launch shape (set by the host SetGlobalLocal): local = (1, 16, 16),
// global x = number of C4 slices — so every 16x16 work-group shares a single X.
// NOTE(review): the early return below happens before the barriers; this is only
// barrier-safe because local x-size is 1, making `X >= size.z` uniform across
// the whole work-group — confirm the host never launches with local x > 1.
__kernel void AvgPooling2d_global_NHWC4_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data,
int4 size) {
int X = get_global_id(0); // C4
int localy = get_local_id(1);
int localz = get_local_id(2);
if (X >= size.z) return;
__local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
// Phase 1: each of the 16x16 threads strides over the H*W plane and keeps a
// private partial sum in its own local-memory cell (float accumulation even
// when FLT is half, to limit precision loss).
temp[localy][localz] = (float4)0.f;
for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 2: one thread per row collapses its row into column 0.
if (localz == 0) {
for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
temp[localy][0] += temp[localy][i];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: every thread redundantly folds column 0 and writes the same mean
// to the same pixel (benign duplicate writes of identical values).
float4 result = temp[0][0];
for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
result += temp[i][0];
}
result /= size.x * size.y;
#ifdef RELU
WRITE_IMAGE(dst_data, (int2)(X, 0), max(TO_FLT4(result), (FLT4)(0.f)));
#else
WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
#endif
}

View File

@ -53,18 +53,25 @@ int PoolingOpenCLKernel::CheckSpecs() {
return RET_OK;
}
int PoolingOpenCLKernel::Prepare() {
int PoolingOpenCLKernel::BuildKernel() {
std::string kernel_name;
if (parameter_->pool_mode_ == PoolMode_MaxPool) {
kernel_name = "MaxPooling2d";
} else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
kernel_name = "AvgPooling2d";
}
if (parameter_->global_ &&
(parameter_->window_h_ >= LOCAL_CACHE_THREAD || parameter_->window_w_ >= LOCAL_CACHE_THREAD)) {
kernel_name += "_global";
is_use_local_ = true;
}
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
switch (parameter_->act_type_) {
case ActType_No:
break;
case ActType_Relu:
kernel_name += "_ReLU";
build_options_ext.emplace_back("-DRELU");
break;
default:
MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
@ -78,32 +85,44 @@ int PoolingOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "Load source failed.";
return RET_ERROR;
}
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Build kernel failed.";
return ret;
}
return RET_OK;
}
int PoolingOpenCLKernel::Prepare() {
input_tensor_ = GpuTensorInfo(in_tensors_[0]);
if (BuildKernel() != RET_OK) {
MS_LOG(ERROR) << "BuildKernel failed.";
return RET_ERROR;
}
if (SetConstArgs() != RET_OK) {
MS_LOG(ERROR) << "SeConstArgs failed.";
return RET_ERROR;
}
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
// Computes global_size_/local_size_ for the kernel launch.
// Fix: the merged body ran the tiled-shape computation unconditionally *before*
// the is_use_local_ branch and then recomputed it in the else arm — the
// duplicated prefix (dead work, clobbered immediately) is removed.
void PoolingOpenCLKernel::SetGlobalLocal() {
  if (is_use_local_) {
    // Global-pooling kernel: one 16x16 work-group per C4 slice; the work-group
    // cooperatively reduces the whole H*W plane in local memory.
    local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
    global_size_ = {static_cast<size_t>(input_tensor_.Slice), 1, 1};
    AlignGlobalLocal(global_size_, local_size_);
  } else {
    // Tiled kernel: one work-item per output element, laid out (C4, W, N*H).
    const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
    const size_t global_y = out_tensors_[0]->shape()[2];
    const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
    global_size_ = {global_z, global_y, global_x};
    local_size_ = {};  // let the runtime pick the local size
    AlignGlobalLocal(global_size_, local_size_);
  }
}
int PoolingOpenCLKernel::SetConstArgs() {
int PoolingOpenCLKernel::SetGlobalConstArgs() {
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
@ -135,6 +154,35 @@ int PoolingOpenCLKernel::SetConstArgs() {
return RET_OK;
}
// Binds the constant argument of the global-pooling ("_global") kernel:
// a cl_int4 (H, W, C4, C) describing the input plane being reduced.
// Arg index 2 is fixed — indices 0 and 1 (the image objects) are bound in Run().
int PoolingOpenCLKernel::SetLocalConstArgs() {
  const int height = input_tensor_.H;
  const int width = input_tensor_.W;
  const int channels = input_tensor_.C;
  const int slices = UP_DIV(channels, C4NUM);
  cl_int4 size = {height, width, slices, channels};
  const int kSizeArgIndex = 2;
  if (ocl_runtime_->SetKernelArg(kernel_, kSizeArgIndex, size) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
// Dispatches constant-argument binding to the variant-specific helper:
// the local-memory global-pooling kernel vs. the standard tiled kernel.
int PoolingOpenCLKernel::SetConstArgs() {
  return is_use_local_ ? SetLocalConstArgs() : SetGlobalConstArgs();
}
// Work-group tuning entry point. The global-pooling variant has a fixed
// (1, 16, 16) work-group shape required by its local-memory reduction, so
// tuning is skipped for it; otherwise defer to the generic tuner.
int PoolingOpenCLKernel::Tune() {
  return is_use_local_ ? RET_OK : OpenCLKernel::Tune();
}
int PoolingOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;

View File

@ -34,9 +34,18 @@ class PoolingOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
int SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override;
private:
int BuildKernel();
int SetGlobalConstArgs();
int SetLocalConstArgs();
private:
PoolingParameter *parameter_;
bool is_use_local_ = false;
static const size_t LOCAL_CACHE_THREAD{16};
GpuTensorInfo input_tensor_;
};
} // namespace mindspore::kernel