From 53d0e9c22a67a3ba3cbd3e0da575e8e32941b23f Mon Sep 17 00:00:00 2001
From: chenzupeng <chenzupeng@huawei.com>
Date: Wed, 28 Oct 2020 10:52:11 +0800
Subject: [PATCH] optimize arithmetic and pooling

---
 .../runtime/kernel/opencl/cl/avg_pool2d.cl    | 68 +-----------------
 .../runtime/kernel/opencl/cl/max_pool2d.cl    | 71 ++-----------------
 .../kernel/opencl/kernel/arithmetic.cc        | 71 ++++++++++---------
 .../runtime/kernel/opencl/kernel/arithmetic.h |  1 +
 .../runtime/kernel/opencl/kernel/pooling2d.cc | 17 ++---
 .../runtime/kernel/opencl/kernel/pooling2d.h  |  4 +-
 6 files changed, 56 insertions(+), 176 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
index 0df90fe08d8..30bde691805 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
@@ -2,46 +2,14 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 #define divide_no_check(a, b) (a / b)
-__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
-                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 r = (FLT4)(0.0f);
-  FLT window_size = 0.0f;
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-
-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    bool outside_x = x_c < 0 || x_c >= input_shape.x;
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
-      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
-      window_size += !outside ? 1.0f : 0.0f;
-    }
-  }
-  FLT4 result = TO_FLT4(r / window_size);
-  output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
-}
-
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
 __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                                      const int4 output_shape, const int2 stride, const int2 kernel_size,
                                      const int2 padding) {
   // axis to dst tensor coordinate
-  int X = get_global_id(0);
+  int X = get_global_id(2);
   int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  int Z = get_global_id(0);
 
   // boundary check
   if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
@@ -66,35 +34,3 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
   FLT4 result = TO_FLT4(divide_no_check(r, window_size));
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
 }
-
-__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
-                                      const int4 input_shape, const int4 output_shape, const int2 stride,
-                                      const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 r = (FLT4)(0.0f);
-  FLT window_size = 0.0f;
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    bool outside_y = y_c < 0 || y_c >= input_shape.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
-      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c, Z * input_shape.x + x_c)) : (FLT4)(0.0f);
-      window_size += !outside ? 1.0f : 0.0f;
-    }
-  }
-  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
-  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
index 596f6f4367a..e5f7b54fa23 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
@@ -1,48 +1,14 @@
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
-__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
-                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 maximum = (FLT4)(-10000.0f);
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-
-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    if (x_c < 0 || x_c >= input_shape.x) {
-      continue;
-    }
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      if (y_c < 0 || y_c >= input_shape.y) {
-        continue;
-      }
-      FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
-      maximum = max(src, maximum);
-    }
-  }
-  output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
-}
-
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
 __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                                      const int4 output_shape, const int2 stride, const int2 kernel_size,
                                      const int2 padding) {
   // axis to dst tensor coordinate
-  int X = get_global_id(0);
+  int X = get_global_id(2);
   int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  int Z = get_global_id(0);
 
   // boundary check
   if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
@@ -69,9 +35,9 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
                                           const int4 input_shape, const int4 output_shape, const int2 stride,
                                           const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
-  int X = get_global_id(0);
+  int X = get_global_id(2);
   int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  int Z = get_global_id(0);
 
   // boundary check
   if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
@@ -93,32 +59,3 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
   }
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
 }
-
-__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
-                                      const int4 input_shape, const int4 output_shape, const int2 stride,
-                                      const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 maximum = (FLT4)(-10000.0f);
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    if (y_c < 0 || y_c >= input_shape.y) continue;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      if (x_c < 0 || x_c >= input_shape.x) continue;
-      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c));
-      maximum = max(src, maximum);
-    }
-  }
-  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index a1c82b8c68c..659b6bb4d2b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -52,16 +52,23 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 }
 
 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
-  local_size_ = {16, 16};
-  auto out_shape = out_tensors_[0]->shape();
-  if (out_shape.size() == 2) {
-    size_t H = out_shape[0];
-    size_t W = UP_DIV(out_shape[1], C4NUM);
-    global_size_ = {W, H};
+  if (element_flag_) {
+    local_size_ = {16, 16};
+    auto out_shape = out_tensors_[0]->shape();
+    if (out_shape.size() == 2) {
+      size_t H = out_shape[0];
+      size_t W = UP_DIV(out_shape[1], C4NUM);
+      global_size_ = {W, H};
+    } else {
+      size_t H = out_shape[0] * out_shape[1];
+      size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
+      global_size_ = {W, H};
+    }
   } else {
-    size_t H = out_shape[0] * out_shape[1];
-    size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
-    global_size_ = {W, H};
+    local_size_ = {};
+    auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
+    global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
+                    static_cast<size_t>(out_shape[1] * out_shape[0])};
   }
 }
 
@@ -129,6 +136,27 @@ int ArithmeticOpenCLKernel::InitBuffer() {
   return RET_OK;
 }
 
+int ArithmeticOpenCLKernel::SetArgs() {
+  int arg_idx = 3;
+  if (!element_flag_) {
+    cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
+                            UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
+    cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
+                            UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
+    auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
+    cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
+  } else {
+    cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
+  }
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
+  return RET_OK;
+}
+
 int ArithmeticOpenCLKernel::Init() {
   std::string kernel_name;
   auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
@@ -237,6 +265,7 @@ int ArithmeticOpenCLKernel::Init() {
 
   Image2dGetWorkGroupSize();
   InitBuffer();
+  SetArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -250,29 +279,7 @@ int ArithmeticOpenCLKernel::Run() {
   auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  if (!element_flag_) {
-    cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
-                            UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
-    cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
-                            UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
-    auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
-    cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
-    ocl_runtime_->RunKernel(kernel_,
-                            {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
-                             static_cast<size_t>(out_shape[1] * out_shape[0])},
-                            {}, nullptr);
-  } else {
-    cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
-    ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
-  }
+  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
index 29142f0efbf..c41b92afe0c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -33,6 +33,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
   int Init() override;
   int Run() override;
   int InitBuffer() override;
+  int SetArgs();
 
  private:
   std::vector<size_t> InitGlobalSize() const;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index cc071ef0866..254b4c6df78 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -83,17 +83,20 @@ int PoolingOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+  InitGlobalSize();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
 
   return mindspore::lite::RET_OK;
 }
 
-std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
+void PoolingOpenCLKernel::InitGlobalSize() {
   const size_t global_x = out_tensors_[0]->shape()[1];
   const size_t global_y = out_tensors_[0]->shape()[2];
   const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
-  std::vector<size_t> global = {global_x, global_y, global_z};
-  return global;
+  global_size_ = {global_z, global_y, global_x};
+  int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
+  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
+  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
 }
 
 int PoolingOpenCLKernel::Run() {
@@ -116,13 +119,7 @@ int PoolingOpenCLKernel::Run() {
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
 
-  std::vector<size_t> local_size;
-  std::vector<size_t> global_size = InitGlobalSize();
-  int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
-  local_size = GetCommonLocalSize(global_size, max_work_group_size);
-  global_size = GetCommonGlobalSize(local_size, global_size);
-
-  ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
index dbc735a4024..17b73312971 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -35,10 +35,12 @@ class PoolingOpenCLKernel : public OpenCLKernel {
   int Run() override;
 
  private:
-  std::vector<size_t> InitGlobalSize() const;
+  void InitGlobalSize();
   PoolingParameter *parameter_;
   cl::Kernel kernel_;
   bool enable_fp16_{false};
+  std::vector<size_t> local_size_;
+  std::vector<size_t> global_size_;
 };
 
 }  // namespace mindspore::kernel