!17294 optimize conv and depthwise

From: @yeyunpeng2020 Reviewed-by: @ddwsky,@HilbertDavid Signed-off-by: @ddwsky
2021-05-31 09:40:55 +08:00 · 2021-05-31 09:40:55 +08:00 · 5b30e3ce51
parent 0d29b12462 a9986fe46f
commit 5b30e3ce51
4 changed files with 935 additions and 11 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@ -308,6 +308,129 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
  }
 }

+__kernel void Conv2D_H2W2C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                            __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                            int2 dilation, int act_type, float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 2;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int ow1 = ow + 1;
+  int co_slice0 = co_slice + 0;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  for (int kh = 0; kh < KH; ++kh) {
+    int ih0 = kh * dilationH + oh0 * strideH - padTop;
+    // no need to check oh1, finally write out will check (oh1 < OH)
+    int ih1 = kh * dilationH + oh1 * strideH - padTop;
+    // check ih0 and ih1
+    int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+    int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+    for (int kw = 0; kw < KW; ++kw) {
+      int iw0 = kw * dilationW + ow0 * strideW - padLeft;
+      int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2;
+      int x_idx0 = iw0 * CI_SLICES;
+      int x_idx1 = iw1 * CI_SLICES;
+
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+        FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+        FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
+        FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+        FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
+        x_idx0++;
+        x_idx1++;
+
+        out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+        out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
+        out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+        out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
+        out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+        out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
+        out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+        out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
+        out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+        out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
+        out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+        out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
+        out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+        out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
+        out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+        out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
+
+        weight_ptr += 4;
+      }
+    }
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h0_w1_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h1_w1_c0 += bias[co_slice0];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
+
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h0_w1_c0);
+    DO_TANH(out_h1_w0_c0);
+    DO_TANH(out_h1_w1_c0);
+
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c0, alpha);
+
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+    out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
+  }
+
+  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+    }  // end if (oh1 < OH)
+  } else {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+    }  // end (oh1 < OH)
+  }
+}
+
 __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
                            __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
                            int2 dilation, int act_type, float alpha) {
@ -676,3 +799,741 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2
    }    // end if (co_slice1 < CO_SLICES)
  }
 }
+
+__kernel void Conv2D_H1W1C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                                int2 dilation, int act_type, float alpha) {
+  const int BlockH = 1;
+  const int BlockW = 1;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int n_oh0 = n * OH + oh0;
+  int ow0 = ow + 0;
+  int co_slice0 = co_slice + 0;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  int ih0 = oh0 * strideH - padTop;
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int x_idx0 = iw0 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    x_idx0++;
+
+    out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+    out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+    out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+    out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+
+    weight_ptr += 4;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+  }
+
+  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  } else {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  }
+}
+
+__kernel void Conv2D_H2W1C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                                int2 dilation, int act_type, float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 1;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int co_slice0 = co_slice + 0;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  int ih0 = oh0 * strideH - padTop;
+  // no need to check oh1, finally write out will check (oh1 < OH)
+  int ih1 = oh1 * strideH - padTop;
+  // check ih0 and ih1
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+  int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int x_idx0 = iw0 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+    x_idx0++;
+
+    out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+    out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+    out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+    out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+    out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+    out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+    out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+    out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+
+    weight_ptr += 4;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h1_w0_c0);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+  }
+
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+  }  // end if (oh1 < OH)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+  }  // end (oh1 < OH)
+#endif
+}
+
+__kernel void Conv2D_H2W1C2_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                                int2 dilation, int act_type, float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 1;
+  const int BlockC = 2;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int co_slice0 = co_slice + 0;
+  int co_slice1 = co_slice + 1;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  int ih0 = oh0 * strideH - padTop;
+  // no need to check oh1, finally write out will check (oh1 < OH)
+  int ih1 = oh1 * strideH - padTop;
+  // check ih0 and ih1
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+  int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int x_idx0 = iw0 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+    x_idx0++;
+
+    out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+    out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+    out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+    out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+    out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+    out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+    out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+    out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+
+    out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
+    out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
+    out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
+    out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
+    out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
+    out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
+    out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
+    out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
+
+    weight_ptr += 8;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h0_w0_c1 += bias[co_slice1];
+    out_h1_w0_c1 += bias[co_slice1];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
+    out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h1_w0_c0);
+    DO_TANH(out_h0_w0_c1);
+    DO_TANH(out_h1_w0_c1);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w0_c1, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c1, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+    out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
+    out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
+  }
+
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+  }  // end (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#endif
+}
+
+__kernel void Conv2D_H2W2C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                                int2 dilation, int act_type, float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 2;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int ow1 = ow + 1;
+  int co_slice0 = co_slice + 0;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  int ih0 = oh0 * strideH - padTop;
+  // no need to check oh1, finally write out will check (oh1 < OH)
+  int ih1 = oh1 * strideH - padTop;
+  // check ih0 and ih1
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+  int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
+  int x_idx0 = iw0 * CI_SLICES;
+  int x_idx1 = iw1 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
+    FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+    FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
+    x_idx0++;
+    x_idx1++;
+
+    out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+    out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
+    out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+    out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
+    out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+    out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
+    out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+    out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
+    out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+    out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
+    out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+    out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
+    out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+    out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
+    out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+    out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
+
+    weight_ptr += 4;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h0_w1_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h1_w1_c0 += bias[co_slice0];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h0_w1_c0);
+    DO_TANH(out_h1_w0_c0);
+    DO_TANH(out_h1_w1_c0);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c0, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+    out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
+  }
+
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+  }  // end if (oh1 < OH)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+  }  // end (oh1 < OH)
+#endif
+}
+
+__kernel void Conv2D_H2W2C2_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
+                                int2 dilation, int act_type, float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 2;
+  const int BlockC = 2;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int ow1 = ow + 1;
+  int co_slice0 = co_slice + 0;
+  int co_slice1 = co_slice + 1;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+
+  int ih0 = oh0 * strideH - padTop;
+  // no need to check oh1, finally write out will check (oh1 < OH)
+  int ih1 = oh1 * strideH - padTop;
+  // check ih0 and ih1
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+  int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
+  int x_idx0 = iw0 * CI_SLICES;
+  int x_idx1 = iw1 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
+    FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+    FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
+    x_idx0++;
+    x_idx1++;
+
+    out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+    out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
+    out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+    out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
+    out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+    out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
+    out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+    out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
+    out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+    out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
+    out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+    out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
+    out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+    out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
+    out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+    out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
+
+    out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
+    out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x;
+    out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
+    out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x;
+    out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
+    out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y;
+    out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
+    out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y;
+    out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
+    out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z;
+    out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
+    out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z;
+    out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
+    out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w;
+    out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
+    out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w;
+
+    weight_ptr += 8;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h0_w1_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h1_w1_c0 += bias[co_slice0];
+    out_h0_w0_c1 += bias[co_slice1];
+    out_h0_w1_c1 += bias[co_slice1];
+    out_h1_w0_c1 += bias[co_slice1];
+    out_h1_w1_c1 += bias[co_slice1];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
+    out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
+    out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
+    out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
+    out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h0_w1_c0);
+    DO_TANH(out_h1_w0_c0);
+    DO_TANH(out_h1_w1_c0);
+    DO_TANH(out_h0_w0_c1);
+    DO_TANH(out_h0_w1_c1);
+    DO_TANH(out_h1_w0_c1);
+    DO_TANH(out_h1_w1_c1);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w0_c1, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c1, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c1, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c1, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+    out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
+    out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
+    out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
+    out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
+    out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
+  }
+
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+  }  // end (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#endif
+}
+
+__kernel void Conv2D_H2W2C2_Img_1x1(__read_only image2d_t input, __write_only image2d_t output,
+                                    __read_only image2d_t weight, __global FLT4 *bias, int4 input_shape,
+                                    int4 output_shape, int4 kernel_stride, int4 pad, int2 dilation, int act_type,
+                                    float alpha) {
+  const int BlockH = 2;
+  const int BlockW = 2;
+  const int BlockC = 2;
+  DEFINE_ARGS;
+
+  int oh0 = oh + 0;
+  int oh1 = oh + 1;
+  int n_oh0 = n * OH + oh0;
+  int n_oh1 = n * OH + oh1;
+  int ow0 = ow + 0;
+  int ow1 = ow + 1;
+  int co_slice0 = co_slice + 0;
+  int co_slice1 = co_slice + 1;
+
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  int filter_offset = 0;
+  int ih0 = oh0 * strideH - padTop;
+  // no need to check oh1, finally write out will check (oh1 < OH)
+  int ih1 = oh1 * strideH - padTop;
+  // check ih0 and ih1
+  int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+  int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+
+  int iw0 = ow0 * strideW - padLeft;
+  int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
+  int x_idx0 = iw0 * CI_SLICES;
+  int x_idx1 = iw1 * CI_SLICES;
+
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+    FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
+    FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+    FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
+    x_idx0++;
+    x_idx1++;
+
+    FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 0));
+    FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 1));
+    FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 2));
+    FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 3));
+    FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 0));
+    FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 1));
+    FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 2));
+    FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 3));
+    filter_offset += 4;
+
+    out_h0_w0_c0 += filter_ci0_co0 * in_h0_w0.x;
+    out_h0_w1_c0 += filter_ci0_co0 * in_h0_w1.x;
+    out_h1_w0_c0 += filter_ci0_co0 * in_h1_w0.x;
+    out_h1_w1_c0 += filter_ci0_co0 * in_h1_w1.x;
+    out_h0_w0_c0 += filter_ci1_co0 * in_h0_w0.y;
+    out_h0_w1_c0 += filter_ci1_co0 * in_h0_w1.y;
+    out_h1_w0_c0 += filter_ci1_co0 * in_h1_w0.y;
+    out_h1_w1_c0 += filter_ci1_co0 * in_h1_w1.y;
+    out_h0_w0_c0 += filter_ci2_co0 * in_h0_w0.z;
+    out_h0_w1_c0 += filter_ci2_co0 * in_h0_w1.z;
+    out_h1_w0_c0 += filter_ci2_co0 * in_h1_w0.z;
+    out_h1_w1_c0 += filter_ci2_co0 * in_h1_w1.z;
+    out_h0_w0_c0 += filter_ci3_co0 * in_h0_w0.w;
+    out_h0_w1_c0 += filter_ci3_co0 * in_h0_w1.w;
+    out_h1_w0_c0 += filter_ci3_co0 * in_h1_w0.w;
+    out_h1_w1_c0 += filter_ci3_co0 * in_h1_w1.w;
+
+    out_h0_w0_c1 += filter_ci0_co1 * in_h0_w0.x;
+    out_h0_w1_c1 += filter_ci0_co1 * in_h0_w1.x;
+    out_h1_w0_c1 += filter_ci0_co1 * in_h1_w0.x;
+    out_h1_w1_c1 += filter_ci0_co1 * in_h1_w1.x;
+    out_h0_w0_c1 += filter_ci1_co1 * in_h0_w0.y;
+    out_h0_w1_c1 += filter_ci1_co1 * in_h0_w1.y;
+    out_h1_w0_c1 += filter_ci1_co1 * in_h1_w0.y;
+    out_h1_w1_c1 += filter_ci1_co1 * in_h1_w1.y;
+    out_h0_w0_c1 += filter_ci2_co1 * in_h0_w0.z;
+    out_h0_w1_c1 += filter_ci2_co1 * in_h0_w1.z;
+    out_h1_w0_c1 += filter_ci2_co1 * in_h1_w0.z;
+    out_h1_w1_c1 += filter_ci2_co1 * in_h1_w1.z;
+    out_h0_w0_c1 += filter_ci3_co1 * in_h0_w0.w;
+    out_h0_w1_c1 += filter_ci3_co1 * in_h0_w1.w;
+    out_h1_w0_c1 += filter_ci3_co1 * in_h1_w0.w;
+    out_h1_w1_c1 += filter_ci3_co1 * in_h1_w1.w;
+  }
+
+  if (bias != 0) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h0_w1_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h1_w1_c0 += bias[co_slice0];
+    out_h0_w0_c1 += bias[co_slice1];
+    out_h0_w1_c1 += bias[co_slice1];
+    out_h1_w0_c1 += bias[co_slice1];
+    out_h1_w1_c1 += bias[co_slice1];
+  }
+
+  if (act_type == ActivationType_RELU) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
+    out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
+    out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
+    out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
+    out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
+  } else if (act_type == ActivationType_RELU6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0, exp1;
+    DO_TANH(out_h0_w0_c0);
+    DO_TANH(out_h0_w1_c0);
+    DO_TANH(out_h1_w0_c0);
+    DO_TANH(out_h1_w1_c0);
+    DO_TANH(out_h0_w0_c1);
+    DO_TANH(out_h0_w1_c1);
+    DO_TANH(out_h1_w0_c1);
+    DO_TANH(out_h1_w1_c1);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    DO_LEAKY_RELU(out_h0_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c0, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c0, alpha);
+    DO_LEAKY_RELU(out_h0_w0_c1, alpha);
+    DO_LEAKY_RELU(out_h0_w1_c1, alpha);
+    DO_LEAKY_RELU(out_h1_w0_c1, alpha);
+    DO_LEAKY_RELU(out_h1_w1_c1, alpha);
+  } else if (act_type == ActivationType_SIGMOID) {
+    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
+    out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
+    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
+    out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
+    out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
+    out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
+    out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
+    out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
+  }
+
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+  }  // end (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
+    }  // end if (oh1 < OH)
+  }    // end if (co_slice1 < CO_SLICES)
+#endif
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@ -143,8 +143,15 @@ void Conv2DOpenCLKernel::BuildKernel() {
  if (filter_type_ == MemType::IMG) {
    kernel_name << "_Img";
  }
+  if (KW_ == 1 && KH_ == 1) {
+    kernel_name << "_1x1";
+  }
  ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
  auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
+
+  std::string exceed_max_image_width_option =
+    (OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH";
+  build_options_ext.push_back(exceed_max_image_width_option);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), build_options_ext);
 }

@ -155,25 +162,63 @@ void Conv2DOpenCLKernel::SetBlockSize() {
  }
  auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
  auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
+  bool w_kernel_is_1 =
+    KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
+  bool h_kernel_is_1 =
+    KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
+  if (use_fp16_) {
+    SetMaliFp16BlockSize(task_size_per_cu, w_kernel_is_1, h_kernel_is_1);
+  } else {
+    SetMaliFp32BlockSize(task_size_per_cu, w_kernel_is_1, h_kernel_is_1);
+  }
+}
+
+void Conv2DOpenCLKernel::SetMaliFp32BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1) {
  int block_size;
  if (task_size_per_cu <= 256) {
    block_size = 1;
  } else if (task_size_per_cu <= 256 * 4) {
    block_size = 2;
-  } else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) {
+  } else if (task_size_per_cu <= FLT_MAX) {
    block_size = 4;
  } else {
    block_size = 8;
  }

-  bool w_kernel_is_1 =
-    KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
-  bool h_kernel_is_1 =
-    KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
  if (!w_kernel_is_1 || !h_kernel_is_1) {
    block_size = std::min(block_size, 4);
  }

+  if (block_size == 8) {
+    block_size_ = {2, 2, 2};
+  } else if (block_size == 4) {
+    block_size_ = {2, 2, 1};
+  } else if (block_size == 2) {
+    block_size_ = {2, 1, 1};
+  } else {
+    block_size_ = {1, 1, 1};
+  }
+}
+void Conv2DOpenCLKernel::SetMaliFp16BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1) {
+  int block_size;
+  if (task_size_per_cu <= 256) {
+    block_size = 1;
+  } else if (task_size_per_cu <= 256 * 4) {
+    block_size = 2;
+  } else if (task_size_per_cu <= 256 * 8) {
+    block_size = 4;
+  } else {
+    block_size = 8;
+  }
+
+  if (!w_kernel_is_1 || !h_kernel_is_1) {
+    block_size = std::min(block_size, 4);
+  }
+
+  if (CO_SLICES_ >= 128 && OH_ >= 10 && OW_ >= 10) {
+    block_size = 8;
+  }
+
  if (block_size == 8) {
    block_size_ = {2, 2, 2};
  } else if (block_size == 4) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
@ -105,6 +105,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {

 private:
  void SetBlockSize();
+  void SetMaliFp16BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1);
+  void SetMaliFp32BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1);
  struct {
    int H{1};
    int W{1};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -244,13 +244,29 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
  global_size_ = {CO4, (size_t)UP_DIV(out_info.W, block_size_.W),
                  (size_t)UP_DIV(out_info.H, block_size_.H) * out_info.N};
  // set local
-  const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
-  int z = global_size_[0];
-  int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8));
-  int x = std::max(1, std::min(static_cast<int>(global_size_[1]), max_group_size / (y * z)));
-  local_size_ = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
+  int local_max = filter_type_ == MemType::IMG ? 64 : 128;
+  if (ocl_runtime_->DeviceComputeUnits() > 16) {
+    local_max = 256;
+  }
+  const int local_c_max = 16;
+  const int OH_threshold = 100;
+  const int OW_threshold = 100;
+  const int OC_threshold = 64;
+  size_t local_c = GetMaxDivisor(global_size_[0], local_c_max);
+  local_c = std::max<size_t>(local_c, 1);
+  size_t local_hw = local_max / local_c;
+  size_t local_h;
+  size_t local_w;
+  if (out_info.H >= OH_threshold && out_info.W >= OW_threshold && out_info.C <= OC_threshold) {  // c -> w -> h
+    local_w = std::min(global_size_[1], local_hw);
+    local_h = std::min(local_hw / local_w, global_size_[2]);
+  } else {  // c -> h -> w
+    local_h = std::min(global_size_[2], local_hw);
+    local_w = std::min(local_hw / local_h, global_size_[1]);
+  }

-  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
+  local_size_ = {local_c, local_w, local_h};
+  AlignGlobalLocal(global_size_, local_size_);
 }

 int DepthwiseConv2dOpenCLKernel::StoreConstData() {