From 748a595c17288be65ebc9aff729693c5107b3b2b Mon Sep 17 00:00:00 2001
From: chenzupeng <chenzupeng@huawei.com>
Date: Thu, 3 Sep 2020 16:33:48 +0800
Subject: [PATCH] op support NC4HW4

---
 .../runtime/kernel/opencl/cl/avg_pool2d.cl    |  42 ++++-
 .../kernel/opencl/cl/conv2d_transpose2x2.cl   |  62 ++++++-
 .../src/runtime/kernel/opencl/cl/matmul.cl    |  32 +++-
 .../runtime/kernel/opencl/cl/max_pool2d.cl    |  34 +++-
 .../src/runtime/kernel/opencl/cl/reshape.cl   |  16 +-
 .../src/runtime/kernel/opencl/cl/softmax.cl   | 103 ++++++++++-
 .../src/runtime/kernel/opencl/cl/transpose.cl |  45 ++++-
 .../kernel/opencl/kernel/conv2d_transpose.cc  |  40 +++--
 .../runtime/kernel/opencl/kernel/matmul.cc    |  26 ++-
 .../runtime/kernel/opencl/kernel/pooling2d.cc |  35 ++--
 .../runtime/kernel/opencl/kernel/reshape.cc   |  34 ++--
 .../runtime/kernel/opencl/kernel/softmax.cc   |  20 +--
 .../runtime/kernel/opencl/kernel/to_format.cc |  10 +-
 .../runtime/kernel/opencl/kernel/transpose.cc |  35 +++-
 .../runtime/kernel/opencl/kernel/transpose.h  |   1 -
 .../kernel/opencl/avg_pooling_tests.cc        | 151 ++++++++--------
 .../kernel/opencl/max_pooling_tests.cc        | 161 +++++++++++-------
 .../runtime/kernel/opencl/reshape_tests.cc    |  74 ++++++--
 .../runtime/kernel/opencl/transpose_tests.cc  |   4 +-
 19 files changed, 678 insertions(+), 247 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
index 7728a102109..0df90fe08d8 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
@@ -1,6 +1,7 @@
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
+#define divide_no_check(a, b) (a / b)
 __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                                const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
@@ -34,8 +35,9 @@ __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, cons
 
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
+__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
+                                     const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -57,10 +59,42 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
     for (int kx = 0; kx < kernel_size.x; ++kx) {
       int x_c = xs + kx;
       bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
-      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
       window_size += !outside ? 1.0f : 0.0f;
     }
   }
-  FLT4 result = TO_FLT4(r / window_size);
+  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
 }
+
+__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                      const int4 input_shape, const int4 output_shape, const int2 stride,
+                                      const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    bool outside_y = y_c < 0 || y_c >= input_shape.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c, Z * input_shape.x + x_c)) : (FLT4)(0.0f);
+      window_size += !outside ? 1.0f : 0.0f;
+    }
+  }
+  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
+  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
index eea2139ee2f..ac76b8742f6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
@@ -1,8 +1,8 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
-                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
+__kernel void conv2d_transpose2x2_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight,
+                                        __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size,
+                                        int2 stride, int2 padding, int4 src_size, int4 dst_size) {
   int h = get_global_id(0);
   int kh = h % 2;
   int src_h = h / 2;
@@ -55,3 +55,59 @@ __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16
   WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
   WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
 }
+
+__kernel void conv2d_transpose2x2_NC4HW4(__read_only image2d_t src_data, __global FLT16 *weight,
+                                         __read_only image2d_t biases, __write_only image2d_t dst_data,
+                                         int2 kernel_size, int2 stride, int2 padding, int4 src_size, int4 dst_size) {
+  int h = get_global_id(0);
+  int kh = h % 2;
+  int src_h = h / 2;
+  src_h = src_h * 2;
+  int w = get_global_id(1);
+  int kw = w % 2;
+  int src_w = w / 2;
+  src_w = src_w * 2;
+  int co = get_global_id(2);
+  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
+  FLT4 r0 = (FLT4)(0.f);
+  FLT4 r1 = (FLT4)(0.f);
+  FLT4 r2 = (FLT4)(0.f);
+  FLT4 r3 = (FLT4)(0.f);
+  int base_w = (co * 4 + kh * 2 + kw) * src_size.z;
+  for (int ci = 0; ci < src_size.z; ++ci) {
+    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h));
+    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h + 1));
+    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h));
+    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h + 1));
+    FLT16 weight_cache = weight[base_w++];
+    r0 += x0.x * weight_cache.s0123;
+    r0 += x0.y * weight_cache.s4567;
+    r0 += x0.z * weight_cache.s89ab;
+    r0 += x0.w * weight_cache.scdef;
+
+    r1 += x1.x * weight_cache.s0123;
+    r1 += x1.y * weight_cache.s4567;
+    r1 += x1.z * weight_cache.s89ab;
+    r1 += x1.w * weight_cache.scdef;
+
+    r2 += x2.x * weight_cache.s0123;
+    r2 += x2.y * weight_cache.s4567;
+    r2 += x2.z * weight_cache.s89ab;
+    r2 += x2.w * weight_cache.scdef;
+
+    r3 += x3.x * weight_cache.s0123;
+    r3 += x3.y * weight_cache.s4567;
+    r3 += x3.z * weight_cache.s89ab;
+    r3 += x3.w * weight_cache.scdef;
+  }
+  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
+  r0 += bias_val;
+  r1 += bias_val;
+  r2 += bias_val;
+  r3 += bias_val;
+
+  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh), r0);
+  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh + 2), r1);
+  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh), r2);
+  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh + 2), r3);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
index e67c73358e9..300584c2f9a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@@ -1,7 +1,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
+__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                           __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
   int2 gid = (int2)(get_global_id(0), get_global_id(1));
   int2 lid = (int2)(get_local_id(0), get_local_id(1));
   FLT4 result = (FLT4)(0.0f);
@@ -27,3 +27,31 @@ __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read
     WRITE_IMAGE(output, (int2)(gid.x, 0), result);
   }
 }
+
+__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                            __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
+  int2 gid = (int2)(get_global_id(0), get_global_id(1));
+  int2 lid = (int2)(get_local_id(0), get_local_id(1));
+  FLT4 result = (FLT4)(0.0f);
+  bool inside = gid.x < offset_co.y;
+  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    FLT16 w = weight[gid.x + i * offset_co.y];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[64][4];
+  temp[lid.x][lid.y] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lid.y == 0 && inside) {
+    result += temp[lid.x][1];
+    result += temp[lid.x][2];
+    result += temp[lid.x][3];
+    if (has_bias != 0) {
+      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
+    }
+    WRITE_IMAGE(output, (int2)(0, gid.x), result);
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
index f7b25423608..23121428fb2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
@@ -36,8 +36,9 @@ __kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, cons
 
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 
-__kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
+__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
+                                     const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -63,3 +64,32 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
   }
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
+
+__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                      const int4 input_shape, const int4 output_shape, const int2 stride,
+                                      const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c));
+      maximum = max(src, maximum);
+    }
+  }
+  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
index c292a5dee86..1b35e9aca43 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@@ -1,6 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
+__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -12,3 +12,17 @@ __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst
   int iw = out_index % size.y;
   WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih)));
 }
+
+__kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size,
+                             int4 size_out) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
+    return;
+  }
+  int out_index = X * size_out.y + Y;
+  int ih = out_index / size.y;
+  int iw = out_index % size.y;
+  WRITE_IMAGE(dst_data, (int2)(Y, Z * size_out.x + X), READ_IMAGE(src_data, smp_zero, (int2)(iw, Z * size.x + ih)));
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
index 08dcc1f2fcf..96b8a035f67 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
@@ -1,9 +1,10 @@
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
+#define divide_no_check(a, b) (a / b)
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void SoftMax_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) {
+__kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) {
   int X = get_global_id(0);  // H
   int Y = get_global_id(1);  // W
   int H = input_shape.x;
@@ -24,7 +25,38 @@ __kernel void SoftMax_BUF(__read_only image2d_t input, __global FLT4 *output, co
 
   for (int d = 0; d < S; ++d) {
     FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
-    t = exp(t) / sum;
+    t = divide_no_check(exp(t), sum);
+    __global FLT *output_flt = (__global FLT *)output;
+    output_flt += (X * W + Y) * C + 4 * d;
+    output_flt[0] = t.x;
+    if (d * 4 + 1 < C) output_flt[1] += t.y;
+    if (d * 4 + 2 < C) output_flt[2] += t.z;
+    if (d * 4 + 3 < C) output_flt[3] += t.w;
+  }
+}
+
+__kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) {
+  int X = get_global_id(0);  // H
+  int Y = get_global_id(1);  // W
+  int H = input_shape.x;
+  int W = input_shape.y;
+  int C = input_shape.z;
+  int S = input_shape.w;
+
+  if (X >= H || Y >= W) return;
+
+  FLT sum = 0.0f;
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    sum += exp(t.x);
+    if (d * 4 + 1 < C) sum += exp(t.y);
+    if (d * 4 + 2 < C) sum += exp(t.z);
+    if (d * 4 + 3 < C) sum += exp(t.w);
+  }
+
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    t = divide_no_check(exp(t), sum);
     __global FLT *output_flt = (__global FLT *)output;
     output_flt += (X * W + Y) * C + 4 * d;
     output_flt[0] = t.x;
@@ -39,7 +71,7 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou
   int Y = get_global_id(1);
   if (X >= input_shape.x || Y >= input_shape.y) return;
 
-  float sum = 0.0f;
+  FLT sum = 0.0f;
   for (int d = 0; d < input_shape.w; ++d) {
     FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
     sum += exp(t.x);
@@ -50,7 +82,7 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou
 
   for (int d = 0; d < input_shape.w; ++d) {
     FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
-    t = exp(t) / sum;
+    t = divide_no_check(exp(t), sum);
     FLT4 result = TO_FLT4(t);
     WRITE_IMAGE(output, (int2)(Y * input_shape.w + d, X), result);
   }
@@ -86,7 +118,7 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
     sum += dot((FLT4)(1.0f), tmp[5]);
     sum += dot((FLT4)(1.0f), tmp[6]);
     sum += dot((FLT4)(1.0f), tmp[7]);
-    tmpx1[0] = 1.0f / sum;
+    tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
@@ -104,8 +136,8 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
   } while (slices_count < slices_x32);
 }
 
-__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, const int slices,
-                             const int slices_x32) {
+__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
+                                   const int slices, const int slices_x32) {
   int tid = get_local_id(0);
   FLT sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
@@ -131,7 +163,7 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output,
     sum += dot((FLT4)(1.0f), tmp[5]);
     sum += dot((FLT4)(1.0f), tmp[6]);
     sum += dot((FLT4)(1.0f), tmp[7]);
-    tmpx1[0] = 1.0f / sum;
+    tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
@@ -158,3 +190,58 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output,
     }
   }
 }
+
+__kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
+                                    const int slices, const int slices_x32) {
+  int tid = get_local_id(0);
+  FLT sum = 0.0f;
+  for (size_t i = tid; i < slices - 1; i += 32) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    sum += dot((FLT4)(1.0f), exp(src));
+  }
+  if ((slices - 1) % 32 == tid) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
+
+    sum += dot(TO_FLT4(mask), exp(src));
+  }
+
+  __local FLT4 tmp[8];
+  __local FLT *tmpx1 = (__local FLT *)tmp;
+  tmpx1[tid] = sum;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (tid == 0) {
+    sum = dot((FLT4)(1.0f), tmp[0]);
+    sum += dot((FLT4)(1.0f), tmp[1]);
+    sum += dot((FLT4)(1.0f), tmp[2]);
+    sum += dot((FLT4)(1.0f), tmp[3]);
+    sum += dot((FLT4)(1.0f), tmp[4]);
+    sum += dot((FLT4)(1.0f), tmp[5]);
+    sum += dot((FLT4)(1.0f), tmp[6]);
+    sum += dot((FLT4)(1.0f), tmp[7]);
+    tmpx1[0] = divide_no_check(1.0f, sum);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  sum = tmpx1[0];
+  for (size_t i = tid; i < slices - 1; i += 32) {
+    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    result = exp(result) * sum;
+    output[i] = result;
+  }
+  if ((slices - 1) % 32 == tid) {
+    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
+    result = exp(result) * sum;
+    __global FLT4 *remain_ptr4 = output;
+    remain_ptr4 += slices - 1;
+    __global FLT *remain_ptr = (__global FLT *)remain_ptr4;
+    remain_ptr[0] = result.x;
+    if (mask.y > 0.f) {
+      remain_ptr[1] = result.y;
+    }
+    if (mask.z > 0.f) {
+      remain_ptr[2] = result.z;
+    }
+    if (mask.w > 0.f) {
+      remain_ptr[3] = result.w;
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
index ac11eaa1e30..7be8e94b748 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@@ -43,7 +43,8 @@ __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d
   WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
 }
 
-__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W) {
+__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W,
+                                  int H) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   if (X >= HW.y || Y >= C.y) {
@@ -83,3 +84,45 @@ __kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_dat
   if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2];
   if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3];
 }
+
+__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W,
+                                   int H) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= HW.y || Y >= C.y) {
+    return;
+  }
+  FLT4 result[4];
+  result[0] = (FLT4)(0.0f);
+  result[1] = (FLT4)(0.0f);
+  result[2] = (FLT4)(0.0f);
+  result[3] = (FLT4)(0.0f);
+  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W, Y * H + (4 * X) / W));
+  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W, Y * H + (4 * X + 1) / W));
+  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W, Y * H + (4 * X + 2) / W));
+  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W, Y * H + (4 * X + 3) / W));
+  result[0].x = x0.x;
+  result[0].y = x1.x;
+  result[0].z = x2.x;
+  result[0].w = x3.x;
+
+  result[1].x = x0.y;
+  result[1].y = x1.y;
+  result[1].z = x2.y;
+  result[1].w = x3.y;
+
+  result[2].x = x0.z;
+  result[2].y = x1.z;
+  result[2].z = x2.z;
+  result[2].w = x3.z;
+
+  result[3].x = x0.w;
+  result[3].y = x1.w;
+  result[3].z = x2.w;
+  result[3].w = x3.w;
+
+  if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0];
+  if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1];
+  if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2];
+  if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3];
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index e79349bf5f5..27106da1fd2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -40,7 +40,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
     MS_LOG(ERROR) << "only support pad =0.";
     return RET_ERROR;
   }
-  std::string kernel_name = "conv2d_transpose2x2";
+  std::string kernel_name = "conv2d_transpose2x2_" + std::string(EnumNameFormat(op_format_));
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   enable_fp16_ = ocl_runtime->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
@@ -54,9 +54,9 @@ int Conv2dTransposeOpenCLKernel::Init() {
 #endif
   PadWeight();
   in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
+  in_tensors_[0]->SetFormat(op_format_);
   out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
+  out_tensors_[0]->SetFormat(op_format_);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -142,8 +142,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
 
 int Conv2dTransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  im_dst_y = out_tensors_[0]->Height();
+  int n = out_tensors_[0]->shape()[0];
+  int h = out_tensors_[0]->shape()[1];
+  int w = out_tensors_[0]->shape()[2];
+  int c = out_tensors_[0]->shape()[3];
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
   size_t img_dtype = CL_FLOAT;
   if (enable_fp16_) {
     img_dtype = CL_HALF_FLOAT;
@@ -156,23 +168,17 @@ int Conv2dTransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
 
 int Conv2dTransposeOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<int> shapex = in_tensors_[0]->shape();
-  int n = shapex[0];
-  if (n > 1) {
-    MS_LOG(ERROR) << " n > 1 not supported!";
-    return RET_ERROR;
-  }
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  int ci = in_tensors_[0]->Channel();
-  int co = out_tensors_[0]->Channel();
+  int ci = in_tensors_[0]->shape()[3];
+  int co = out_tensors_[0]->shape()[3];
   int co4 = UP_DIV(co, C4NUM);
   int kh = param->kernel_h_;
   int kw = param->kernel_w_;
   int pad = param->pad_u_;
-  int oh = out_tensors_[0]->Height();
-  int ow = out_tensors_[0]->Width();
-  int h = in_tensors_[0]->Height();
-  int w = in_tensors_[0]->Width();
+  int oh = out_tensors_[0]->shape()[1];
+  int ow = out_tensors_[0]->shape()[2];
+  int h = in_tensors_[0]->shape()[1];
+  int w = in_tensors_[0]->shape()[2];
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   // local size should less than MAX_GROUP_SIZE
   std::vector<size_t> local = {16, 1, 16};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index a020dd857df..9431e555d5a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -34,6 +34,7 @@ namespace mindspore::kernel {
 
 int MatMulOpenCLKernel::Init() {
   std::string kernel_name = "MatMul";
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   enable_fp16_ = ocl_runtime->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
@@ -46,6 +47,10 @@ int MatMulOpenCLKernel::Init() {
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
   int ci, co;
+  if (in_tensors_[1]->shape().size() != 2) {
+    MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size();
+    return RET_ERROR;
+  }
   if (in_tensors_[1]->shape().size() == 2) {
     ci = in_tensors_[1]->shape()[1];
     co = in_tensors_[1]->shape()[0];
@@ -59,13 +64,8 @@ int MatMulOpenCLKernel::Init() {
   PadWeight();
   in_ori_format_ = in_tensors_[0]->GetFormat();
   out_ori_format_ = out_tensors_[0]->GetFormat();
-  if (out_tensors_[0]->shape().size() == 2) {
-    out_tensors_[0]->SetFormat(schema::Format_NC4);
-    in_tensors_[0]->SetFormat(schema::Format_NC4);
-  } else {
-    in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-    out_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  }
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -142,8 +142,16 @@ void MatMulOpenCLKernel::PadWeight() {
 
 int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  im_dst_x = sizeCO.s[1];
-  im_dst_y = 1;
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = sizeCO.s[1];
+    im_dst_y = 1;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = 1;
+    im_dst_y = sizeCO.s[1];
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
   size_t img_dtype = CL_FLOAT;
   if (enable_fp16_) {
     img_dtype = CL_HALF_FLOAT;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index 3d87827769d..27e4f214ff3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -64,6 +64,7 @@ int PoolingOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
   if (out_mem_type_ == OpenCLMemType::BUF) {
     MS_LOG(ERROR) << "buffer output not support yet.";
     return RET_ERROR;
@@ -75,27 +76,38 @@ int PoolingOpenCLKernel::Init() {
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
   in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
   out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
 
   return RET_OK;
 }
 
 std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
-  const size_t global_x = out_tensors_[0]->Height();
-  const size_t global_y = out_tensors_[0]->Width();
-  const size_t global_z = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  const size_t global_x = out_tensors_[0]->shape()[1];
+  const size_t global_y = out_tensors_[0]->shape()[2];
+  const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
   std::vector<size_t> global = {global_x, global_y, global_z};
   return global;
 }
 
 int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
   size_t im_dst_x, im_dst_y;
-  im_dst_x = out_tensors_[0]->Width() * CO4;
-  im_dst_y = out_tensors_[0]->Height();
+  int n = out_tensors_[0]->shape()[0];
+  int h = out_tensors_[0]->shape()[1];
+  int w = out_tensors_[0]->shape()[2];
+  int c = out_tensors_[0]->shape()[3];
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
   size_t img_dtype = CL_FLOAT;
   if (enable_fp16_) {
     img_dtype = CL_HALF_FLOAT;
@@ -114,9 +126,10 @@ int PoolingOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
 
-  int slices = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  cl_int4 input_shape = {in_tensors_[0]->Height(), in_tensors_[0]->Width(), in_tensors_[0]->Channel(), slices};
-  cl_int4 output_shape = {out_tensors_[0]->Height(), out_tensors_[0]->Width(), out_tensors_[0]->Channel(), slices};
+  int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
+  cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
+  cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3],
+                          slices};
   cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};
   cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_};
   cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index a504d6a84ef..d3d21a8199c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -32,14 +32,10 @@ namespace mindspore::kernel {
 
 int ReshapeOpenCLKernel::Init() {
   std::string kernel_name = "reshape";
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   enable_fp16_ = ocl_runtime->GetFp16Enable();
-  in_ori_format_ = in_tensors_[0]->GetFormat();
-  out_ori_format_ = out_tensors_[0]->GetFormat();
-  if (in_ori_format_ != schema::Format_NHWC4 && in_ori_format_ != schema::Format_NHWC) {
-    MS_LOG(ERROR) << "Reshape input format:" << in_ori_format_ << " not support yet.";
-    return RET_ERROR;
-  }
+
   if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
     MS_LOG(ERROR) << "Reshape input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
                   << out_tensors_[0]->shape().back();
@@ -54,12 +50,10 @@ int ReshapeOpenCLKernel::Init() {
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  if (out_tensors_[0]->shape().size() == 2) {
-    out_ori_format_ = schema::Format_NC;
-    out_tensors_[0]->SetFormat(schema::Format_NC4);
-  }
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -69,17 +63,27 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; }
 int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
   std::vector<int> shapex = out_tensors_[0]->shape();
-  int h, w, c;
+  int n, h, w, c;
   if (shapex.size() == 2) {
+    n = shapex[0];
     h = w = 1;
     c = shapex[1];
   } else {
+    n = shapex[0];
     h = shapex[1];
     w = shapex[2];
     c = shapex[3];
   }
-  im_dst_x = w * UP_DIV(c, C4NUM);
-  im_dst_y = h;
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
   size_t img_dtype = CL_FLOAT;
   if (enable_fp16_) {
     img_dtype = CL_HALF_FLOAT;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
index 39a869bf5a1..9c6a1774c4b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -86,6 +86,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 int SoftmaxOpenCLKernel::Init() {
   std::string kernel_name = "SoftMax";
   std::string program_name = "SoftMax";
+
   std::string source = softmax_source;
   runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
   enable_fp16_ = runtime_->GetFp16Enable();
@@ -102,6 +103,7 @@ int SoftmaxOpenCLKernel::Init() {
     MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_tensors_[0]->shape().size();
     return RET_ERROR;
   }
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
@@ -124,21 +126,9 @@ int SoftmaxOpenCLKernel::Init() {
 #endif
   in_ori_format_ = in_tensors_[0]->GetFormat();
   out_ori_format_ = out_tensors_[0]->GetFormat();
-  if (in_tensors_[0]->shape().size() == 2) {
-    in_tensors_[0]->SetFormat(schema::Format_NC4);
-  } else {
-    in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  }
-
-  if (is_image_out_) {
-    if (out_tensors_[0]->shape().size() == 2) {
-      out_ori_format_ = schema::Format_NC;
-      out_tensors_[0]->SetFormat(schema::Format_NC4);
-    } else {
-      out_ori_format_ = schema::Format_NHWC;
-      out_tensors_[0]->SetFormat(schema::Format_NHWC4);
-    }
-  } else {
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
+  if (!is_image_out_) {
     out_tensors_[0]->SetFormat(out_ori_format_);
   }
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 94dd6dc66e3..8d0e042cf95 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -65,6 +65,14 @@ int ToFormatOpenCLKernel::Init() {
 int ToFormatOpenCLKernel::InitNHWCShape() {
   std::vector<int> shapex = out_tensors_[0]->shape();
   size_t n, h, w, c;
+  if (shapex.size() == 2) {
+    n = shapex[0];
+    h = 1;
+    w = 1;
+    c = shapex[1];
+    nhwc_shape_ = {n, h, w, c};
+    return RET_OK;
+  }
   if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC4 ||
       out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
     n = shapex[0];
@@ -118,7 +126,7 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
     im_dst_x = w * UP_DIV(c, C4NUM);
     im_dst_y = h;
   } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
-    int c = nhwc_shape_[1];
+    int c = nhwc_shape_[3];
     im_dst_x = UP_DIV(c, C4NUM);
     im_dst_y = 1;
   } else {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
index 0d783303576..53cd41f07fd 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@@ -36,7 +36,9 @@ int TransposeOpenCLKernel::Init() {
   std::string kernel_name = "transpose";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   enable_fp16_ = ocl_runtime->GetFp16Enable();
-  if (!is_image_out_) {
+  out_mem_type_ = OpenCLMemType::BUF;
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
+  if (out_mem_type_ == OpenCLMemType::BUF) {
     kernel_name += "_BUF";
   } else {
     kernel_name += "_IMG";
@@ -50,17 +52,19 @@ int TransposeOpenCLKernel::Init() {
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  if ((in_tensors_[0]->Height() * in_tensors_[0]->Width()) % 4 != 0) {
+  if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) {
     MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
     return RET_ERROR;
   }
   in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  out_ori_format_ = schema::Format_NCHW;
-  out_tensors_[0]->SetFormat(schema::Format_NCHW);
-  if (!is_image_out_) {
-    out_mem_type_ = OpenCLMemType::BUF;
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
+  if (out_mem_type_ == OpenCLMemType::BUF) {
+    out_ori_format_ = schema::Format_NCHW;
+    out_tensors_[0]->SetFormat(schema::Format_NCHW);
   }
+
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -69,8 +73,20 @@ int TransposeOpenCLKernel::ReSize() { return RET_OK; }
 
 int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  im_dst_x = out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Width(), C4NUM);
-  im_dst_y = out_tensors_[0]->Channel();
+  int n = out_tensors_[0]->shape()[0];
+  int h = out_tensors_[0]->shape()[1];
+  int w = out_tensors_[0]->shape()[2];
+  int c = out_tensors_[0]->shape()[3];
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
   size_t img_dtype = CL_FLOAT;
   if (enable_fp16_) {
     img_dtype = CL_HALF_FLOAT;
@@ -102,6 +118,7 @@ int TransposeOpenCLKernel::Run() {
   ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW);
   ocl_runtime->SetKernelArg(kernel_, arg_idx++, C);
   ocl_runtime->SetKernelArg(kernel_, arg_idx++, w);
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, h);
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
index 698ef9e7355..4576bd066c6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
@@ -38,7 +38,6 @@ class TransposeOpenCLKernel : public OpenCLKernel {
 
  private:
   cl::Kernel kernel_;
-  bool is_image_out_{false};
   bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
index f74cd7cfdab..31fa220ca32 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
@@ -21,6 +21,7 @@
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h"
+#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
 
 namespace mindspore {
 
@@ -51,97 +52,107 @@ void InitAvgPoolingParam(PoolingParameter *param) {
   param->pool_mode_ = PoolMode_AvgPool;
 }
 
-TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) {
-  MS_LOG(INFO) << "start TEST_F TestPoolingOpenCL";
+void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
-
-  MS_LOG(INFO) << "create PoolingParameter";
-  auto param = new (std::nothrow) PoolingParameter();
+  size_t dtype_size = sizeof(float);
+  if (enable_fp16) {
+    ocl_runtime->SetFp16Enable(true);
+    dtype_size = sizeof(float16_t);
+  }
+  auto allocator = ocl_runtime->GetAllocator();
+  int n = shape[0];
+  int h = shape[1];
+  int w = shape[2];
+  int c = shape[3];
+  int oh = shape[4];
+  int ow = shape[5];
+  auto param_ptr = std::make_unique<PoolingParameter>();
+  auto param = param_ptr.get();
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param create error.";
+    return;
+  }
   InitAvgPoolingParam(param);
-
-  MS_LOG(INFO) << "create Tensors";
-  std::vector<int> shape_in = {
-    param->input_batch_,
-    param->input_h_,
-    param->input_w_,
-    param->input_channel_,
-  };
-  std::vector<int> shape_out = {
-    param->output_batch_,
-    param->output_h_,
-    param->output_w_,
-    param->output_channel_,
-  };
-  auto data_type = kNumberTypeFloat32;
-  auto tensorType = schema::NodeType_ValueNode;
-  lite::tensor::Tensor *tensor_in =
-    new (std::nothrow) lite::tensor::Tensor(data_type, shape_in, schema::Format_NHWC, tensorType);
-  lite::tensor::Tensor *tensor_out =
-    new (std::nothrow) lite::tensor::Tensor(data_type, shape_out, schema::Format_NHWC, tensorType);
-  if (tensor_in == nullptr) {
-    MS_LOG(ERROR) << "tensor_in null";
+  std::vector<int> input_shape = {n, h, w, c};
+  auto tensor_x_ptr = std::make_unique<lite::tensor::Tensor>(
+    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC);
+  auto tensor_x = tensor_x_ptr.get();
+  if (tensor_x == nullptr) {
+    MS_LOG(ERROR) << "tensor_x create error.";
     return;
   }
+  std::vector<int> out_shape = {n, oh, ow, c};
+  auto tensor_out_ptr = std::make_unique<lite::tensor::Tensor>(
+    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NHWC);
+  auto tensor_out = tensor_out_ptr.get();
   if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out null";
+    MS_LOG(ERROR) << "tensor_out create error.";
     return;
   }
-  std::vector<lite::tensor::Tensor *> inputs{tensor_in};
+  std::vector<lite::tensor::Tensor *> inputs{tensor_x};
   std::vector<lite::tensor::Tensor *> outputs{tensor_out};
-
-  MS_LOG(INFO) << "create OpenCL Kernel";
-  auto *pooling_kernel =
-    new (std::nothrow) kernel::PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pooling_kernel == nullptr) {
-    MS_LOG(ERROR) << "pooling_kernel null";
+  auto arith_kernel_ptr =
+    std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  auto arith_kernel = arith_kernel_ptr.get();
+  if (arith_kernel == nullptr) {
+    MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  pooling_kernel->Init();
-  std::vector<kernel::LiteKernel *> kernels{pooling_kernel};
+  arith_kernel->Init();
 
-  MS_LOG(INFO) << "create SubGraphOpenCLKernel";
-  auto *pGraph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  inputs[0]->MallocData(allocator);
+
+  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
+  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
+  auto pGraph = pGraph_ptr.get();
   if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph null";
+    MS_LOG(ERROR) << "pGraph create error.";
     return;
   }
   pGraph->Init();
-
-  MS_LOG(INFO) << "initialize data";
-  std::vector<lite::tensor::Tensor *> tensor_map = {tensor_in};
-  for (auto &tensor_file : tensor_map) {
-    auto tensor = tensor_file;
-    size_t size = tensor->Size();
-    const float data[16] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
-                            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-    memcpy(tensor->Data(), data, size);
-  }
-
-  MS_LOG(INFO) << "pGraph->Run()";
+  memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size);
   pGraph->Run();
 
-  MS_LOG(INFO) << "==================output data=================";
-  float *output_data = reinterpret_cast<float *>(tensor_out->Data());
-  printf("output:");
-  for (int i = 0; i < 4; i++) {
-    printf("%.3f ", output_data[i]);
+  if (enable_fp16) {
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
+  } else {
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
   }
-  printf("\n");
-  float expect[4] = {2.0f, 3.0f, 4.0f, 5.0f};
+  inputs[0]->SetData(nullptr);
+  outputs[0]->SetData(nullptr);
 
-  for (int i = 0; i < tensor_out->ElementsNum(); ++i)
-    if (std::fabs(output_data[i] - expect[i]) > 1e-5) {
-      printf("idx[%d] except=%.3f output=%.3f, ", i, expect[i], output_data[i]);
-    }
-  printf("test all close OK!\n");
-  lite::CompareOutputData(output_data, expect, 4);
-  delete tensor_in;
-  delete tensor_out;
-  delete pooling_kernel;
-  delete pGraph;
-  delete param;
+  MS_LOG(INFO) << "Test AvgPool2d passed";
   lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 
+TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
+
+  RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), false);
+}
+
+TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float16_t> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
+
+  RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), true);
+}
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
index 321b1ccd061..fe453a0ca1f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
@@ -13,10 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#include <iostream>
 #include <memory>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
+#include "mindspore/lite/src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h"
@@ -26,98 +27,132 @@ namespace mindspore {
 
 class TestMaxPoolingOpenCL : public mindspore::CommonTest {};
 
-void InitParameter(PoolingParameter *param) {
+void InitMaxPoolingParam(PoolingParameter *param) {
+  param->input_batch_ = 1;
+  param->input_h_ = 2;
+  param->input_w_ = 2;
+  param->input_channel_ = 4;
+
+  param->output_batch_ = 1;
+  param->output_h_ = 1;
+  param->output_w_ = 1;
+  param->output_channel_ = 4;
+
   param->window_h_ = 2;
   param->window_w_ = 2;
+
   param->stride_h_ = 2;
   param->stride_w_ = 2;
+
   param->pad_u_ = 0;
   param->pad_d_ = 0;
   param->pad_l_ = 0;
   param->pad_r_ = 0;
+
   param->pool_mode_ = PoolMode_MaxPool;
 }
 
-TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {
-  MS_LOG(INFO) << "ocl runtime";
+void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
+  size_t dtype_size = sizeof(float);
+  if (enable_fp16) {
+    ocl_runtime->SetFp16Enable(true);
+    dtype_size = sizeof(float16_t);
+  }
   auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(INFO) << "PoolingParameter";
-  auto param = new (std::nothrow) PoolingParameter;
-  InitParameter(param);
-
-  // define tensor
-  MS_LOG(INFO) << "define tensor1";
-  std::vector<int> input_shape = {1, 16, 256, 192};
-  std::vector<int> output_shape = {1, 8, 128, 192};
-  auto data_type = kNumberTypeFloat32;
-  auto tensorType = schema::NodeType_ValueNode;
-  MS_LOG(INFO) << "define tensor2";
-  auto input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensorType);
-  auto output_tensor =
-    new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensorType);
-  if (input_tensor == nullptr) {
-    MS_LOG(ERROR) << "input_tensor null";
+  int n = shape[0];
+  int h = shape[1];
+  int w = shape[2];
+  int c = shape[3];
+  int oh = shape[4];
+  int ow = shape[5];
+  auto param_ptr = std::make_unique<PoolingParameter>();
+  auto param = param_ptr.get();
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param create error.";
     return;
   }
-  if (output_tensor == nullptr) {
-    MS_LOG(ERROR) << "output_tensor null";
+  InitMaxPoolingParam(param);
+  std::vector<int> input_shape = {n, h, w, c};
+  auto tensor_x_ptr = std::make_unique<lite::tensor::Tensor>(
+    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC);
+  auto tensor_x = tensor_x_ptr.get();
+  if (tensor_x == nullptr) {
+    MS_LOG(ERROR) << "tensor_x create error.";
     return;
   }
-  MS_LOG(INFO) << "define input";
-  std::vector<lite::tensor::Tensor *> inputs{input_tensor};
-  std::vector<lite::tensor::Tensor *> outputs{output_tensor};
-
-  // run
-  MS_LOG(INFO) << "pooling_kernel";
-  auto *pooling_kernel =
-    new (std::nothrow) kernel::PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pooling_kernel == nullptr) {
-    MS_LOG(ERROR) << "pooling_kernel null";
+  std::vector<int> out_shape = {n, oh, ow, c};
+  auto tensor_out_ptr = std::make_unique<lite::tensor::Tensor>(
+    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NHWC);
+  auto tensor_out = tensor_out_ptr.get();
+  if (tensor_out == nullptr) {
+    MS_LOG(ERROR) << "tensor_out create error.";
     return;
   }
-  MS_LOG(INFO) << "pooling_kernel init";
-  pooling_kernel->Init();
+  std::vector<lite::tensor::Tensor *> inputs{tensor_x};
+  std::vector<lite::tensor::Tensor *> outputs{tensor_out};
+  auto arith_kernel_ptr =
+    std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  auto arith_kernel = arith_kernel_ptr.get();
+  if (arith_kernel == nullptr) {
+    MS_LOG(ERROR) << "arith_kernel create error.";
+    return;
+  }
+  arith_kernel->Init();
 
-  std::vector<kernel::LiteKernel *> kernels{pooling_kernel};
   inputs[0]->MallocData(allocator);
-  auto *pGraph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+
+  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
+  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
+  auto pGraph = pGraph_ptr.get();
   if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph null";
+    MS_LOG(ERROR) << "pGraph create error.";
     return;
   }
-  MS_LOG(INFO) << "pGraph init";
   pGraph->Init();
-
-  // load data
-  MS_LOG(INFO) << "load data1";
-  std::string input_file = "maxpool_in.bin";
-  std::string expect_file = "maxpool_out.bin";
-  MS_LOG(INFO) << "load data2";
-  LoadTestData(input_tensor->Data(), input_tensor->Size(), input_file);
-  auto *input_data = reinterpret_cast<float *>(input_tensor->Data());
-  printf("input[0:10]:");
-  for (int i = 0; i < 10; i++) {
-    printf("[%d]:%.3f ", i, input_data[i]);
-  }
-  printf("\n");
-
+  memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size);
   pGraph->Run();
 
-  MS_LOG(INFO) << "compare result";
-  std::cout << "compare result" << std::endl;
-  CompareOutput(output_tensor, expect_file, static_cast<float>(1e-5));
-  for (auto tensor : inputs) {
-    delete tensor;
+  if (enable_fp16) {
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
+  } else {
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
   }
-  for (auto tensor : outputs) {
-    delete tensor;
-  }
-  delete pooling_kernel;
-  delete pGraph;
+  inputs[0]->SetData(nullptr);
+  outputs[0]->SetData(nullptr);
+
+  MS_LOG(INFO) << "Test MaxPool2d passed";
   lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 
+TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), false);
+}
+
+TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), true);
+}
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
index 6c21bc61c81..72efebd1a33 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
@@ -29,7 +29,8 @@ class TestReshapeOpenCL : public mindspore::CommonTest {
   TestReshapeOpenCL() {}
 };
 
-void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
+void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
+                        bool is_output_2d) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
   size_t dtype_size = sizeof(float);
@@ -38,8 +39,13 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
     dtype_size = sizeof(float16_t);
   }
   auto allocator = ocl_runtime->GetAllocator();
-  int c = shape[0];
-  std::vector<int> input_shape = {1, 1, 1, c};
+  int n = shape[0];
+  int h = shape[1];
+  int w = shape[2];
+  int c = shape[3];
+  int oh = shape[4];
+  int ow = shape[5];
+  std::vector<int> input_shape = {n, h, w, c};
   auto tensor_x_ptr = std::make_unique<lite::tensor::Tensor>(
     TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC);
   auto tensor_x = tensor_x_ptr.get();
@@ -47,9 +53,13 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
     MS_LOG(ERROR) << "tensor_x create error.";
     return;
   }
-  std::vector<int> out_shape = {1, c};
-  auto tensor_out_ptr = std::make_unique<lite::tensor::Tensor>(
-    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NC);
+  std::vector<int> out_shape = {n, oh, ow, c};
+  if (is_output_2d) {
+    std::vector<int> out_shape = {n, c};
+  }
+  auto tensor_out_ptr =
+    std::make_unique<lite::tensor::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape,
+                                           is_output_2d ? schema::Format_NC : schema::Format_NHWC);
   auto tensor_out = tensor_out_ptr.get();
   if (tensor_out == nullptr) {
     MS_LOG(ERROR) << "tensor_out create error.";
@@ -75,13 +85,13 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
     return;
   }
   pGraph->Init();
-  memcpy(inputs[0]->Data(), input_data, c * dtype_size);
+  memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size);
   pGraph->Run();
 
   if (enable_fp16) {
-    CompareOutput(outputs[0]->Data(), output_data, c, static_cast<float16_t>(1e-3), 2e-2);
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
   } else {
-    CompareOutput(outputs[0]->Data(), output_data, c, static_cast<float>(1e-5));
+    CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
   }
   inputs[0]->SetData(nullptr);
   outputs[0]->SetData(nullptr);
@@ -91,20 +101,58 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
 }
 
 TEST_F(TestReshapeOpenCL, ReshapeFp32) {
+  int n = 1;
+  int h = 1;
+  int w = 1;
   int c = 7;
-  std::vector<int> shape = {c};
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
   std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
 
-  RunTestCaseReshape(shape, input_data.data(), output_data.data(), false);
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true);
 }
 
 TEST_F(TestReshapeOpenCL, ReshapeFp16) {
+  int n = 1;
+  int h = 1;
+  int w = 1;
   int c = 7;
-  std::vector<int> shape = {c};
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
   std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
 
-  RunTestCaseReshape(shape, input_data.data(), output_data.data(), true);
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, true);
+}
+
+TEST_F(TestReshapeOpenCL, Reshape4DFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int oh = 1;
+  int ow = 4;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, false);
+}
+
+TEST_F(TestReshapeOpenCL, Reshape4DFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int oh = 1;
+  int ow = 4;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false);
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
index c0dbcf3b225..dcadb543125 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@@ -94,8 +94,8 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
 }
 
 TEST_F(TestTransposeOpenCL, TransposeFp32) {
-  int h = 64;
-  int w = 1;
+  int h = 1;
+  int w = 64;
   int c = 7360;
   std::vector<int> shape = {h, w, c};
   size_t input_size;