opencl fusion, optimize adreno performance, winograd support pad=0
This commit is contained in:
parent
1d1f6841b9
commit
cf2f6a6a38
|
@ -30,7 +30,7 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
|
|||
exp1 = exp(-data); \
|
||||
data = (exp0 - exp1) / (exp0 + exp1);
|
||||
|
||||
#define DO_LEAKY_RELU(data) \
|
||||
#define DO_LEAKY_RELU(data, alpha) \
|
||||
data.x = data.x > 0 ? data.x : data.x * alpha; \
|
||||
data.y = data.y > 0 ? data.y : data.y * alpha; \
|
||||
data.z = data.z > 0 ? data.z : data.z * alpha; \
|
||||
|
@ -87,7 +87,7 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
|
|||
FLT4 exp0, exp1;
|
||||
DO_TANH(out_h0_w0_c0);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
DO_LEAKY_RELU(out_h0_w0_c0);
|
||||
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
|
||||
}
|
||||
|
@ -166,8 +166,8 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
|
|||
DO_TANH(out_h0_w0_c0);
|
||||
DO_TANH(out_h1_w0_c0);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
DO_LEAKY_RELU(out_h0_w0_c0);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0);
|
||||
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
|
||||
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
|
||||
|
@ -273,10 +273,10 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
|
|||
DO_TANH(out_h0_w0_c1);
|
||||
DO_TANH(out_h1_w0_c1);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
DO_LEAKY_RELU(out_h0_w0_c0);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0);
|
||||
DO_LEAKY_RELU(out_h0_w0_c1);
|
||||
DO_LEAKY_RELU(out_h1_w0_c1);
|
||||
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
|
||||
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
|
||||
|
@ -438,14 +438,202 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
|
|||
DO_TANH(out_h1_w0_c1);
|
||||
DO_TANH(out_h1_w1_c1);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
DO_LEAKY_RELU(out_h0_w0_c0);
|
||||
DO_LEAKY_RELU(out_h0_w1_c0);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0);
|
||||
DO_LEAKY_RELU(out_h1_w1_c0);
|
||||
DO_LEAKY_RELU(out_h0_w0_c1);
|
||||
DO_LEAKY_RELU(out_h0_w1_c1);
|
||||
DO_LEAKY_RELU(out_h1_w0_c1);
|
||||
DO_LEAKY_RELU(out_h1_w1_c1);
|
||||
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w1_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w1_c1, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
|
||||
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
|
||||
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
|
||||
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
|
||||
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
|
||||
out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
|
||||
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
|
||||
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
|
||||
}
|
||||
|
||||
if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
|
||||
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
|
||||
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
|
||||
if (oh1 < OH) {
|
||||
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
|
||||
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
|
||||
} // end if (oh1 < OH)
|
||||
if (co_slice1 < CO_SLICES) {
|
||||
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
|
||||
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
|
||||
if (oh1 < OH) {
|
||||
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
|
||||
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
|
||||
} // end if (oh1 < OH)
|
||||
} // end if (co_slice1 < CO_SLICES)
|
||||
} else {
|
||||
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0);
|
||||
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0);
|
||||
if (oh1 < OH) {
|
||||
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0);
|
||||
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0);
|
||||
} // end (oh1 < OH)
|
||||
if (co_slice1 < CO_SLICES) {
|
||||
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1);
|
||||
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1);
|
||||
if (oh1 < OH) {
|
||||
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1);
|
||||
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1);
|
||||
} // end if (oh1 < OH)
|
||||
} // end if (co_slice1 < CO_SLICES)
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output,
|
||||
__read_only image2d_t weight, __global FLT4 *bias, int4 input_shape, int4 output_shape,
|
||||
int4 kernel_stride, int4 pad, int2 dilation, int act_type, float alpha) {
|
||||
const int BlockH = 2;
|
||||
const int BlockW = 2;
|
||||
const int BlockC = 2;
|
||||
DEFINE_ARGS;
|
||||
|
||||
int oh0 = oh + 0;
|
||||
int oh1 = oh + 1;
|
||||
int n_oh0 = n * OH + oh0;
|
||||
int n_oh1 = n * OH + oh1;
|
||||
int ow0 = ow + 0;
|
||||
int ow1 = ow + 1;
|
||||
int co_slice0 = co_slice + 0;
|
||||
int co_slice1 = co_slice + 1;
|
||||
|
||||
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
int filter_offset = 0;
|
||||
for (int kh = 0; kh < KH; ++kh) {
|
||||
int ih0 = kh * dilationH + oh0 * strideH - padTop;
|
||||
// no need to check oh1, finally write out will check (oh1 < OH)
|
||||
int ih1 = kh * dilationH + oh1 * strideH - padTop;
|
||||
// check ih0 and ih1
|
||||
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
|
||||
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
|
||||
|
||||
for (int kw = 0; kw < KW; ++kw) {
|
||||
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
|
||||
int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2;
|
||||
int x_idx0 = iw0 * CI_SLICES;
|
||||
int x_idx1 = iw1 * CI_SLICES;
|
||||
|
||||
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
|
||||
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
|
||||
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
|
||||
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
|
||||
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
|
||||
x_idx0++;
|
||||
x_idx1++;
|
||||
|
||||
FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 0));
|
||||
FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 1));
|
||||
FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 2));
|
||||
FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 3));
|
||||
FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 0));
|
||||
FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 1));
|
||||
FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 2));
|
||||
FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 3));
|
||||
filter_offset += 4;
|
||||
|
||||
out_h0_w0_c0 += filter_ci0_co0 * in_h0_w0.x;
|
||||
out_h0_w1_c0 += filter_ci0_co0 * in_h0_w1.x;
|
||||
out_h1_w0_c0 += filter_ci0_co0 * in_h1_w0.x;
|
||||
out_h1_w1_c0 += filter_ci0_co0 * in_h1_w1.x;
|
||||
out_h0_w0_c0 += filter_ci1_co0 * in_h0_w0.y;
|
||||
out_h0_w1_c0 += filter_ci1_co0 * in_h0_w1.y;
|
||||
out_h1_w0_c0 += filter_ci1_co0 * in_h1_w0.y;
|
||||
out_h1_w1_c0 += filter_ci1_co0 * in_h1_w1.y;
|
||||
out_h0_w0_c0 += filter_ci2_co0 * in_h0_w0.z;
|
||||
out_h0_w1_c0 += filter_ci2_co0 * in_h0_w1.z;
|
||||
out_h1_w0_c0 += filter_ci2_co0 * in_h1_w0.z;
|
||||
out_h1_w1_c0 += filter_ci2_co0 * in_h1_w1.z;
|
||||
out_h0_w0_c0 += filter_ci3_co0 * in_h0_w0.w;
|
||||
out_h0_w1_c0 += filter_ci3_co0 * in_h0_w1.w;
|
||||
out_h1_w0_c0 += filter_ci3_co0 * in_h1_w0.w;
|
||||
out_h1_w1_c0 += filter_ci3_co0 * in_h1_w1.w;
|
||||
|
||||
out_h0_w0_c1 += filter_ci0_co1 * in_h0_w0.x;
|
||||
out_h0_w1_c1 += filter_ci0_co1 * in_h0_w1.x;
|
||||
out_h1_w0_c1 += filter_ci0_co1 * in_h1_w0.x;
|
||||
out_h1_w1_c1 += filter_ci0_co1 * in_h1_w1.x;
|
||||
out_h0_w0_c1 += filter_ci1_co1 * in_h0_w0.y;
|
||||
out_h0_w1_c1 += filter_ci1_co1 * in_h0_w1.y;
|
||||
out_h1_w0_c1 += filter_ci1_co1 * in_h1_w0.y;
|
||||
out_h1_w1_c1 += filter_ci1_co1 * in_h1_w1.y;
|
||||
out_h0_w0_c1 += filter_ci2_co1 * in_h0_w0.z;
|
||||
out_h0_w1_c1 += filter_ci2_co1 * in_h0_w1.z;
|
||||
out_h1_w0_c1 += filter_ci2_co1 * in_h1_w0.z;
|
||||
out_h1_w1_c1 += filter_ci2_co1 * in_h1_w1.z;
|
||||
out_h0_w0_c1 += filter_ci3_co1 * in_h0_w0.w;
|
||||
out_h0_w1_c1 += filter_ci3_co1 * in_h0_w1.w;
|
||||
out_h1_w0_c1 += filter_ci3_co1 * in_h1_w0.w;
|
||||
out_h1_w1_c1 += filter_ci3_co1 * in_h1_w1.w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (bias != 0) {
|
||||
out_h0_w0_c0 += bias[co_slice0];
|
||||
out_h0_w1_c0 += bias[co_slice0];
|
||||
out_h1_w0_c0 += bias[co_slice0];
|
||||
out_h1_w1_c0 += bias[co_slice0];
|
||||
out_h0_w0_c1 += bias[co_slice1];
|
||||
out_h0_w1_c1 += bias[co_slice1];
|
||||
out_h1_w0_c1 += bias[co_slice1];
|
||||
out_h1_w1_c1 += bias[co_slice1];
|
||||
}
|
||||
|
||||
if (act_type == ActivationType_RELU) {
|
||||
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
|
||||
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
|
||||
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
|
||||
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
|
||||
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
|
||||
out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
|
||||
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
|
||||
out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
|
||||
} else if (act_type == ActivationType_RELU6) {
|
||||
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
|
||||
} else if (act_type == ActivationType_TANH) {
|
||||
FLT4 exp0, exp1;
|
||||
DO_TANH(out_h0_w0_c0);
|
||||
DO_TANH(out_h0_w1_c0);
|
||||
DO_TANH(out_h1_w0_c0);
|
||||
DO_TANH(out_h1_w1_c0);
|
||||
DO_TANH(out_h0_w0_c1);
|
||||
DO_TANH(out_h0_w1_c1);
|
||||
DO_TANH(out_h1_w0_c1);
|
||||
DO_TANH(out_h1_w1_c1);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h0_w1_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
|
||||
DO_LEAKY_RELU(out_h1_w1_c1, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
|
||||
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
#define CI_TILE 4
|
||||
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
|
||||
|
||||
constant FLT Bt[36] = {
|
||||
|
@ -13,65 +14,55 @@ constant FLT Bt[36] = {
|
|||
0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,
|
||||
};
|
||||
|
||||
__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
|
||||
int4 input_shape, // N H W CI_SLICES
|
||||
int4 output_shape) { // N 36 H/4*W/4 CI_SLICES
|
||||
#define PAD 1
|
||||
int tile_xy = get_global_id(0);
|
||||
__kernel void Winograd4x4To36(__read_only image2d_t input, // height=N*H width=W*CI_SLICES
|
||||
__write_only image2d_t output, // height=CI_SLICES*36 width=H/4*W/4
|
||||
int4 input_shape, // N H W CI_SLICES
|
||||
int TILE_HW, int pad) {
|
||||
int tile_hw = get_global_id(0);
|
||||
int row = get_global_id(1);
|
||||
int slice = get_global_id(2);
|
||||
|
||||
int TILE_XY = output_shape.z;
|
||||
int SLICES = input_shape.w;
|
||||
if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) {
|
||||
int ci_slice = get_global_id(2);
|
||||
int H = input_shape.y;
|
||||
int W = input_shape.z;
|
||||
int CI_SLICES = input_shape.w;
|
||||
if (tile_hw >= TILE_HW || row >= 6 || ci_slice >= CI_SLICES) {
|
||||
return;
|
||||
}
|
||||
|
||||
int IH = input_shape.y, IW = input_shape.z;
|
||||
int TILE_X = UP_DIV(IW, 4);
|
||||
int tile_x = tile_xy % TILE_X;
|
||||
int tile_y = tile_xy / TILE_X;
|
||||
int TILE_W = UP_DIV(W, 4);
|
||||
int tile_w = tile_hw % TILE_W;
|
||||
int tile_h = tile_hw / TILE_W;
|
||||
|
||||
constant FLT *Bt_row = Bt + row * 6;
|
||||
FLT4 BtD_row[6] = {0};
|
||||
|
||||
int ih = tile_y * 4 - PAD;
|
||||
int iw = tile_x * 4 - PAD;
|
||||
int h = tile_h * 4 - pad;
|
||||
int w = tile_w * 4 - pad;
|
||||
for (int y = 0; y < 6; y++) {
|
||||
int x_idx = iw * SLICES + slice;
|
||||
int x_idx = w * CI_SLICES + ci_slice;
|
||||
for (int x = 0; x < 6; x++) {
|
||||
// no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES
|
||||
// if (iw < 0 || iw >= IW) { continue; }
|
||||
BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih));
|
||||
x_idx += SLICES;
|
||||
// no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES
|
||||
// if (w < 0 || w >= W) { continue; }
|
||||
BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h));
|
||||
x_idx += CI_SLICES;
|
||||
}
|
||||
ih++;
|
||||
h++;
|
||||
}
|
||||
|
||||
int y_idx = slice * 36 + row * 6;
|
||||
int y_idx = ci_slice * 36 + row * 6;
|
||||
for (int y = 0; y < 6; y++) {
|
||||
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
for (int x = 0; x < 6; x++) {
|
||||
acc += BtD_row[x] * Bt[y * 6 + x];
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36
|
||||
WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc);
|
||||
}
|
||||
#undef PAD
|
||||
}
|
||||
|
||||
__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
|
||||
int4 input_shape, // N 36 H/4*W/4 CI_SLICES
|
||||
int4 output_shape) { // N 36 H/4*W/4 CO_SLICES
|
||||
#define H 36
|
||||
int w = get_global_id(0) * 2;
|
||||
__kernel void WinogradConv2D(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW
|
||||
__write_only image2d_t output, // height=CO_SLICES*36 width=TILE_HW
|
||||
__global FLT16 *weight, int TILE_HW, int CI_SLICES, int CO_SLICES) {
|
||||
int tile_hw = get_global_id(0) * 2;
|
||||
int h = get_global_id(1);
|
||||
int co_slice = get_global_id(2) * 2;
|
||||
|
||||
int CI_SLICES = input_shape.w;
|
||||
int W = input_shape.z;
|
||||
int CO_SLICES = output_shape.w;
|
||||
|
||||
if (h >= H || w >= W || co_slice >= CO_SLICES) {
|
||||
if (h >= 36 || tile_hw >= TILE_HW || co_slice >= CO_SLICES) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -83,8 +74,8 @@ __kernel void WinogradConvolution(__read_only image2d_t input, __write_only imag
|
|||
int y_idx = h;
|
||||
__global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
|
||||
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
|
||||
FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx));
|
||||
FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx));
|
||||
FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 0, y_idx));
|
||||
FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 1, y_idx));
|
||||
y_idx += 36;
|
||||
|
||||
FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
|
||||
|
@ -111,57 +102,110 @@ __kernel void WinogradConvolution(__read_only image2d_t input, __write_only imag
|
|||
out11 += in1.w * weight1.scdef;
|
||||
}
|
||||
|
||||
WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
|
||||
if (w + 1 < W) {
|
||||
WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 0) * 36 + h), out00);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 0) * 36 + h), out01);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 1) * 36 + h), out10);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 1) * 36 + h), out11);
|
||||
}
|
||||
|
||||
__kernel void WinogradConv2D_Img(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW
|
||||
__write_only image2d_t output, // height=CO_SLICES*36 width=TILE_HW
|
||||
__read_only image2d_t weight, int TILE_HW, int CI_SLICES, int CO_SLICES) {
|
||||
int tile_hw = get_global_id(0) * 2;
|
||||
int h = get_global_id(1);
|
||||
int co_slice = get_global_id(2) * 2;
|
||||
if (h >= 36 || tile_hw >= TILE_HW || co_slice >= CO_SLICES) {
|
||||
return;
|
||||
}
|
||||
int CI_ALIGN = CI_SLICES * CI_TILE;
|
||||
|
||||
FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
int y_idx = h;
|
||||
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
|
||||
FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 0, y_idx));
|
||||
FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 1, y_idx));
|
||||
y_idx += 36;
|
||||
|
||||
FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 0, co_slice + 0));
|
||||
FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 1, co_slice + 0));
|
||||
FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 2, co_slice + 0));
|
||||
FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 3, co_slice + 0));
|
||||
FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 0, co_slice + 1));
|
||||
FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 1, co_slice + 1));
|
||||
FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 2, co_slice + 1));
|
||||
FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 3, co_slice + 1));
|
||||
|
||||
out00 += in0.x * filter_ci0_co0;
|
||||
out00 += in0.y * filter_ci1_co0;
|
||||
out00 += in0.z * filter_ci2_co0;
|
||||
out00 += in0.w * filter_ci3_co0;
|
||||
|
||||
out01 += in1.x * filter_ci0_co0;
|
||||
out01 += in1.y * filter_ci1_co0;
|
||||
out01 += in1.z * filter_ci2_co0;
|
||||
out01 += in1.w * filter_ci3_co0;
|
||||
|
||||
out10 += in0.x * filter_ci0_co1;
|
||||
out10 += in0.y * filter_ci1_co1;
|
||||
out10 += in0.z * filter_ci2_co1;
|
||||
out10 += in0.w * filter_ci3_co1;
|
||||
|
||||
out11 += in1.x * filter_ci0_co1;
|
||||
out11 += in1.y * filter_ci1_co1;
|
||||
out11 += in1.z * filter_ci2_co1;
|
||||
out11 += in1.w * filter_ci3_co1;
|
||||
}
|
||||
|
||||
if (co_slice + 1 < CO_SLICES) {
|
||||
WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
|
||||
if (w + 1 < W) {
|
||||
WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
|
||||
}
|
||||
}
|
||||
#undef H
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 0) * 36 + h), out00);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 0) * 36 + h), out01);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 1) * 36 + h), out10);
|
||||
WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 1) * 36 + h), out11);
|
||||
}
|
||||
|
||||
#define DO_LEAKY_RELU(data, alpha) \
|
||||
data.x = data.x > 0 ? data.x : data.x * alpha; \
|
||||
data.y = data.y > 0 ? data.y : data.y * alpha; \
|
||||
data.z = data.z > 0 ? data.z : data.z * alpha; \
|
||||
data.w = data.w > 0 ? data.w : data.w * alpha;
|
||||
|
||||
constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,
|
||||
0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
|
||||
0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
|
||||
0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
|
||||
|
||||
__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
|
||||
int4 input_shape, // N 36 H/4*W/4 CO_SLICES
|
||||
__kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICES*36 width=TILE_HW
|
||||
__write_only image2d_t output, // height=N*H width=W*CO_SLICES
|
||||
__global FLT4 *bias,
|
||||
int4 output_shape, // N H W CO_SLICES
|
||||
int act_type, float alpha) {
|
||||
int tile_xy = get_global_id(0);
|
||||
int TILE_HW, int act_type, float alpha) {
|
||||
int tile_hw = get_global_id(0);
|
||||
int row = get_global_id(1);
|
||||
int slice = get_global_id(2);
|
||||
|
||||
int TILE_XY = input_shape.z;
|
||||
int SLICES = input_shape.w;
|
||||
int OH = output_shape.y;
|
||||
int OW = output_shape.z;
|
||||
|
||||
if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) {
|
||||
int co_slice = get_global_id(2);
|
||||
int H = output_shape.y;
|
||||
int W = output_shape.z;
|
||||
int CO_SLICES = output_shape.w;
|
||||
if (tile_hw >= TILE_HW || row >= 4 || co_slice >= CO_SLICES) {
|
||||
return;
|
||||
}
|
||||
|
||||
constant FLT *At_row = At + row * 6;
|
||||
FLT4 AtM_row[6] = {0};
|
||||
for (int y = 0, idx = slice * 36; y < 6; y++) {
|
||||
for (int y = 0, idx = co_slice * 36; y < 6; y++) {
|
||||
for (int x = 0; x < 6; x++, idx++) {
|
||||
AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx));
|
||||
AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx));
|
||||
}
|
||||
}
|
||||
|
||||
int TILE_X = UP_DIV(OW, 4);
|
||||
int tile_x = tile_xy % TILE_X;
|
||||
int tile_y = tile_xy / TILE_X;
|
||||
int oh = tile_y * 4 + row;
|
||||
int ow = tile_x * 4;
|
||||
int x_idx = ow * SLICES + slice;
|
||||
|
||||
int TILE_W = UP_DIV(W, 4);
|
||||
int tile_w = tile_hw % TILE_W;
|
||||
int tile_h = tile_hw / TILE_W;
|
||||
int h = tile_h * 4 + row;
|
||||
int w = tile_w * 4;
|
||||
int x_idx = w * CO_SLICES + co_slice;
|
||||
for (int x = 0, idx = 0; x < 4; x++) {
|
||||
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
for (int y = 0; y < 6; y++, idx++) {
|
||||
|
@ -169,7 +213,7 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
|
|||
}
|
||||
|
||||
if (bias != 0) {
|
||||
acc += bias[slice];
|
||||
acc += bias[co_slice];
|
||||
}
|
||||
|
||||
if (act_type == ActivationType_RELU) {
|
||||
|
@ -181,15 +225,11 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
|
|||
FLT4 exp1 = exp(-acc);
|
||||
acc = (exp0 - exp1) / (exp0 + exp1);
|
||||
} else if (act_type == ActivationType_LEAKY_RELU) {
|
||||
if (acc.x < 0) acc.x *= alpha;
|
||||
if (acc.y < 0) acc.y *= alpha;
|
||||
if (acc.z < 0) acc.z *= alpha;
|
||||
if (acc.w < 0) acc.w *= alpha;
|
||||
DO_LEAKY_RELU(acc, alpha);
|
||||
} else if (act_type == ActivationType_SIGMOID) {
|
||||
acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
|
||||
}
|
||||
|
||||
WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
|
||||
x_idx += SLICES;
|
||||
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
|
||||
x_idx += CO_SLICES;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,18 +14,18 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d.h"
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
#include "src/common/utils.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/fullconnection.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "schema/ops_generated.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/fullconnection.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/winograd.h"
|
||||
#include "src/runtime/kernel/opencl/cl/conv2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -41,38 +41,46 @@ using mindspore::schema::PrimitiveType_FullConnection;
|
|||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
const size_t CI_TILE = C4NUM;
|
||||
const size_t CO_TILE = C4NUM;
|
||||
|
||||
int Conv2DOpenCLKernel::CheckSpecs() {
|
||||
if (in_tensors_.size() != 2 && in_tensors_.size() != 3) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 2 or 3 input Tensor but get " << in_tensors_.size();
|
||||
int inputs_num = in_tensors_.size();
|
||||
if (inputs_num != 2 && inputs_num != 3) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 2 or 3 input Tensor but get " << inputs_num;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (out_tensors_.size() != 1) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 1 output Tensor but get " << out_tensors_.size();
|
||||
int outputs_num = out_tensors_.size();
|
||||
if (outputs_num != 1) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 1 output Tensor but get " << outputs_num;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (in_tensors_.front()->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D input Tensor but get " << in_tensors_.front()->shape().size() << "D.";
|
||||
|
||||
int input_ndim = in_tensors_.at(kInputIndex)->shape().size();
|
||||
if (input_ndim != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D input Tensor but get " << input_ndim << "D.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (in_tensors_.at(1)->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D filter Tensor but get " << in_tensors_.at(1)->shape().size() << "D.";
|
||||
int output_ndim = out_tensors_.at(kOutputIndex)->shape().size();
|
||||
if (output_ndim != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << output_ndim << "D.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (out_tensors_.front()->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D.";
|
||||
|
||||
auto *filter_tensor = in_tensors_.at(kWeightIndex);
|
||||
int filter_ndim = filter_tensor->shape().size();
|
||||
if (filter_ndim != 4) {
|
||||
MS_LOG(ERROR) << "Conv2D only supports 4D filter Tensor but get " << filter_ndim << "D.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (!in_tensors_.at(1)->IsConst()) {
|
||||
if (!filter_tensor->IsConst()) {
|
||||
MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
|
||||
|
||||
auto *bias_tensor = in_tensors_.size() >= 3 ? in_tensors_.at(kBiasIndex) : nullptr;
|
||||
if (bias_tensor != nullptr && !bias_tensor->IsConst()) {
|
||||
MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
// for fusion: ActivationType_LEAKY_RELU ActivationType_TANH
|
||||
switch (static_cast<int>(param_->act_type_)) {
|
||||
case ActType_No:
|
||||
|
@ -90,9 +98,17 @@ int Conv2DOpenCLKernel::CheckSpecs() {
|
|||
}
|
||||
|
||||
int Conv2DOpenCLKernel::Prepare() {
|
||||
InitAttrs();
|
||||
BuildKernel();
|
||||
InitWeights();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::InitAttrs() {
|
||||
use_fp16_ = ocl_runtime_->GetFp16Enable();
|
||||
sizeof_FLT_ = use_fp16_ ? sizeof(float16_t) : sizeof(float);
|
||||
|
||||
auto input_shape = in_tensors_.front()->shape();
|
||||
auto output_shape = out_tensors_.front()->shape();
|
||||
batch_size_ = input_shape[0];
|
||||
|
@ -109,170 +125,167 @@ int Conv2DOpenCLKernel::Prepare() {
|
|||
OW_ = output_shape[2];
|
||||
CO_ = output_shape[3];
|
||||
}
|
||||
CI_SLICES_ = UP_DIV(CI_, C4NUM);
|
||||
CO_SLICES_ = UP_DIV(CO_, C4NUM);
|
||||
CI_SLICES_ = UP_DIV(CI_, CI_TILE);
|
||||
CO_SLICES_ = UP_DIV(CO_, CO_TILE);
|
||||
KH_ = param_->kernel_h_;
|
||||
KW_ = param_->kernel_w_;
|
||||
has_bias_ = in_tensors_.size() == 3;
|
||||
// note: TILE_HW_ is only used when use_winograd_=true
|
||||
TILE_HW_ = UP_DIV(OW_, 4) * UP_DIV(OH_, 4);
|
||||
}
|
||||
|
||||
// note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
|
||||
TILES_X_ = UP_DIV(OW_, 4);
|
||||
TILES_Y_ = UP_DIV(OH_, 4);
|
||||
TILES_XY_ = TILES_X_ * TILES_Y_;
|
||||
use_winograd_ = UseWinograd4x4To6x6();
|
||||
void Conv2DOpenCLKernel::BuildKernel() {
|
||||
SetBlockSize();
|
||||
std::string program_name = "conv2d";
|
||||
std::stringstream kernel_name;
|
||||
kernel_name << "Conv2D_H" << block_size_.H << "W" << block_size_.W << "C" << block_size_.C;
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
kernel_name << "_Img";
|
||||
}
|
||||
ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str());
|
||||
}
|
||||
|
||||
// build kernel
|
||||
if (use_winograd_) {
|
||||
MS_LOG(DEBUG) << "use winograd";
|
||||
std::string program_name = "winograd";
|
||||
ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source);
|
||||
ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution");
|
||||
ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
|
||||
void Conv2DOpenCLKernel::SetBlockSize() {
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
block_size_ = {2, 2, 2};
|
||||
return;
|
||||
}
|
||||
auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
|
||||
auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
|
||||
int block_size;
|
||||
if (task_size_per_cu <= 256) {
|
||||
block_size = 1;
|
||||
} else if (task_size_per_cu <= 256 * 4) {
|
||||
block_size = 2;
|
||||
} else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) {
|
||||
block_size = 4;
|
||||
} else {
|
||||
SetBlockSize();
|
||||
std::string program_name = "conv2d";
|
||||
std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" +
|
||||
std::to_string(block_size_.C);
|
||||
ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||
block_size = 8;
|
||||
}
|
||||
|
||||
// allocate winograd memory
|
||||
if (use_winograd_) {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
|
||||
size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT_;
|
||||
size_t width = TILES_XY_;
|
||||
size_t height = CI_SLICES_ * 36;
|
||||
winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});
|
||||
|
||||
size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT_;
|
||||
width = TILES_XY_;
|
||||
height = CO_SLICES_ * 36;
|
||||
winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
|
||||
bool w_kernel_is_1 =
|
||||
KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
|
||||
bool h_kernel_is_1 =
|
||||
KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
|
||||
if (!w_kernel_is_1 || !h_kernel_is_1) {
|
||||
block_size = std::min(block_size, 4);
|
||||
}
|
||||
|
||||
auto ret = InitWeights();
|
||||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
if (block_size == 8) {
|
||||
block_size_ = {2, 2, 2};
|
||||
} else if (block_size == 4) {
|
||||
block_size_ = {2, 1, 2};
|
||||
} else if (block_size == 2) {
|
||||
block_size_ = {2, 1, 1};
|
||||
} else {
|
||||
block_size_ = {1, 1, 1};
|
||||
}
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::InitWeights() {
|
||||
InitFilter();
|
||||
if (has_bias_) {
|
||||
InitBias();
|
||||
}
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::GenerateWinogradFilter() {
|
||||
const float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
|
||||
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
|
||||
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
|
||||
const float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
|
||||
1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
|
||||
1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
|
||||
|
||||
auto weight_tensor = in_tensors_.at(1);
|
||||
auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
|
||||
MS_ASSERT(origin_weight_fp32);
|
||||
auto origin_weight_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
|
||||
MS_ASSERT(origin_weight_fp16);
|
||||
std::function<float(int)> access_func;
|
||||
if (weight_tensor->data_type() == kNumberTypeFloat32) {
|
||||
access_func = [=](int idx) { return origin_weight_fp32[idx]; };
|
||||
} else {
|
||||
access_func = [=](int idx) { return static_cast<float>(origin_weight_fp16[idx]); };
|
||||
}
|
||||
|
||||
// OHWI -> O66I
|
||||
std::vector<float> encoded_weight(CO_ * 6 * 6 * CI_);
|
||||
for (int co = 0; co < CO_; ++co) {
|
||||
for (int ci = 0; ci < CI_; ++ci) {
|
||||
float in_vals[9];
|
||||
for (int kh = 0; kh < 3; ++kh) {
|
||||
for (int kw = 0; kw < 3; ++kw) {
|
||||
const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci;
|
||||
in_vals[kh * 3 + kw] = access_func(f_index);
|
||||
}
|
||||
}
|
||||
|
||||
auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
|
||||
auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
|
||||
for (int kh = 0; kh < 6; ++kh) {
|
||||
for (int kw = 0; kw < 6; ++kw) {
|
||||
const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci;
|
||||
encoded_weight[f_index] = out_vals[kh * 6 + kw];
|
||||
void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, FilterFormat src_format,
|
||||
FilterFormat dst_format, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup) {
|
||||
MS_ASSERT(src);
|
||||
MS_ASSERT(dst);
|
||||
MS_ASSERT(src_dtype == kNumberTypeFloat16 || src_dtype == kNumberTypeFloat32);
|
||||
MS_ASSERT(dst_dtype == kNumberTypeFloat16 || dst_dtype == kNumberTypeFloat32);
|
||||
MS_ASSERT(src_format == OHWI);
|
||||
MS_ASSERT(dst_format == HWII4OO4 || dst_format == OHWIOgroupI4O4);
|
||||
auto src_fp16 = reinterpret_cast<float16_t *>(src);
|
||||
auto src_fp32 = reinterpret_cast<float32_t *>(src);
|
||||
auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
|
||||
auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
|
||||
bool src_is_fp16 = src_dtype == kNumberTypeFloat16;
|
||||
bool dst_is_fp16 = dst_dtype == kNumberTypeFloat16;
|
||||
auto CI_SLICES = UP_DIV(CI, CI_TILE);
|
||||
auto CO_SLICES = UP_DIV(CO, CO_TILE);
|
||||
for (size_t co = 0, src_idx = 0; co < CO; ++co) {
|
||||
for (size_t kh = 0; kh < KH; ++kh) {
|
||||
for (size_t kw = 0; kw < KW; ++kw) {
|
||||
for (size_t ci = 0; ci < CI; ++ci, ++src_idx) {
|
||||
size_t dst_idx = 0;
|
||||
size_t co_inner = co % CO_TILE;
|
||||
size_t ci_slice = ci / CI_TILE;
|
||||
size_t ci_inner = ci % CI_TILE;
|
||||
if (dst_format == OHWIOgroupI4O4) {
|
||||
size_t co_slice = co / (CO_TILE * OGroup);
|
||||
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
|
||||
dst_idx =
|
||||
(((((co_slice * KH + kh) * KW + kw) * CI_SLICES + ci_slice) * OGroup + group_idx) * CI_TILE + ci_inner) *
|
||||
CO_TILE +
|
||||
co_inner;
|
||||
} else { // if(dst_format==HWII4OO4)
|
||||
size_t co_slice = co / CO_TILE;
|
||||
dst_idx =
|
||||
((((kh * KW + kw) * CI_SLICES + ci_slice) * CI_TILE + ci_inner) * CO_SLICES + co_slice) * CO_TILE +
|
||||
co_inner;
|
||||
}
|
||||
if (dst_is_fp16) {
|
||||
dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
|
||||
} else {
|
||||
dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (use_fp16_) {
|
||||
ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6,
|
||||
6, CI_, 2);
|
||||
} else {
|
||||
ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6,
|
||||
CI_, 2);
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::InitFilter() {
|
||||
void Conv2DOpenCLKernel::InitFilter() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
auto ret = DequantWeight();
|
||||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate memory
|
||||
size_t packed_weight_size;
|
||||
if (use_winograd_) {
|
||||
packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
// allocate opencl memory: buffer or image2d
|
||||
size_t size = 0;
|
||||
int Ogroup = block_size_.C;
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
size_t width = CO_SLICES_;
|
||||
size_t height = KH_ * KW_ * UP_ROUND(CI_, CI_TILE);
|
||||
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
size = width * height * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size, {width, height, dtype});
|
||||
} else {
|
||||
packed_weight_size = UP_ROUND(CO_SLICES_, block_size_.C) * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
size = UP_DIV(CO_SLICES_, Ogroup) * KH_ * KW_ * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size);
|
||||
}
|
||||
packed_weight_ = allocator->Malloc(packed_weight_size);
|
||||
allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
|
||||
memset(packed_weight_, 0x00, packed_weight_size);
|
||||
|
||||
// rearrange weight
|
||||
if (use_winograd_) {
|
||||
GenerateWinogradFilter();
|
||||
// rearrange filter
|
||||
auto filter_tensor = in_tensors_.at(1);
|
||||
void *src_data = filter_tensor->data_c();
|
||||
auto src_dtype = filter_tensor->data_type();
|
||||
auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
|
||||
std::vector<char> tmp(size, 0);
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, HWII4OO4, CO_, KH_, KW_, CI_);
|
||||
} else {
|
||||
auto weight_tensor = in_tensors_.at(1);
|
||||
if (weight_tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (use_fp16_) {
|
||||
ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
} else {
|
||||
ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
}
|
||||
} else if (weight_tensor->data_type() == kNumberTypeFloat32) {
|
||||
if (use_fp16_) {
|
||||
ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
} else {
|
||||
ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
}
|
||||
} else { // int8 or int16
|
||||
if (use_fp16_) {
|
||||
ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
} else {
|
||||
ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
|
||||
block_size_.C);
|
||||
}
|
||||
}
|
||||
ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, KH_, KW_, CI_, Ogroup);
|
||||
}
|
||||
|
||||
// unmap
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
|
||||
} else {
|
||||
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
|
||||
memcpy(packed_filter_, tmp.data(), size);
|
||||
allocator->UnmapBuffer(packed_filter_);
|
||||
}
|
||||
|
||||
allocator->UnmapBuffer(packed_weight_);
|
||||
FreeDequantedWeight();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::InitBias() {
|
||||
void Conv2DOpenCLKernel::InitBias() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
// align bias from C to C4
|
||||
|
@ -306,95 +319,64 @@ int Conv2DOpenCLKernel::InitBias() {
|
|||
}
|
||||
}
|
||||
allocator->UnmapBuffer(packed_bias_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::InitWeights() {
|
||||
InitFilter();
|
||||
if (has_bias_) {
|
||||
InitBias();
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void Conv2DOpenCLKernel::SetConstArgs() {
|
||||
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
|
||||
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
|
||||
cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_};
|
||||
cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_};
|
||||
cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_};
|
||||
|
||||
void Conv2DOpenCLKernel::SetBlockSize() {
|
||||
auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
|
||||
auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
|
||||
int block_size;
|
||||
if (task_size_per_cu <= 256) {
|
||||
block_size = 1;
|
||||
} else if (task_size_per_cu <= 256 * 4) {
|
||||
block_size = 2;
|
||||
} else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) {
|
||||
block_size = 4;
|
||||
} else {
|
||||
block_size = 8;
|
||||
}
|
||||
|
||||
bool w_kernel_is_1 =
|
||||
KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
|
||||
bool h_kernel_is_1 =
|
||||
KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
|
||||
if (!w_kernel_is_1 || !h_kernel_is_1) {
|
||||
block_size = std::min(block_size, 4);
|
||||
}
|
||||
|
||||
if (block_size == 8) {
|
||||
block_size_ = {2, 2, 2};
|
||||
} else if (block_size == 4) {
|
||||
block_size_ = {2, 1, 2};
|
||||
} else if (block_size == 2) {
|
||||
block_size_ = {2, 1, 1};
|
||||
} else {
|
||||
block_size_ = {1, 1, 1};
|
||||
}
|
||||
}
|
||||
|
||||
void AlignWinogradGlobalLocal(const std::vector<int> &global, const std::vector<int> &local, cl::NDRange *global_range,
|
||||
cl::NDRange *local_range) {
|
||||
*local_range = cl::NDRange(local[0], local[1], local[2]);
|
||||
*global_range =
|
||||
cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
|
||||
int arg_cn = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_);
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::SetGlobalLocal() {
|
||||
if (use_winograd_) {
|
||||
AlignWinogradGlobalLocal({TILES_XY_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
|
||||
AlignWinogradGlobalLocal({UP_DIV(TILES_XY_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_conv_, &local_conv_);
|
||||
AlignWinogradGlobalLocal({TILES_XY_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
|
||||
} else {
|
||||
size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H);
|
||||
size_t global_w = UP_DIV(OW_, block_size_.W);
|
||||
size_t global_c = UP_DIV(CO_SLICES_, block_size_.C);
|
||||
const int local_c_max = 16;
|
||||
const int local_hw_max = 256;
|
||||
const int OH_threshold = 100;
|
||||
const int OW_threshold = 100;
|
||||
const int OC_threshold = 64;
|
||||
size_t local_c = GetMaxDivisor(global_c, local_c_max);
|
||||
local_c = std::max<size_t>(local_c, 1);
|
||||
size_t local_hw = local_hw_max / local_c;
|
||||
size_t local_h;
|
||||
size_t local_w;
|
||||
if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h
|
||||
local_w = std::min(global_w, local_hw);
|
||||
local_h = std::min(local_hw / local_w, global_h);
|
||||
} else { // c -> h -> w
|
||||
local_h = std::min(global_h, local_hw);
|
||||
local_w = std::min(local_hw / local_h, global_w);
|
||||
}
|
||||
global_size_ = {global_h, global_w, global_c};
|
||||
local_size_ = {local_h, local_w, local_c};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H);
|
||||
size_t global_w = UP_DIV(OW_, block_size_.W);
|
||||
size_t global_c = UP_DIV(CO_SLICES_, block_size_.C);
|
||||
int local_max = filter_type_ == MemType::IMG ? 64 : 128;
|
||||
const int local_c_max = 16;
|
||||
const int OH_threshold = 100;
|
||||
const int OW_threshold = 100;
|
||||
const int OC_threshold = 64;
|
||||
size_t local_c = GetMaxDivisor(global_c, local_c_max);
|
||||
local_c = std::max<size_t>(local_c, 1);
|
||||
size_t local_hw = local_max / local_c;
|
||||
size_t local_h;
|
||||
size_t local_w;
|
||||
if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h
|
||||
local_w = std::min(global_w, local_hw);
|
||||
local_h = std::min(local_hw / local_w, global_h);
|
||||
} else { // c -> h -> w
|
||||
local_h = std::min(global_h, local_hw);
|
||||
local_w = std::min(local_hw / local_h, global_w);
|
||||
}
|
||||
global_size_ = {global_h, global_w, global_c};
|
||||
local_size_ = {local_h, local_w, local_c};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
|
||||
// don't need to tune local_c
|
||||
std::vector<BaseTuningParameter> tuning_params = {};
|
||||
if (use_winograd_) {
|
||||
return tuning_params;
|
||||
}
|
||||
BaseTuningParameter default_tuning_param = BaseTuningParameter();
|
||||
default_tuning_param.local_size = local_size_;
|
||||
tuning_params.push_back(default_tuning_param);
|
||||
|
@ -420,85 +402,6 @@ std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
|
|||
return tuning_params;
|
||||
}
|
||||
|
||||
std::string Conv2DOpenCLKernel::Key() {
|
||||
auto key = OpenCLKernel::Key();
|
||||
key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
|
||||
std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" +
|
||||
std::to_string(param_->dilation_w_);
|
||||
return key;
|
||||
}
|
||||
|
||||
void Conv2DOpenCLKernel::SetConstArgs() {
|
||||
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
|
||||
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
|
||||
|
||||
int arg_cn;
|
||||
if (use_winograd_) {
|
||||
arg_cn = 1;
|
||||
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, _4x4to36_out_shape);
|
||||
|
||||
arg_cn = 0;
|
||||
cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
|
||||
cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape);
|
||||
|
||||
arg_cn = 2;
|
||||
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast<cl_int>(param_->act_type_));
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast<cl_float>(alpha_));
|
||||
} else {
|
||||
arg_cn = 2;
|
||||
cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_};
|
||||
cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_};
|
||||
cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast<cl_int>(param_->act_type_));
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(alpha_));
|
||||
}
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::Tune() {
|
||||
if (use_winograd_) {
|
||||
return RET_OK;
|
||||
}
|
||||
return OpenCLKernel::Tune();
|
||||
}
|
||||
|
||||
int Conv2DOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
if (use_winograd_) {
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_);
|
||||
|
||||
ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_);
|
||||
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_);
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
ConvParameter *param) {
|
||||
MS_ASSERT(param);
|
||||
|
@ -528,6 +431,51 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param) {
|
|||
return reinterpret_cast<OpParameter *>(fc_param);
|
||||
}
|
||||
|
||||
bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs) {
|
||||
// not use winograd on adreno gpu
|
||||
lite::opencl::OpenCLRuntimeWrapper runtime_wrap;
|
||||
lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance();
|
||||
if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) {
|
||||
return false;
|
||||
}
|
||||
auto input_shape = inputs.front()->shape();
|
||||
auto output_shape = outputs.front()->shape();
|
||||
if (input_shape.size() != 4 || (output_shape.size() != 2 && output_shape.size() != 4)) {
|
||||
return false;
|
||||
}
|
||||
int batch_size = input_shape[0];
|
||||
int IH = input_shape[1];
|
||||
int IW = input_shape[2];
|
||||
int CI = input_shape[3];
|
||||
int OH = output_shape.size() == 2 ? 1 : output_shape[1];
|
||||
int OW = output_shape.size() == 2 ? 1 : output_shape[2];
|
||||
int CO = output_shape.size() == 2 ? output_shape[1] : output_shape[3];
|
||||
int CI_SLICES = UP_DIV(CI, CI_TILE);
|
||||
int CO_SLICES = UP_DIV(CO, CO_TILE);
|
||||
int TILE_HW_ = UP_DIV(OH, 4) * UP_DIV(OW, 4);
|
||||
|
||||
bool pad_is_all_0 = param->pad_u_ == 0 && param->pad_d_ == 0 && param->pad_l_ == 0 && param->pad_r_ == 0;
|
||||
bool pad_is_all_1 = param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 && param->pad_r_ == 1;
|
||||
bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && param->stride_w_ == 1 &&
|
||||
param->dilation_h_ == 1 && param->dilation_w_ == 1 && (pad_is_all_0 || pad_is_all_1);
|
||||
|
||||
bool shape_valid = false;
|
||||
if (pad_is_all_1) {
|
||||
shape_valid = batch_size == 1 && IH == OH && IW == OW;
|
||||
} else if (pad_is_all_0) {
|
||||
shape_valid = batch_size == 1 && IH - 2 == OH && IW - 2 == OW;
|
||||
}
|
||||
|
||||
bool channel_good = CI_SLICES >= 8 && CO_SLICES >= 8;
|
||||
bool hw_good = TILE_HW_ >= 16;
|
||||
return attr_valid && shape_valid && channel_good && hw_good;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
|
@ -549,7 +497,12 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::Tenso
|
|||
MS_LOG(INFO) << "use FullConnection to replace Convolution.";
|
||||
}
|
||||
} else {
|
||||
kernel = new (std::nothrow) Conv2DOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
|
||||
if (UseWinograd4x4To6x6(conv_param, inputs, outputs)) {
|
||||
MS_LOG(DEBUG) << "use Winograd algorithm.";
|
||||
kernel = new (std::nothrow) WinogradOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
|
||||
} else {
|
||||
kernel = new (std::nothrow) Conv2DOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
|
||||
}
|
||||
real_param = reinterpret_cast<OpParameter *>(conv_param);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "Create Convolution kernel failed.";
|
||||
|
|
|
@ -27,53 +27,57 @@
|
|||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
using lite::opencl::MemType;
|
||||
|
||||
constexpr size_t CI_TILE = C4NUM;
|
||||
constexpr size_t CO_TILE = C4NUM;
|
||||
|
||||
enum FilterFormat {
|
||||
OHWI, // CO KH KW CI
|
||||
HWII4OO4, // KH KW CI/CI_TILE CI_TILE CO/CO_TILE CO_TILE
|
||||
OHWIOgroupI4O4, // CO/Ogroup/CO_TILE KH KW CI/CI_TILE Ogroup CI_TILE CO_TILE
|
||||
};
|
||||
|
||||
void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, FilterFormat src_format,
|
||||
FilterFormat dst_format, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1);
|
||||
|
||||
class Conv2DOpenCLKernel : public OpenCLKernel {
|
||||
public:
|
||||
Conv2DOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs)
|
||||
: OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
|
||||
: OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {
|
||||
bool is_adreno = ocl_runtime_->GetGpuInfo().type == lite::opencl::GpuType::ADRENO;
|
||||
filter_type_ = is_adreno ? MemType::IMG : MemType::BUF;
|
||||
}
|
||||
~Conv2DOpenCLKernel() override = default;
|
||||
|
||||
int CheckSpecs() override;
|
||||
|
||||
int Prepare() override;
|
||||
void SetGlobalLocal() override;
|
||||
int InitWeights() override;
|
||||
void SetConstArgs() override;
|
||||
|
||||
void SetGlobalLocal() override;
|
||||
int Run() override;
|
||||
int Tune() override;
|
||||
|
||||
std::string Key() override {
|
||||
auto key = OpenCLKernel::Key();
|
||||
key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
|
||||
std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" +
|
||||
std::to_string(param_->dilation_w_);
|
||||
return key;
|
||||
}
|
||||
std::vector<BaseTuningParameter> GenerateTuningParam() override;
|
||||
std::string Key() override;
|
||||
int Tune() override { return OpenCLKernel::Tune(); }
|
||||
|
||||
// for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU
|
||||
float alpha_{0.0f};
|
||||
|
||||
private:
|
||||
void SetBlockSize();
|
||||
int InitFilter();
|
||||
int InitBias();
|
||||
int GenerateWinogradFilter();
|
||||
|
||||
bool UseWinograd4x4To6x6() {
|
||||
const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
|
||||
param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 &&
|
||||
param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 &&
|
||||
param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1;
|
||||
const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
|
||||
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
|
||||
return attr_valid && channel_good && hw_good;
|
||||
}
|
||||
|
||||
cl::Kernel kernel_4x4to36_;
|
||||
cl::Kernel kernel_36to4x4_;
|
||||
cl::NDRange global_4x4to36_, local_4x4to36_;
|
||||
cl::NDRange global_conv_, local_conv_;
|
||||
cl::NDRange global_36to4x4_, local_36to4x4_;
|
||||
|
||||
protected:
|
||||
void InitAttrs();
|
||||
virtual void BuildKernel();
|
||||
virtual void InitFilter();
|
||||
void InitBias();
|
||||
bool use_fp16_{false};
|
||||
size_t sizeof_FLT_{4};
|
||||
|
||||
ConvParameter *param_{nullptr};
|
||||
int batch_size_{};
|
||||
int CI_{};
|
||||
|
@ -86,23 +90,21 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
|
|||
int CO_SLICES_{};
|
||||
int KH_{};
|
||||
int KW_{};
|
||||
void *packed_weight_{nullptr};
|
||||
void *packed_filter_{nullptr};
|
||||
void *packed_bias_{nullptr};
|
||||
MemType filter_type_{MemType::BUF};
|
||||
bool has_bias_{false};
|
||||
int TILE_HW_{};
|
||||
|
||||
bool use_winograd_{false};
|
||||
int TILES_X_{};
|
||||
int TILES_Y_{};
|
||||
int TILES_XY_{};
|
||||
void *winograd_mem0_{nullptr};
|
||||
void *winograd_mem1_{nullptr};
|
||||
|
||||
private:
|
||||
void SetBlockSize();
|
||||
struct {
|
||||
int H{1};
|
||||
int W{1};
|
||||
int C{1};
|
||||
} block_size_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
|
||||
|
|
|
@ -14,3 +14,420 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "src/runtime/kernel/opencl/kernel/fusion_eltwise.h"
|
||||
#include <algorithm>
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "nnacl/fp32/activation_fp32.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
static std::set<EltwiseOperator> SupportedOperators = {
|
||||
// Arithmetic Primitive
|
||||
Operator_Mul,
|
||||
Operator_Add,
|
||||
Operator_Sub,
|
||||
Operator_Div,
|
||||
// ArithmeticSelf Primitive
|
||||
Operator_Neg,
|
||||
// Other Primitive
|
||||
Operator_Scale,
|
||||
// Activation
|
||||
Operator_Act_NO_ACTIVATION,
|
||||
Operator_Act_RELU,
|
||||
Operator_Act_SIGMOID,
|
||||
Operator_Act_RELU6,
|
||||
Operator_Act_RELU1,
|
||||
Operator_Act_TANH,
|
||||
};
|
||||
|
||||
// Decides whether `node` can take part in eltwise fusion and, when
// `create_param` is true, builds its FusionEltwiseParameter tree.
//
// @param node         kernel to inspect (must not be null).
// @param create_param when true, allocate and return the parameter tree;
//                     ownership passes to the caller.
// @param replace_map  maps input tensors produced by already-fused
//                     predecessors to their parameter sub-trees.
// @return {supported, param}; param is nullptr unless creation was requested
//         and succeeded.
std::pair<bool, FusionEltwiseParameter *> CheckSupportOrCreateParam(
  LiteKernel *node, bool create_param = false,
  const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {}) {
  MS_ASSERT(node);
  // NOTE: the original code also asserted `param` here, before the variable
  // below even existed — removed as a copy-paste artifact.
  PrimitiveType node_type = node->Type();
  auto operator_ = static_cast<const EltwiseOperator>(node_type);
  auto *op_parameter = reinterpret_cast<OpenCLKernel *>(node)->GetParameter();
  bool support = false;
  FusionEltwiseParameter *param = nullptr;

  if (node_type == PrimitiveType_FusionEltwise) {
    // Already-fused node: reuse (and take ownership of) its parameter tree.
    support = true;
    if (create_param) {
      auto *eltwise = reinterpret_cast<FusionEltwiseOpenCLKernel *>(node);
      param = reinterpret_cast<FusionEltwiseParameter *>(eltwise->GetParameter());
      eltwise->ClearParameter();
    }
  } else if (IsArithmetic(node_type)) {
    // Binary arithmetic: both the operator and its fused activation (if any)
    // must be supported.
    auto act_type =
      static_cast<ActivationType>(reinterpret_cast<ArithmeticParameter *>(op_parameter)->activation_type_);
    EltwiseOperator act_operator = Activation2Operator(act_type);
    support =
      node->in_tensors().size() == 2 && SupportedOperators.count(operator_) && SupportedOperators.count(act_operator);
    if (create_param) {
      param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map);
      MS_ASSERT(param);
      if (act_operator != Operator_Act_NO_ACTIVATION) {
        // Wrap the arithmetic node in an activation node; the fake tensor
        // pointer is only used as a map key for the replacement.
        std::string act_name = schema::EnumNameActivationType(act_type);
        auto *fake_tensor = reinterpret_cast<lite::Tensor *>(param);
        param =
          new (std::nothrow) FusionEltwiseParameter(act_operator, act_name, {fake_tensor}, {{fake_tensor, param}});
        MS_ASSERT(param);
      }
    }
  } else if (IsArithmeticSelf(node_type)) {
    support = node->in_tensors().size() == 1 && SupportedOperators.count(operator_);
    if (create_param) {
      param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map);
      MS_ASSERT(param);
    }
  } else if (node_type == schema::PrimitiveType_Scale) {
    // Scale expects exactly (input, scale, offset).
    support = node->in_tensors().size() == 3 && SupportedOperators.count(operator_);
    if (create_param) {
      param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map);
      MS_ASSERT(param);
    }
  } else if (node_type == schema::PrimitiveType_Activation) {
    auto act_type = static_cast<ActivationType>(reinterpret_cast<ActivationParameter *>(op_parameter)->type_);
    EltwiseOperator act_operator = Activation2Operator(act_type);
    support = node->in_tensors().size() == 1 && SupportedOperators.count(act_operator);
    if (create_param) {
      param = new (std::nothrow) FusionEltwiseParameter(act_operator, node->name(), node->in_tensors(), replace_map);
      MS_ASSERT(param);
    }
  }
  return {support, param};
}
// True when `node` is a primitive the eltwise-fusion pass can absorb.
bool IsOperatorSupported(LiteKernel *node) {
  auto result = CheckSupportOrCreateParam(node);
  return result.first;
}
// Builds the parameter tree describing `node`'s fused eltwise expression.
// Ownership of the returned tree passes to the caller (may be nullptr).
FusionEltwiseParameter *CreateFusionEltwiseParameter(
  LiteKernel *node, const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map) {
  auto result = CheckSupportOrCreateParam(node, true, replace_map);
  return result.second;
}
// A node is fusable when its operator is supported, it produces exactly one
// float16/float32 output, and each input is float typed and is either a
// scalar, a per-channel vector (shape {C} or {1,1,1,C}), or shaped exactly
// like the output.
bool IsEltwiseAndOperatorSupported(LiteKernel *node) {
  MS_ASSERT(node);
  if (!IsOperatorSupported(node) || node->out_tensors().size() != 1) {
    return false;
  }
  auto *output_tensor = node->out_tensors().front();
  MS_ASSERT(output_tensor);
  if (output_tensor->data_type() != kNumberTypeFloat16 && output_tensor->data_type() != kNumberTypeFloat32) {
    return false;
  }
  auto output_info = GpuTensorInfo(output_tensor);
  auto output_shape = output_tensor->shape();
  for (auto *in_tensor : node->in_tensors()) {
    MS_ASSERT(in_tensor);
    if (in_tensor->data_type() != kNumberTypeFloat16 && in_tensor->data_type() != kNumberTypeFloat32) {
      return false;
    }
    auto shape = in_tensor->shape();
    bool scalar_in = shape.empty() || (shape.size() == 1 && shape.front() == 1);
    bool channel_vec = shape.size() == 1 && shape.front() == output_info.C;
    bool shape_111C = shape.size() == 4 && shape[0] == 1 && shape[1] == 1 && shape[2] == 1 && shape[3] == output_info.C;
    bool full_shape = shape == output_shape;
    if (!(scalar_in || channel_vec || shape_111C || full_shape)) {
      return false;
    }
  }
  return true;
}
int FusionEltwiseOpenCLKernel::Prepare() {
|
||||
static std::set<std::string> code_map;
|
||||
std::string source = Codegen();
|
||||
code_map.insert(source);
|
||||
// std::cout << name() << "\n" << source;
|
||||
|
||||
std::string program_name = "FusionEltwise" + std::to_string(code_map.size());
|
||||
std::string kernel_name = "FusionEltwise";
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||
|
||||
InitWeights();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
template <typename DstT, typename SrcT>
|
||||
void CopyNumber(void *dst, void *src, size_t n) {
|
||||
MS_ASSERT(dst);
|
||||
MS_ASSERT(src);
|
||||
if (sizeof(DstT) == sizeof(SrcT)) {
|
||||
memcpy(dst, src, n * sizeof(DstT));
|
||||
} else {
|
||||
auto *dst_ = static_cast<DstT *>(dst);
|
||||
auto *src_ = static_cast<SrcT *>(src);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
dst_[i] = static_cast<DstT>(src_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int FusionEltwiseOpenCLKernel::InitWeights() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
bool use_fp16 = ocl_runtime_->GetFp16Enable();
|
||||
for (auto *tensor : in_tensors_) {
|
||||
MS_ASSERT(tensor);
|
||||
if (tensor->IsConst()) {
|
||||
if (IsScalar(tensor->shape())) {
|
||||
float value = (tensor->data_type() == kNumberTypeFloat16) ? *(reinterpret_cast<float16_t *>(tensor->data_c()))
|
||||
: *(reinterpret_cast<float32_t *>(tensor->data_c()));
|
||||
// std::cout << "value=" << value << std::endl;
|
||||
scalar_weights_.push_back(value);
|
||||
} else {
|
||||
auto tensor_info = GpuTensorInfo(tensor);
|
||||
size_t num = tensor_info.ElementsNum;
|
||||
size_t size = tensor_info.Image2DSize;
|
||||
void *buffer = allocator->Malloc(size);
|
||||
allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true);
|
||||
memset(buffer, 0x00, size);
|
||||
if (tensor->data_type() == kNumberTypeFloat16) {
|
||||
if (use_fp16) {
|
||||
CopyNumber<float16_t, float16_t>(buffer, tensor->data_c(), num);
|
||||
} else {
|
||||
CopyNumber<float32_t, float16_t>(buffer, tensor->data_c(), num);
|
||||
}
|
||||
} else {
|
||||
if (use_fp16) {
|
||||
CopyNumber<float16_t, float32_t>(buffer, tensor->data_c(), num);
|
||||
} else {
|
||||
CopyNumber<float32_t, float32_t>(buffer, tensor->data_c(), num);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(buffer);
|
||||
buffer_weights_.push_back(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void FusionEltwiseOpenCLKernel::SetGlobalLocal() {
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
global_size_ = {output.N * output.H, output.W, output.Slice};
|
||||
local_size_ = {};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
}
|
||||
|
||||
void FusionEltwiseOpenCLKernel::SetConstArgs() {
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
cl_int4 output_shape = {static_cast<cl_int>(output.N), static_cast<cl_int>(output.H), static_cast<cl_int>(output.W),
|
||||
static_cast<cl_int>(output.C)};
|
||||
int arg_idx = 0;
|
||||
int scalar_idx = 0;
|
||||
int buffer_idx = 0;
|
||||
for (auto *in_tensor : in_tensors_) {
|
||||
MS_ASSERT(in_tensor);
|
||||
if (in_tensor->IsConst()) {
|
||||
if (IsScalar(in_tensor->shape())) {
|
||||
if (ocl_runtime_->GetFp16Enable()) {
|
||||
auto value = static_cast<float16_t>(scalar_weights_[scalar_idx++]);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast<cl_half *>(&value)));
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]);
|
||||
}
|
||||
} else {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF);
|
||||
}
|
||||
}
|
||||
arg_idx++; // for act input
|
||||
}
|
||||
arg_idx++; // for output
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape);
|
||||
}
|
||||
|
||||
int FusionEltwiseOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
int arg_idx = 0;
|
||||
for (auto *in_tensor : in_tensors_) {
|
||||
if (!in_tensor->IsConst()) {
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c());
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::string FusionEltwiseOpenCLKernel::Codegen() {
|
||||
std::stringstream code;
|
||||
code << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
||||
"__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
|
||||
"__kernel void FusionEltwise(";
|
||||
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
MS_ASSERT(in_tensors_[i]);
|
||||
if (in_tensors_[i]->IsConst()) {
|
||||
if (IsScalar(in_tensors_[i]->shape())) {
|
||||
code << "FLT in" << i << ", ";
|
||||
} else {
|
||||
code << "__global FLT4 *input" << i << ", ";
|
||||
}
|
||||
} else {
|
||||
code << "__read_only image2d_t input" << i << ", ";
|
||||
}
|
||||
}
|
||||
|
||||
code << "__write_only image2d_t output, int4 output_shape) {\n"
|
||||
" int N = output_shape.x, H = output_shape.y, W = output_shape.z, C = output_shape.w;\n"
|
||||
" int SLICES = (C + 3) / 4;\n"
|
||||
" int nh = get_global_id(0);\n"
|
||||
" int w = get_global_id(1);\n"
|
||||
" int slice = get_global_id(2);\n"
|
||||
" int n = nh / H;\n"
|
||||
" int h = nh % H;\n"
|
||||
" if (n >= N || h >= H || w >= W || slice >= SLICES) {\n"
|
||||
" return;\n"
|
||||
" }\n";
|
||||
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
auto *tensor = in_tensors_[i];
|
||||
MS_ASSERT(tensor);
|
||||
auto shape = in_tensors_[i]->shape();
|
||||
bool is_scalar = IsScalar(shape);
|
||||
bool is_vector = shape.size() == 1 && shape.front() == output.C;
|
||||
bool _111C = shape.size() == 4 && shape[0] == 1 && shape[1] == 1 && shape[2] == 1 && shape[3] == output.C;
|
||||
if (tensor->IsConst()) {
|
||||
if (!is_scalar) {
|
||||
code << " FLT4 in" << i << " = input" << i << "[";
|
||||
if (is_vector || _111C) {
|
||||
code << "slice";
|
||||
} else {
|
||||
code << "(nh * W + w) * SLICES + slice";
|
||||
}
|
||||
code << "];\n";
|
||||
}
|
||||
} else {
|
||||
code << " FLT4 in" << i << " = READ_IMAGE(input" << i << ", smp_zero, (int2)(";
|
||||
if (is_scalar) {
|
||||
code << "0, 0";
|
||||
} else if (is_vector || _111C) {
|
||||
code << "slice, 0";
|
||||
} else {
|
||||
code << "w * SLICES + slice, nh";
|
||||
}
|
||||
code << "));\n";
|
||||
}
|
||||
}
|
||||
code << "\n";
|
||||
MS_LOG(DEBUG) << "\n" << reinterpret_cast<FusionEltwiseParameter *>(op_parameter_)->name_ << ":";
|
||||
code << CodegenCore(reinterpret_cast<FusionEltwiseParameter *>(op_parameter_));
|
||||
code << "\n WRITE_IMAGE(output, (int2)(w * SLICES + slice, nh), out);\n"
|
||||
"}\n\n";
|
||||
return code.str();
|
||||
}
|
||||
|
||||
// Recursively emits OpenCL statements computing the expression tree `param`
// into a variable named `out_name`. Leaf inputs become "in<idx>"; internal
// nodes get fresh variable names and are emitted first. `degree` only drives
// indentation of log/CL output.
std::string FusionEltwiseOpenCLKernel::CodegenCore(FusionEltwiseParameter *param, const std::string &out_name,
                                                   int degree) {
  std::stringstream code;
  std::string log_prefix(degree * 2, ' ');
  std::string cl_prefix((degree + 1) * 2, ' ');

  std::vector<std::string> input_names;
  MS_ASSERT(param);
  for (const auto &input : param->inputs_) {
    if (input.is_leaf_) {
      // Leaf node: value_ actually stores a lite::Tensor* (see Node_ docs).
      input_names.push_back("in" + std::to_string(GetTensorIdx(reinterpret_cast<lite::Tensor *>(input.value_))));
      MS_LOG(DEBUG) << log_prefix << degree << " Tensor=" << input.value_;
    } else {
      std::string var = GetFormatVarName(input.name_);
      input_names.push_back(var);
      MS_LOG(DEBUG) << log_prefix << degree << " Parameter(degree=" << degree << ")";
      code << CodegenCore(input.value_, var, degree + 1);
    }
  }
  const std::string &var0 = input_names.at(0);

  static std::map<EltwiseOperator, char> simple_symbols = {
    {Operator_Add, '+'},
    {Operator_Sub, '-'},
    {Operator_Mul, '*'},
    {Operator_Div, '/'},
  };
  if (simple_symbols.count(param->operator_)) {
    const std::string &var1 = input_names.at(1);
    code << cl_prefix << "FLT4 " << out_name << " = " << var0 << " " << simple_symbols[param->operator_] << " " << var1
         << ";\n";
  } else if (param->operator_ == Operator_Neg) {
    code << cl_prefix << "FLT4 " << out_name << " = -" << var0 << ";\n";
  } else if (param->operator_ == Operator_Scale) {
    const std::string &var1 = input_names.at(1);
    const std::string &var2 = input_names.at(2);
    code << cl_prefix << "FLT4 " << out_name << " = " << var0 << " * " << var1 << " + " << var2 << ";\n";
  } else {
    if (param->operator_ == Operator_Act_NO_ACTIVATION) {
      code << cl_prefix << "FLT4 " << out_name << " = " << var0 << ";\n";
    } else if (param->operator_ == Operator_Act_RELU) {
      code << cl_prefix << "FLT4 " << out_name << " = max(" << var0 << ", (FLT4)(0.0f));\n";
    } else if (param->operator_ == Operator_Act_SIGMOID) {
      code << cl_prefix << "FLT4 " << out_name << " = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-" << var0 << "));\n";
    } else if (param->operator_ == Operator_Act_RELU6) {
      code << cl_prefix << "FLT4 " << out_name << " = clamp(" << var0 << ", (FLT4)(0.0f), (FLT4)(6.0f));\n";
    } else if (param->operator_ == Operator_Act_LEAKY_RELU) {
      // Not implemented: no code is generated for LEAKY_RELU. Currently
      // unreachable because it is absent from SupportedOperators; if it is
      // ever added there, codegen support (alpha handling) must come first.
    } else if (param->operator_ == Operator_Act_RELU1) {
      code << cl_prefix << "FLT4 " << out_name << " = clamp(" << var0 << ", (FLT4)(0.0f), (FLT4)(1.0f));\n";
    } else if (param->operator_ == Operator_Act_TANH) {
      // tanh(x) = (e^x - e^-x) / (e^x + e^-x)
      std::string exp0 = GetFormatVarName();
      std::string exp1 = GetFormatVarName();
      code << cl_prefix << "FLT4 " << exp0 << " = exp(" + var0 + ");\n";
      code << cl_prefix << "FLT4 " << exp1 << " = exp(-" + var0 + ");\n";
      // Fixed: the original emitted a trailing extra ')' ("...));"), which
      // made the generated OpenCL program fail to compile.
      code << cl_prefix << "FLT4 " << out_name << " = (" << exp0 << " - " << exp1 << ") / (" << exp0 << " + " << exp1
           << ");\n";
    }
  }

  return code.str();
}
// Returns a valid, unique OpenCL identifier for `name`, registering it in
// var_names_. An empty name yields a generated "_var_<k>"; otherwise the name
// is sanitized (leading character made legal, non-alphanumerics replaced by
// '_'). A name already registered is returned as-is.
std::string FusionEltwiseOpenCLKernel::GetFormatVarName(std::string name) {
  if (var_names_.count(name)) {
    return name;
  }
  if (name.empty()) {
    name = "_var_" + std::to_string(var_names_.size());
  } else {
    char c = name.front();
    // Cast to unsigned char: passing a negative char to std::isalpha /
    // std::isalnum is undefined behavior.
    if (c != '_' && !std::isalpha(static_cast<unsigned char>(c))) {
      name = '_' + name;
    }
    std::replace_if(
      name.begin(), name.end(), [](char ch) { return !std::isalnum(static_cast<unsigned char>(ch)); }, '_');
  }
  var_names_.insert(name);
  return name;
}
// Maps `in_tensor` to its position in in_tensors_. If the tensor is hidden
// behind a ToFormat predecessor, the index of that predecessor's output
// tensor is returned instead. Falls back to 0 when no match is found.
int FusionEltwiseOpenCLKernel::GetTensorIdx(lite::Tensor *in_tensor) {
  MS_ASSERT(in_tensor);
  auto pos = std::find(in_tensors_.begin(), in_tensors_.end(), in_tensor);
  if (pos != in_tensors_.end()) {
    return pos - in_tensors_.begin();
  }
  for (const auto &in_kernel : in_kernels_) {
    MS_ASSERT(in_kernel);
    MS_ASSERT(in_kernel->in_tensors().size());
    MS_ASSERT(in_kernel->out_tensors().size());
    bool is_to_format = in_kernel->Type() == schema::PrimitiveType_ToFormat;
    if (is_to_format && in_tensor == in_kernel->in_tensors().front()) {
      return std::find(in_tensors_.begin(), in_tensors_.end(), in_kernel->out_tensors().front()) -
             in_tensors_.begin();
    }
  }
  return 0;
}
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -17,4 +17,176 @@
|
|||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/to_format.h"
|
||||
#include "schema/ops_generated.h"
|
||||
|
||||
using mindspore::schema::ActivationType;
|
||||
using mindspore::schema::PrimitiveType;
|
||||
using mindspore::schema::PrimitiveType_MAX;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
constexpr schema::PrimitiveType PrimitiveType_FusionEltwise = static_cast<schema::PrimitiveType>(-100);
|
||||
|
||||
enum EltwiseOperator {
|
||||
// Arithmetic Primitive
|
||||
Operator_Mul = PrimitiveType_Mul,
|
||||
Operator_Add = PrimitiveType_Add,
|
||||
Operator_Sub = PrimitiveType_Sub,
|
||||
Operator_Div = PrimitiveType_Div,
|
||||
Operator_LogicalAnd = PrimitiveType_LogicalAnd,
|
||||
Operator_LogicalOr = PrimitiveType_LogicalOr,
|
||||
Operator_Maximum = PrimitiveType_Maximum,
|
||||
Operator_Minimum = PrimitiveType_Minimum,
|
||||
Operator_FloorDiv = PrimitiveType_FloorDiv,
|
||||
Operator_FloorMod = PrimitiveType_FloorMod,
|
||||
Operator_SquaredDifference = PrimitiveType_SquaredDifference,
|
||||
Operator_Equal = PrimitiveType_Equal,
|
||||
Operator_NotEqual = PrimitiveType_NotEqual,
|
||||
Operator_Less = PrimitiveType_Less,
|
||||
Operator_LessEqual = PrimitiveType_LessEqual,
|
||||
Operator_Greater = PrimitiveType_Greater,
|
||||
Operator_GreaterEqual = PrimitiveType_GreaterEqual,
|
||||
Operator_Eltwise = PrimitiveType_Eltwise,
|
||||
|
||||
// ArithmeticSelf Primitive
|
||||
Operator_Abs = PrimitiveType_Abs,
|
||||
Operator_Ceil = PrimitiveType_Ceil,
|
||||
Operator_Cos = PrimitiveType_Cos,
|
||||
Operator_Exp = PrimitiveType_Exp,
|
||||
Operator_Floor = PrimitiveType_Floor,
|
||||
Operator_Log = PrimitiveType_Log,
|
||||
Operator_LogicalNot = PrimitiveType_LogicalNot,
|
||||
Operator_Round = PrimitiveType_Round,
|
||||
Operator_Rsqrt = PrimitiveType_Rsqrt,
|
||||
Operator_Sin = PrimitiveType_Sin,
|
||||
Operator_Neg = PrimitiveType_Neg,
|
||||
Operator_Sqrt = PrimitiveType_Sqrt,
|
||||
Operator_Square = PrimitiveType_Square,
|
||||
|
||||
// Other Primitive
|
||||
Operator_Scale = schema::PrimitiveType_Scale,
|
||||
|
||||
// Activation
|
||||
Operator_Act_NO_ACTIVATION = schema::ActivationType_NO_ACTIVATION + PrimitiveType_MAX,
|
||||
Operator_Act_RELU = schema::ActivationType_RELU + PrimitiveType_MAX,
|
||||
Operator_Act_SIGMOID = schema::ActivationType_SIGMOID + PrimitiveType_MAX,
|
||||
Operator_Act_RELU6 = schema::ActivationType_RELU6 + PrimitiveType_MAX,
|
||||
Operator_Act_ELU = schema::ActivationType_ELU + PrimitiveType_MAX,
|
||||
Operator_Act_LEAKY_RELU = schema::ActivationType_LEAKY_RELU + PrimitiveType_MAX,
|
||||
Operator_Act_ABS = schema::ActivationType_ABS + PrimitiveType_MAX,
|
||||
Operator_Act_RELU1 = schema::ActivationType_RELU1 + PrimitiveType_MAX,
|
||||
Operator_Act_SOFTSIGN = schema::ActivationType_SOFTSIGN + PrimitiveType_MAX,
|
||||
Operator_Act_SOFTPLUS = schema::ActivationType_SOFTPLUS + PrimitiveType_MAX,
|
||||
Operator_Act_TANH = schema::ActivationType_TANH + PrimitiveType_MAX,
|
||||
Operator_Act_SELU = schema::ActivationType_SELU + PrimitiveType_MAX,
|
||||
Operator_Act_HSWISH = schema::ActivationType_HSWISH + PrimitiveType_MAX,
|
||||
Operator_Act_HSIGMOID = schema::ActivationType_HSIGMOID + PrimitiveType_MAX,
|
||||
Operator_Act_THRESHOLDRELU = schema::ActivationType_THRESHOLDRELU + PrimitiveType_MAX,
|
||||
Operator_Act_LINEAR = schema::ActivationType_LINEAR + PrimitiveType_MAX,
|
||||
Operator_Act_HARD_TANH = schema::ActivationType_HARD_TANH + PrimitiveType_MAX,
|
||||
Operator_Act_SIGN = schema::ActivationType_SIGN + PrimitiveType_MAX,
|
||||
Operator_Act_SWISH = schema::ActivationType_SWISH + PrimitiveType_MAX,
|
||||
};
|
||||
|
||||
struct FusionEltwiseParameter {
|
||||
struct Node_ {
|
||||
bool is_leaf_;
|
||||
FusionEltwiseParameter *value_; // if is_leaf_=true, value_ is a Tensor
|
||||
std::string name_;
|
||||
Node_(bool is_leaf, FusionEltwiseParameter *value, std::string value_name)
|
||||
: is_leaf_(is_leaf), value_(value), name_(std::move(value_name)) {}
|
||||
};
|
||||
OpParameter op_parameter_{};
|
||||
EltwiseOperator operator_;
|
||||
std::string name_;
|
||||
std::vector<Node_> inputs_;
|
||||
|
||||
FusionEltwiseParameter(EltwiseOperator operator_init, std::string kernel_name,
|
||||
const std::vector<lite::Tensor *> &in_tensors,
|
||||
const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {})
|
||||
: operator_(operator_init), name_(std::move(kernel_name)) {
|
||||
op_parameter_.type_ = PrimitiveType_FusionEltwise;
|
||||
snprintf(op_parameter_.name_, strlen("FusionEltwiseParameter"), "FusionEltwiseParameter");
|
||||
for (int i = 0; i < in_tensors.size(); ++i) {
|
||||
auto *in_tensor = in_tensors[i];
|
||||
if (replace_map.count(in_tensor)) {
|
||||
auto *pred_param = replace_map.at(in_tensor);
|
||||
inputs_.emplace_back(false, pred_param, pred_param->name_);
|
||||
if (reinterpret_cast<void *>(in_tensor) == reinterpret_cast<void *>(pred_param)) {
|
||||
this->name_ = pred_param->name_ + "(" + this->name_ + ")";
|
||||
} else {
|
||||
this->name_ = pred_param->name_ + ", " + this->name_;
|
||||
}
|
||||
} else {
|
||||
inputs_.emplace_back(true, reinterpret_cast<FusionEltwiseParameter *>(in_tensor), "tensor" + std::to_string(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~FusionEltwiseParameter() {
|
||||
for (const auto &input : inputs_) {
|
||||
if (!input.is_leaf_) {
|
||||
delete input.value_;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Projects an ActivationType into the EltwiseOperator value space; activation
// operators are offset by PrimitiveType_MAX so they never collide with
// primitive operator values (see the enum above).
constexpr EltwiseOperator Activation2Operator(ActivationType act_type) {
  return static_cast<EltwiseOperator>(act_type + PrimitiveType_MAX);
}
FusionEltwiseParameter *CreateFusionEltwiseParameter(
|
||||
LiteKernel *node, const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {});
|
||||
|
||||
bool IsEltwiseAndOperatorSupported(LiteKernel *node);
|
||||
|
||||
class FusionEltwiseOpenCLKernel : public OpenCLKernel {
|
||||
public:
|
||||
FusionEltwiseOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs)
|
||||
: OpenCLKernel(parameter, inputs, outputs) {}
|
||||
|
||||
~FusionEltwiseOpenCLKernel() override {
|
||||
if (op_parameter_ != nullptr) {
|
||||
delete op_parameter_;
|
||||
op_parameter_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int Prepare() override;
|
||||
int InitWeights() override;
|
||||
void SetGlobalLocal() override;
|
||||
void SetConstArgs() override;
|
||||
int Run() override;
|
||||
|
||||
void ClearParameter() { op_parameter_ = nullptr; }
|
||||
|
||||
public:
|
||||
std::string Codegen();
|
||||
std::string CodegenCore(FusionEltwiseParameter *param, const std::string &out_name = "out", int degree = 0);
|
||||
std::string GetFormatVarName(std::string name = "");
|
||||
int GetTensorIdx(lite::Tensor *in_tensor);
|
||||
|
||||
static inline bool IsScalar(const std::vector<int> &shape) {
|
||||
return shape.empty() || (shape.size() == 1 && shape.front() == 1);
|
||||
}
|
||||
|
||||
std::set<std::string> var_names_;
|
||||
std::vector<float> scalar_weights_;
|
||||
std::vector<void *> buffer_weights_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_
|
||||
|
|
|
@ -97,7 +97,7 @@ void PadOpenCLKernel::SetConstArgs() {
|
|||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_));
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_);
|
||||
local_size_ = {8, 4, 1};
|
||||
global_size_ = {output.N * output.H, output.W, output.Slice};
|
||||
AlignGlobalLocal(global_size_, local_size_);
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/winograd.h"
|
||||
#include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
namespace {
|
||||
// Materializes the launch configuration: the local range is taken verbatim
// and each global dimension is rounded up to a multiple of its local size.
void Align(const std::vector<int> &global, const std::vector<int> &local, cl::NDRange *global_range,
           cl::NDRange *local_range) {
  *local_range = cl::NDRange(local[0], local[1], local[2]);
  *global_range =
    cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
}
std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) {
|
||||
constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
|
||||
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
|
||||
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
|
||||
constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
|
||||
1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
|
||||
1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
|
||||
auto src_fp32 = reinterpret_cast<float *>(src);
|
||||
auto src_fp16 = reinterpret_cast<float16_t *>(src);
|
||||
std::function<float(int)> access_func;
|
||||
if (dtype == kNumberTypeFloat32) {
|
||||
access_func = [=](int idx) { return src_fp32[idx]; };
|
||||
} else {
|
||||
access_func = [=](int idx) { return static_cast<float>(src_fp16[idx]); };
|
||||
}
|
||||
// OHWI -> O66I
|
||||
std::vector<float> dst(CO * 6 * 6 * CI);
|
||||
if (src == nullptr) {
|
||||
return dst;
|
||||
}
|
||||
for (int co = 0; co < CO; ++co) {
|
||||
for (int ci = 0; ci < CI; ++ci) {
|
||||
float in_vals[9];
|
||||
for (int kh = 0; kh < 3; ++kh) {
|
||||
for (int kw = 0; kw < 3; ++kw) {
|
||||
const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
|
||||
in_vals[kh * 3 + kw] = access_func(f_index);
|
||||
}
|
||||
}
|
||||
auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
|
||||
auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
|
||||
for (int kh = 0; kh < 6; ++kh) {
|
||||
for (int kw = 0; kw < 6; ++kw) {
|
||||
const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
|
||||
dst[f_index] = out_vals[kh * 6 + kw];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Compiles the three-stage Winograd pipeline from one program: input-tile
// transform, transformed-domain convolution (image or buffer filter variant),
// and output-tile inverse transform.
void WinogradOpenCLKernel::BuildKernel() {
  std::string program_name = "winograd";
  ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source);
  const char *conv_name = filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D";
  ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
  ocl_runtime_->BuildKernel(kernel_, program_name, conv_name);
  ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
}
void WinogradOpenCLKernel::InitFilter() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
|
||||
auto ret = DequantWeight();
|
||||
if (ret != RET_OK) {
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate opencl memory: buffer or image2d
|
||||
size_t size = 0;
|
||||
int Ogroup = 2;
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
size_t width = 6 * 6 * UP_ROUND(CI_, CI_TILE);
|
||||
size_t height = CO_SLICES_;
|
||||
size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
size = width * height * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size, {width, height, dtype});
|
||||
} else {
|
||||
size = UP_DIV(CO_SLICES_, Ogroup) * 6 * 6 * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
|
||||
packed_filter_ = allocator->Malloc(size);
|
||||
}
|
||||
|
||||
// rearrange filter
|
||||
auto filter_tensor = in_tensors_.at(1);
|
||||
auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
|
||||
|
||||
void *src_data = winograd_filter.data();
|
||||
auto src_dtype = kNumberTypeFloat32;
|
||||
auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
|
||||
std::vector<char> tmp(size, 0);
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, 6, 6, CI_);
|
||||
} else {
|
||||
ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, 6, 6, CI_, Ogroup);
|
||||
}
|
||||
|
||||
// unmap
|
||||
if (filter_type_ == MemType::IMG) {
|
||||
ocl_runtime_->WriteImage(packed_filter_, tmp.data());
|
||||
} else {
|
||||
allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true);
|
||||
memcpy(packed_filter_, tmp.data(), size);
|
||||
allocator->UnmapBuffer(packed_filter_);
|
||||
}
|
||||
|
||||
FreeDequantedWeight();
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::AllocateMemory() {
|
||||
auto allocator = ocl_runtime_->GetAllocator();
|
||||
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||
|
||||
size_t width = TILE_HW_;
|
||||
size_t height = CI_SLICES_ * 36;
|
||||
winograd_mem0_ = allocator->Malloc(width * height * sizeof_FLT_, {width, height, img_dtype});
|
||||
|
||||
width = TILE_HW_;
|
||||
height = CO_SLICES_ * 36;
|
||||
winograd_mem1_ = allocator->Malloc(width * height * sizeof_FLT_, {width, height, img_dtype});
|
||||
}
|
||||
|
||||
void WinogradOpenCLKernel::SetConstArgs() {
|
||||
AllocateMemory();
|
||||
|
||||
int arg_cn = 1;
|
||||
cl_int4 input_shape = {batch_size_, OH_, OW_, CI_SLICES_}; // maybe pad=0, so use OH/OW
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_u_);
|
||||
|
||||
arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_);
|
||||
|
||||
arg_cn = 2;
|
||||
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_);
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_);
|
||||
}
|
||||
|
||||
// Launch configuration per stage; global sizes are rounded up to the
// hand-tuned local sizes.
void WinogradOpenCLKernel::SetGlobalLocal() {
  Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
  Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_);
  Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
}
int WinogradOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " winograd Running!";
|
||||
MS_LOG(DEBUG) << "winograd kernel0 Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_);
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel1 Running!";
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel2 Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
// Winograd convolution kernel: replaces a direct Conv2D with three chained
// OpenCL kernels (4x4->36 input transform, per-tile matmul, 36->4x4 output
// transform). Shape bookkeeping and bias handling come from Conv2DOpenCLKernel.
class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
 public:
  WinogradOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                       const std::vector<lite::Tensor *> &outputs)
      : Conv2DOpenCLKernel(parameter, inputs, outputs) {
    filter_type_ = MemType::BUF;  // winograd reads the transformed filter from a buffer, not an image
  }

  ~WinogradOpenCLKernel() override = default;

  void SetConstArgs() override;
  void SetGlobalLocal() override;
  int Run() override;

  // Work-group sizes are fixed in SetGlobalLocal(); auto-tuning is a no-op here.
  std::vector<BaseTuningParameter> GenerateTuningParam() override { return {}; }
  int Tune() override { return RET_OK; }

 private:
  void BuildKernel() override;
  void InitFilter() override;
  // Allocates the two intermediate images used between the stages.
  void AllocateMemory();

  cl::Kernel kernel_4x4to36_;
  cl::Kernel kernel_36to4x4_;
  cl::NDRange global_4x4to36_, local_4x4to36_;
  cl::NDRange global_36to4x4_, local_36to4x4_;
  // Intermediate device memory: mem0 holds the transformed input, mem1 the
  // matmul result consumed by the 36->4x4 kernel (see SetConstArgs/Run).
  void *winograd_mem0_{nullptr};
  void *winograd_mem1_{nullptr};
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_
|
|
@ -13,11 +13,650 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <ctime>
|
||||
#include "src/runtime/kernel/opencl/opencl_subgraph.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/fusion_eltwise.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/opencl/opencl_executor.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "schema/ops_generated.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "nnacl/conv_parameter.h"
|
||||
#include "nnacl/pad_parameter.h"
|
||||
#include "nnacl/pooling_parameter.h"
|
||||
#include "nnacl/fp32/activation_fp32.h"
|
||||
#include "nnacl/matmul_parameter.h"
|
||||
#include "nnacl/scale.h"
|
||||
|
||||
using mindspore::schema::ActivationType;
|
||||
using mindspore::schema::ActivationType_LEAKY_RELU;
|
||||
using mindspore::schema::ActivationType_NO_ACTIVATION;
|
||||
using mindspore::schema::ActivationType_RELU;
|
||||
using mindspore::schema::ActivationType_RELU6;
|
||||
using mindspore::schema::ActivationType_TANH;
|
||||
using mindspore::schema::PrimitiveType;
|
||||
using mindspore::schema::PrimitiveType_Activation;
|
||||
using mindspore::schema::PrimitiveType_Eltwise;
|
||||
using mindspore::schema::PrimitiveType_NONE;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
void OpenCLSubGraph::Fusion() {}
|
||||
namespace {
|
||||
|
||||
// Membership test: true iff pointer `a` occurs in the container pointed to by `b`.
template <typename T0, typename T1>
inline bool AIsInB(const T0 *a, const T1 *b) {
  MS_ASSERT(a);
  MS_ASSERT(b);
  auto first = b->begin();
  auto last = b->end();
  return std::find(first, last, a) != last;
}
|
||||
|
||||
inline bool PredIs(const LiteKernel *node, PrimitiveType type, std::vector<LiteKernel *> *nodes) {
|
||||
MS_ASSERT(node);
|
||||
if (node->in_kernels().size() == 1) {
|
||||
LiteKernel *pred = node->in_kernels().front();
|
||||
MS_ASSERT(pred);
|
||||
if (AIsInB(pred, nodes) && pred->Type() == type && pred->out_kernels().size() == 1) {
|
||||
MS_ASSERT(pred->out_kernels().front() == node);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline std::string GetTypeName(const LiteKernel *node) {
|
||||
MS_ASSERT(node);
|
||||
if (node->Type() == PrimitiveType_FusionEltwise) {
|
||||
return "FusionEltwise";
|
||||
} else {
|
||||
return schema::EnumNamePrimitiveType(node->Type());
|
||||
}
|
||||
}
|
||||
|
||||
inline bool NC_N11C(const LiteKernel *node) {
|
||||
MS_ASSERT(node);
|
||||
if (node->in_tensors().empty() || node->out_tensors().empty()) {
|
||||
return false;
|
||||
} else {
|
||||
MS_ASSERT(node->in_tensors().front());
|
||||
MS_ASSERT(node->out_tensors().front());
|
||||
auto input_shape = node->in_tensors().front()->shape();
|
||||
auto output_shape = node->out_tensors().front()->shape();
|
||||
return input_shape.size() == 2 && output_shape.size() == 4 &&
|
||||
output_shape == std::vector<int>({input_shape[0], 1, 1, input_shape[1]});
|
||||
}
|
||||
}
|
||||
|
||||
inline bool N11C_NC(const LiteKernel *node) {
|
||||
MS_ASSERT(node);
|
||||
if (node->in_tensors().empty() || node->out_tensors().empty()) {
|
||||
return false;
|
||||
} else {
|
||||
MS_ASSERT(node->in_tensors().front());
|
||||
MS_ASSERT(node->out_tensors().front());
|
||||
auto input_shape = node->in_tensors().front()->shape();
|
||||
auto output_shape = node->out_tensors().front()->shape();
|
||||
return input_shape.size() == 4 && output_shape.size() == 2 &&
|
||||
input_shape == std::vector<int>({output_shape[0], 1, 1, output_shape[1]});
|
||||
}
|
||||
}
|
||||
|
||||
inline bool NC11_NC(const LiteKernel *node) {
|
||||
if (node->in_tensors().empty() || node->out_tensors().empty()) {
|
||||
return false;
|
||||
} else {
|
||||
MS_ASSERT(node->in_tensors().front());
|
||||
MS_ASSERT(node->out_tensors().front());
|
||||
auto input_shape = node->in_tensors().front()->shape();
|
||||
auto output_shape = node->out_tensors().front()->shape();
|
||||
return input_shape.size() == 4 && output_shape.size() == 2 &&
|
||||
input_shape == std::vector<int>({output_shape[0], output_shape[1], 1, 1});
|
||||
}
|
||||
}
|
||||
|
||||
// Stable de-duplication: keep only the first occurrence of each pointer,
// preserving the original relative order of the survivors.
template <typename T>
std::vector<T *> RemoveDuplicationsButKeepOrder(const std::vector<T *> &vec) {
  std::vector<T *> unique_items;
  unique_items.reserve(vec.size());
  std::set<T *> seen;
  for (auto *item : vec) {
    // set::insert reports whether the element was newly inserted
    if (seen.insert(item).second) {
      unique_items.push_back(item);
    }
  }
  return unique_items;
}
|
||||
|
||||
// Fuse two adjacent kernels a -> b by rewiring the kernel/tensor graph.
//   remove_a == true : drop `a` — `b` absorbs `a`'s inputs and predecessors.
//   remove_a == false: drop `b` — `a` absorbs `b`'s outputs and successors.
// Only the graph links are rewritten here; the removed kernel itself is freed
// later by the caller (see OpenCLSubGraph::Fusion's removed_set cleanup).
void Merge(LiteKernel *a, LiteKernel *b, bool remove_a) {
  MS_ASSERT(a);
  MS_ASSERT(b);
  if (remove_a) {  // pred->tensor0->a->tensor1->b: remove a tensor1
    // update pred out_kernels: a.in_kernels.out_kernels.replace(a,b)
    for (auto *pred : a->in_kernels()) {
      MS_ASSERT(pred);
      auto pred_out_kernels = pred->out_kernels();
      std::replace_if(
        pred_out_kernels.begin(), pred_out_kernels.end(), [&](LiteKernel *x) { return x == a; }, b);
      pred->set_out_kernels(RemoveDuplicationsButKeepOrder(pred_out_kernels));
    }

    // update b in_tensors: b.in_tensors.replace(a.out_tensors[0], a.in_tensors)
    auto b_in_tensors = b->in_tensors();
    for (int i = 0; i < b_in_tensors.size(); ++i) {
      if (b_in_tensors[i] == a->out_tensors().front()) {
        // reshape: 2nd input tensor is removed
        // (a Reshape's shape tensor must not leak into b's inputs, so only
        //  the data input replaces the fused tensor)
        if (a->Type() == schema::PrimitiveType_Reshape) {
          b_in_tensors[i] = a->in_tensors().front();
          b->set_in_tensors(b_in_tensors);
        } else {
          b_in_tensors.erase(b_in_tensors.begin() + i);
          b_in_tensors.insert(b_in_tensors.begin() + i, a->in_tensors().begin(), a->in_tensors().end());
          b->set_in_tensors(RemoveDuplicationsButKeepOrder(b_in_tensors));
        }
        break;  // a has a single output tensor; only the first match is replaced
      }
    }

    // update b in_kernels: b.in_kernels.replace(a, a.in_kernels)
    auto b_in_kernels = b->in_kernels();
    for (int i = 0; i < b_in_kernels.size(); ++i) {
      if (a == b_in_kernels[i]) {
        b_in_kernels.erase(b_in_kernels.begin() + i);
        b_in_kernels.insert(b_in_kernels.begin() + i, a->in_kernels().begin(), a->in_kernels().end());
        b->set_in_kernels(RemoveDuplicationsButKeepOrder(b_in_kernels));
        break;
      }
    }
  } else {  // a->tensor1->b->tensor2->succ: remove tensor1 b
    // update a.out_tensors
    a->set_out_tensors(b->out_tensors());

    // update a.out_kernels
    a->set_out_kernels(b->out_kernels());

    // update succ in_kernels
    for (auto *succ : b->out_kernels()) {
      MS_ASSERT(succ);
      auto succ_in_kernels = succ->in_kernels();
      std::replace_if(
        succ_in_kernels.begin(), succ_in_kernels.end(), [&](LiteKernel *x) { return x == b; }, a);
      succ->set_in_kernels(RemoveDuplicationsButKeepOrder(succ_in_kernels));
    }
  }
}
|
||||
|
||||
// Fuse a -> b keeping `b` (the predecessor `a` is removed). After rewiring,
// re-validate `b` unless the caller opts out (eltwise fusion builds a new
// kernel afterwards and checks it itself).
inline void MergeRemoveA(LiteKernel *a, LiteKernel *b, std::set<LiteKernel *> *removed_set,
                         bool do_check_specs = true) {
  MS_ASSERT(a);
  MS_ASSERT(b);
  MS_ASSERT(removed_set);
  Merge(a, b, true);
  removed_set->insert(a);  // freed later by the caller; only recorded here
  if (do_check_specs && reinterpret_cast<OpenCLKernel *>(b)->CheckSpecs() != RET_OK) {
    MS_LOG(ERROR) << "fusion kernel CheckSpecs() error: kernel name is " << b->name();
  }
}
|
||||
|
||||
// Fuse a -> b keeping `a` (the successor `b` is removed), then re-validate
// the surviving kernel's specs.
inline void MergeRemoveB(LiteKernel *a, LiteKernel *b, std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(a);
  MS_ASSERT(b);
  MS_ASSERT(removed_set);
  Merge(a, b, false);
  removed_set->insert(b);  // freed later by the caller; only recorded here
  if (reinterpret_cast<OpenCLKernel *>(a)->CheckSpecs() != RET_OK) {
    MS_LOG(ERROR) << "fusion kernel CheckSpecs() error: kernel name is " << a->name();
  }
}
|
||||
|
||||
// Pad + Conv2D
|
||||
// Pad + DepthwiseConv2D
|
||||
// Pad + DeConv2D
|
||||
// Pad + Pooling
|
||||
// Fold a zero-constant Pad into the successor's own padding fields, then
// remove the Pad node. ParamType is the successor's parameter struct
// (ConvParameter or PoolingParameter). Caller guarantees (via PredIs) that
// `node` has exactly one predecessor and it is a Pad.
template <typename ParamType>
void TryMergePad(LiteKernel *node, std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(node);
  MS_ASSERT(removed_set);
  LiteKernel *pad = node->in_kernels().front();
  MS_ASSERT(pad);
  // only 4D (NHWC) padding can be folded into conv/pool pad fields
  if (pad->in_tensors().front()->shape().size() != 4) {
    return;
  }
  auto *pad_param = reinterpret_cast<PadParameter *>(reinterpret_cast<OpenCLKernel *>(pad)->GetParameter());
  MS_ASSERT(pad_param);
  // only constant padding with value ~0 is equivalent to conv's implicit zero-pad
  if (pad_param->pad_mode_ != schema::PaddingMode::PaddingMode_CONSTANT ||
      std::fabs(pad_param->constant_value_) > 1e-5) {
    return;
  }

  auto *conv_param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
  MS_ASSERT(conv_param);
  // paddings_[2..5] map to H-begin/H-end/W-begin/W-end here — confirm against
  // PadParameter's NHWC layout if this is changed
  conv_param->pad_u_ += pad_param->paddings_[2];
  conv_param->pad_d_ += pad_param->paddings_[3];
  conv_param->pad_l_ += pad_param->paddings_[4];
  conv_param->pad_r_ += pad_param->paddings_[5];

  MergeRemoveA(pad, node, removed_set);
  MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success";
}
|
||||
|
||||
// Conv2D + Reshape(N11C->NC)
|
||||
void TryMergeConvReshape(LiteKernel *reshape, std::set<LiteKernel *> *removed_set) {
|
||||
MS_ASSERT(reshape);
|
||||
MS_ASSERT(removed_set);
|
||||
if (N11C_NC(reshape)) {
|
||||
LiteKernel *conv = reshape->in_kernels().front();
|
||||
MS_ASSERT(conv);
|
||||
MergeRemoveB(conv, reshape, removed_set);
|
||||
MS_LOG(DEBUG) << "Merge Conv2D and Reshape(N11C->NC) success";
|
||||
}
|
||||
}
|
||||
|
||||
// FullConnection + Reshape(NC->N11C or N11C->NC)
|
||||
void TryMergeFcReshape(LiteKernel *reshape, std::set<LiteKernel *> *removed_set) {
|
||||
MS_ASSERT(reshape);
|
||||
MS_ASSERT(removed_set);
|
||||
bool NC_N11C_flag = NC_N11C(reshape);
|
||||
if (NC_N11C_flag || N11C_NC(reshape)) {
|
||||
LiteKernel *fc = reshape->in_kernels().front();
|
||||
MS_ASSERT(fc);
|
||||
MergeRemoveB(fc, reshape, removed_set);
|
||||
MS_LOG(DEBUG) << "Merge FullConnection and Reshape" + (NC_N11C_flag ? std::string("(NC->N11C)") : "(N11C->NC)") +
|
||||
" success";
|
||||
}
|
||||
}
|
||||
|
||||
// Reshape(NC11->NC) + FullConnection
|
||||
// Reshape(NC->N11C) + FullConnection
|
||||
void TryMergeReshapeFc(LiteKernel *fc, std::set<LiteKernel *> *removed_set) {
|
||||
MS_ASSERT(fc);
|
||||
MS_ASSERT(removed_set);
|
||||
LiteKernel *reshape = fc->in_kernels().front();
|
||||
MS_ASSERT(reshape);
|
||||
bool NC11_NC_flag = NC11_NC(reshape);
|
||||
if (NC11_NC_flag || NC_N11C(reshape)) {
|
||||
MergeRemoveA(reshape, fc, removed_set);
|
||||
MS_LOG(DEBUG) << "Merge Reshape" + (NC11_NC_flag ? std::string("(NC11->NC)") : "(NC->N11C)") +
|
||||
" and FullConnection success";
|
||||
}
|
||||
}
|
||||
|
||||
// Arithmetic(NO_ACTIVATION) + Activation(RELU/RELU6)
// Record the activation type inside the arithmetic op's parameter and drop
// the standalone activation node. Caller guarantees (via PredIs) that the
// single predecessor is an arithmetic op.
void TryMergeArithmeticAct(LiteKernel *act, std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(act);
  MS_ASSERT(removed_set);
  LiteKernel *arithmetic = act->in_kernels().front();
  MS_ASSERT(arithmetic);
  auto *arithmetic_param =
    reinterpret_cast<ArithmeticParameter *>(reinterpret_cast<OpenCLKernel *>(arithmetic)->GetParameter());
  auto *act_param = reinterpret_cast<ActivationParameter *>(reinterpret_cast<OpenCLKernel *>(act)->GetParameter());
  MS_ASSERT(arithmetic_param);
  MS_ASSERT(act_param);

  // only fuse when the arithmetic op has no activation of its own yet,
  // and the activation is one the arithmetic kernels implement inline
  if (arithmetic_param->activation_type_ == ActivationType_NO_ACTIVATION &&
      (act_param->type_ == ActivationType_RELU || act_param->type_ == ActivationType_RELU6)) {
    arithmetic_param->activation_type_ = act_param->type_;
    MergeRemoveB(arithmetic, act, removed_set);
    MS_LOG(DEBUG) << "Merge " + GetTypeName(arithmetic) + "(NO_ACTIVATION) and Activation(RELU or RELU6) success";
  }
}
|
||||
|
||||
// Conv2D(NO_ACTIVATION) + Activation(RELU/RELU6/TANH)
|
||||
// FullConnection(NO_ACTIVATION) + Activation(RELU/RELU6/TANH)
|
||||
template <typename ParamType>
|
||||
void TryMergeActivation(LiteKernel *act, std::set<LiteKernel *> *removed_set) {
|
||||
MS_ASSERT(node);
|
||||
MS_ASSERT(removed_set);
|
||||
auto *act_param = reinterpret_cast<ActivationParameter *>(reinterpret_cast<OpenCLKernel *>(act)->GetParameter());
|
||||
LiteKernel *node = act->in_kernels().front();
|
||||
auto *param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
|
||||
MS_ASSERT(param);
|
||||
if (param->act_type_ == ActType_No) {
|
||||
param->act_type_ = static_cast<ActType>(act_param->type_);
|
||||
std::string act_name;
|
||||
if (act_param->type_ == ActivationType_RELU) {
|
||||
act_name = "RELU";
|
||||
} else if (act_param->type_ == ActivationType_RELU6) {
|
||||
act_name = "RELU6";
|
||||
} else if (act_param->type_ == ActivationType_TANH) {
|
||||
act_name = "TANH";
|
||||
}
|
||||
MergeRemoveB(node, act, removed_set);
|
||||
MS_LOG(DEBUG) << "Merge " + GetTypeName(node) + "(NO_ACTIVATION) and Activation(" + act_name + ") success";
|
||||
}
|
||||
}
|
||||
|
||||
// Conv2D(NO_ACTIVATION) + PReLU(weight is scalar)
// A scalar-weight PReLU is equivalent to leaky-relu with alpha = weight, which
// the conv kernel implements inline (DO_LEAKY_RELU), so fold it into the conv.
void TryMergeConvPReLU(LiteKernel *prelu, std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(prelu);
  MS_ASSERT(removed_set);
  // PReLU must carry exactly (input, weight)
  if (prelu->in_tensors().size() != 2) {
    return;
  }
  auto *prelu_weight = prelu->in_tensors().at(1);
  // weight must be a single value: a scalar tensor or shape [1]
  bool shape_is_valid =
    prelu_weight->IsScalar() || (prelu_weight->shape().size() == 1 && prelu_weight->shape().front() == 1);
  if (!shape_is_valid) {
    return;
  }
  // alpha is read below as float; other dtypes are not handled
  if (prelu_weight->data_type() != kNumberTypeFloat32) {
    return;
  }

  LiteKernel *conv = prelu->in_kernels().front();
  auto *param = reinterpret_cast<ConvParameter *>(reinterpret_cast<OpenCLKernel *>(conv)->GetParameter());
  MS_ASSERT(param);
  if (param->act_type_ == ActType_No) {
    param->act_type_ = static_cast<ActType>(ActivationType_LEAKY_RELU);
    // alpha_ is forwarded to the conv kernel's leaky-relu epilogue
    reinterpret_cast<Conv2DOpenCLKernel *>(conv)->alpha_ = *reinterpret_cast<float *>(prelu_weight->data_c());
    MergeRemoveB(conv, prelu, removed_set);
    MS_LOG(DEBUG) << "Merge Conv2D(NO_ACTIVATION) and PReLU(weight is scalar) success";
  }
}
|
||||
|
||||
// Fold a per-channel Scale (scale, offset) into the conv/deconv weights:
//   filter *= scale (per output channel); bias = bias * scale + offset.
// If the conv has no bias, the scale's offset tensor is adopted as the bias.
// Returns RET_OK when the fold was applied, RET_ERROR when shapes/dtypes/axis
// make the fold invalid (caller then leaves both kernels alone).
int TryFusionConvScaleWeight(LiteKernel *conv_kernel, LiteKernel *scale_kernel) {
  MS_ASSERT(conv_kernel);
  MS_ASSERT(scale_kernel);
  auto *scale_param =
    reinterpret_cast<ScaleParameter *>(reinterpret_cast<OpenCLKernel *>(scale_kernel)->GetParameter());
  MS_ASSERT(scale_param);
  // check tensor counts BEFORE indexing: the old code called .at(2) on the
  // scale kernel first, which throws std::out_of_range for a 2-input Scale
  if (scale_kernel->in_tensors().size() != 3) {
    return RET_ERROR;
  }
  MS_ASSERT(conv_kernel->in_tensors().size() >= 2);
  auto *filter = conv_kernel->in_tensors().at(1);
  auto *bias = conv_kernel->in_tensors().size() == 3 ? conv_kernel->in_tensors().at(2) : nullptr;
  auto *scale = scale_kernel->in_tensors().at(1);
  auto *offset = scale_kernel->in_tensors().at(2);
  MS_ASSERT(filter);
  // note: no MS_ASSERT(bias) — bias == nullptr is a supported path (handled below)
  MS_ASSERT(scale);
  MS_ASSERT(offset);

  // scale/offset must be 1D per-output-channel vectors of matching length
  if (scale->shape().size() != 1 || scale->shape().at(0) != filter->shape().back() ||
      scale->shape() != offset->shape()) {
    return RET_ERROR;
  }
  // the fold assumes scaling along the channel axis
  if (!(scale_param->axis_ == -1 || scale_param->axis_ == 3)) {
    return RET_ERROR;
  }
  // all in-place arithmetic below is written for fp32 data
  if (filter->data_type() != kNumberTypeFloat32 || (bias && bias->data_type() != kNumberTypeFloat32) ||
      scale->data_type() != kNumberTypeFloat32 || offset->data_type() != kNumberTypeFloat32) {
    return RET_ERROR;
  }

  // update filter: filter*=scale
  MS_ASSERT(filter->shape().size() == 4);
  int CI = filter->shape()[0];
  int KH = filter->shape()[1];
  int KW = filter->shape()[2];
  int CO = filter->shape()[3];
  auto *filter_data = reinterpret_cast<float *>(filter->data_c());
  auto *scale_data = reinterpret_cast<float *>(scale->data_c());
  for (int i = 0; i < CI * KH * KW * CO; ++i) {
    filter_data[i] *= scale_data[i % CO];  // CO is innermost, so i % CO is the channel
  }

  // update bias: bias=bias*scale+offset
  if (bias != nullptr) {
    auto *bias_data = reinterpret_cast<float *>(bias->data_c());
    auto *offset_data = reinterpret_cast<float *>(offset->data_c());
    for (int co = 0; co < CO; ++co) {
      bias_data[co] *= scale_data[co];
      bias_data[co] += offset_data[co];
    }
  } else {  // if deconv dont't have bias, let scale's offset be deconv's bias
    auto tmp = conv_kernel->in_tensors();
    tmp.push_back(offset);
    conv_kernel->set_in_tensors(tmp);
  }
  return RET_OK;
}
|
||||
|
||||
// DeConv2D + Scale (can't both has activation)
|
||||
void TryMergeDeconvScale(LiteKernel *scale, std::set<LiteKernel *> *removed_set) {
|
||||
MS_ASSERT(scale);
|
||||
MS_ASSERT(removed_set);
|
||||
LiteKernel *deconv = scale->in_kernels().front();
|
||||
MS_ASSERT(deconv);
|
||||
|
||||
// check act_type_
|
||||
auto *deconv_param = reinterpret_cast<ConvParameter *>(reinterpret_cast<OpenCLKernel *>(deconv)->GetParameter());
|
||||
auto *scale_param = reinterpret_cast<ScaleParameter *>(reinterpret_cast<OpenCLKernel *>(scale)->GetParameter());
|
||||
MS_ASSERT(deconv_param);
|
||||
MS_ASSERT(scale_param);
|
||||
if (deconv_param->act_type_ == ActType_No) {
|
||||
if (!(scale_param->activation_type_ == ActivationType_NO_ACTIVATION ||
|
||||
scale_param->activation_type_ == ActivationType_RELU ||
|
||||
scale_param->activation_type_ == ActivationType_RELU6)) {
|
||||
return;
|
||||
}
|
||||
} else if (deconv_param->act_type_ == ActType_Relu || deconv_param->act_type_ == ActType_Relu6) {
|
||||
if (deconv_param->act_type_ != ActType_No) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
// fusion weight
|
||||
if (TryFusionConvScaleWeight(deconv, scale) == RET_ERROR) {
|
||||
return;
|
||||
}
|
||||
|
||||
// update act_type_
|
||||
if (deconv_param->act_type_ == ActType_No) {
|
||||
deconv_param->act_type_ = static_cast<ActType>(scale_param->activation_type_);
|
||||
}
|
||||
|
||||
MergeRemoveB(deconv, scale, removed_set);
|
||||
MS_LOG(DEBUG) << "Merge DeConv2D and Scale success";
|
||||
}
|
||||
|
||||
// Build a FusionEltwiseOpenCLKernel that takes over `old`'s tensors and graph
// position, splice it into `nodes`, and mark `old` for removal. `param` is the
// fused eltwise expression tree; ownership passes to the new kernel.
// NOTE(review): if the allocation fails, `param` appears to leak — confirm
// whether FusionEltwiseParameter needs explicit disposal on this path.
void CreateEltwiseKernelReplaceOld(FusionEltwiseParameter *param, LiteKernel *old, std::vector<LiteKernel *> *nodes,
                                   std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(param);
  MS_ASSERT(old);
  MS_ASSERT(nodes);
  MS_ASSERT(removed_set);
  auto *eltwise = new (std::nothrow)
    FusionEltwiseOpenCLKernel(reinterpret_cast<OpParameter *>(param), old->in_tensors(), old->out_tensors());
  if (eltwise == nullptr) {
    MS_LOG(ERROR) << "create FusionEltwiseOpenCLKernel error.";
    return;
  }
  eltwise->set_name("FusionEltwise: " + param->name_);
  eltwise->set_in_kernels(old->in_kernels());
  eltwise->set_out_kernels(old->out_kernels());

  // repoint every predecessor's successor list from `old` to the new kernel
  for (auto *pred : old->in_kernels()) {
    MS_ASSERT(pred);
    auto tmp = pred->out_kernels();
    std::replace_if(
      tmp.begin(), tmp.end(), [&](LiteKernel *x) { return x == old; }, eltwise);
    pred->set_out_kernels(tmp);
  }

  // repoint every successor's predecessor list from `old` to the new kernel
  for (auto *succ : old->out_kernels()) {
    MS_ASSERT(succ);
    auto tmp = succ->in_kernels();
    std::replace_if(
      tmp.begin(), tmp.end(), [&](LiteKernel *x) { return x == old; }, eltwise);
    succ->set_in_kernels(tmp);
  }
  // swap the node in-place so traversal order is preserved
  std::replace(nodes->begin(), nodes->end(), old, reinterpret_cast<LiteKernel *>(eltwise));
  removed_set->insert(old);
}
|
||||
|
||||
// Eltwise + Eltwise
// Collapse `node` together with every eltwise-like single-consumer predecessor
// into one FusionEltwiseOpenCLKernel (e.g. mul+add, mul+add+relu).
// Returns RET_OK if a fusion happened, RET_ERROR otherwise.
int TryMergeEltwiseEltwise(LiteKernel *node, std::vector<LiteKernel *> *nodes, std::set<LiteKernel *> *removed_set) {
  MS_ASSERT(node);
  MS_ASSERT(nodes);
  MS_ASSERT(removed_set);
  // node must be eltwise-like op
  if (!IsEltwiseAndOperatorSupported(node)) {
    return RET_ERROR;
  }

  // preds must contain eltwise-like op
  const std::vector<LiteKernel *> preds = node->in_kernels();
  std::set<LiteKernel *> pred_eltwises;
  // maps each fused predecessor's output tensor to its expression subtree
  std::map<lite::Tensor *, FusionEltwiseParameter *> pred_params;
  for (LiteKernel *pred : preds) {
    MS_ASSERT(pred);
    if (AIsInB(pred, nodes) && IsEltwiseAndOperatorSupported(pred) && pred->out_kernels().size() == 1) {
      auto *tensor = pred->out_tensors().front();
      MS_ASSERT(pred->out_kernels().front() == node);
      // NOTE(review): `node.` on a pointer looks malformed (presumably compiles
      // only because MS_ASSERT expands to nothing in this build) — confirm
      MS_ASSERT(AIsInB(tensor, node.in_tensors()));
      pred_eltwises.insert(pred);
      // create FusionEltwiseParameter for this pred eltwise
      auto param = CreateFusionEltwiseParameter(pred);
      pred_params.emplace(tensor, param);
    }
  }
  if (pred_eltwises.empty()) {
    return RET_ERROR;
  }

  // 1. create FusionEltwiseParameter for this node
  FusionEltwiseParameter *param = CreateFusionEltwiseParameter(node, pred_params);
  MS_ASSERT(param);
  // 2. merge pred eltwise op
  for (LiteKernel *pred_eltwise : pred_eltwises) {
    // do_check_specs=false: `node` is replaced by a fresh kernel in step 3
    MergeRemoveA(pred_eltwise, node, removed_set, false);
  }
  // 3. create FusionFusionEltwiseOpenCLKernel and replace old kernel by new
  CreateEltwiseKernelReplaceOld(param, node, nodes, removed_set);

  MS_LOG(DEBUG) << "Merge Eltwise and Eltwise success: " << param->name_;
  return RET_OK;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Graph-level fusion pass over this OpenCL subgraph. Traverses kernels in
// topological-ish order (priority queue keyed by position in nodes_), tries
// generic eltwise fusion first, then pattern-specific fusions keyed on the
// node's type. Removed kernels are collected and freed at the end.
void OpenCLSubGraph::Fusion() {
  MS_LOG(DEBUG) << "start Fusion";

  // seed the worklist with every kernel that consumes a subgraph input tensor
  std::vector<LiteKernel *> input_nodes;
  for (auto *node : nodes_) {
    if (std::any_of(node->in_tensors().begin(), node->in_tensors().end(),
                    [&](lite::Tensor *tensor) { return AIsInB(tensor, &in_tensors_); })) {
      input_nodes.push_back(node);
    }
  }

  // earlier position in nodes_ => higher priority, so kernels are visited
  // roughly in graph order
  auto cmp = [&](LiteKernel *a, LiteKernel *b) {
    return std::find(nodes_.begin(), nodes_.end(), a) > std::find(nodes_.begin(), nodes_.end(), b);
  };
  std::priority_queue<LiteKernel *, std::vector<LiteKernel *>, decltype(cmp)> q(cmp, input_nodes);
  std::set<LiteKernel *> qset(input_nodes.begin(), input_nodes.end());  // mirrors q for O(log n) membership
  std::set<LiteKernel *> removed_set;
  while (!q.empty()) {
    LiteKernel *node = q.top();
    MS_ASSERT(node);
    q.pop();
    qset.erase(node);
    if (AIsInB(node, &removed_set)) {
      continue;
    }

    // insert node->out_kernels to q only if succ
    // 1. not in q
    // 2. not be removed
    // 3. in nodes_
    for (auto *succ : node->out_kernels()) {
      if (!AIsInB(succ, &qset) && !AIsInB(succ, &removed_set) && AIsInB(succ, &nodes_)) {
        q.push(succ);
        qset.insert(succ);
      }
    }

    // do element-wise fusion, like mul+add, mul+add+relu
    if (TryMergeEltwiseEltwise(node, &nodes_, &removed_set) == RET_OK) {
      continue;
    }

    // do special fusion, like pad+conv2d, fc+reshape
    // (each Try* helper checks its own preconditions and is a no-op on mismatch)
    switch (node->Type()) {
      case schema::PrimitiveType_Conv2D:
      case schema::PrimitiveType_DepthwiseConv2D:
      case schema::PrimitiveType_DeConv2D: {
        if (PredIs(node, schema::PrimitiveType_Pad, &nodes_)) {
          TryMergePad<ConvParameter>(node, &removed_set);
        }
        break;
      }
      case schema::PrimitiveType_Pooling: {
        if (PredIs(node, schema::PrimitiveType_Pad, &nodes_)) {
          TryMergePad<PoolingParameter>(node, &removed_set);
        }
        break;
      }
      case schema::PrimitiveType_Reshape: {
        if (PredIs(node, schema::PrimitiveType_FullConnection, &nodes_)) {
          TryMergeFcReshape(node, &removed_set);
        } else if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) {
          TryMergeConvReshape(node, &removed_set);
        }
        break;
      }
      case schema::PrimitiveType_FullConnection: {
        if (PredIs(node, schema::PrimitiveType_Reshape, &nodes_)) {
          TryMergeReshapeFc(node, &removed_set);
        }
        break;
      }
      case schema::PrimitiveType_Activation: {
        // try merge Conv2D/FC(without act) + RELU/RELU6/TANH
        auto *param = reinterpret_cast<ActivationParameter *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
        MS_ASSERT(param);
        if (param->type_ == ActivationType_RELU || param->type_ == ActivationType_RELU6 ||
            param->type_ == ActivationType_TANH) {
          if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) {
            TryMergeActivation<ConvParameter>(node, &removed_set);
            break;
          } else if (PredIs(node, schema::PrimitiveType_FullConnection, &nodes_)) {
            TryMergeActivation<MatMulParameter>(node, &removed_set);
            break;
          }
        }
        // fall back: arithmetic op + RELU/RELU6
        if (std::any_of(ArithmeticPrimitives.begin(), ArithmeticPrimitives.end(),
                        [&](schema::PrimitiveType type) { return PredIs(node, type, &nodes_); })) {
          TryMergeArithmeticAct(node, &removed_set);
        }
        break;
      }
      case schema::PrimitiveType_PReLU: {
        if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) {
          TryMergeConvPReLU(node, &removed_set);
          break;
        }
        break;
      }
      case schema::PrimitiveType_Scale: {
        if (PredIs(node, schema::PrimitiveType_DeConv2D, &nodes_)) {
          TryMergeDeconvScale(node, &removed_set);
          break;
        }
        break;
      }
      default:
        break;
    }
  }

  // the merge helpers only unlink kernels; actual destruction happens here
  for (auto kernel : removed_set) {
    delete kernel;
  }
  MS_LOG(DEBUG) << "number of kernels(before fusion): " << nodes_.size();
  nodes_.erase(
    std::remove_if(nodes_.begin(), nodes_.end(), [&](LiteKernel *node) { return AIsInB(node, &removed_set); }),
    nodes_.end());
  MS_LOG(DEBUG) << "number of kernels(after fusion) : " << nodes_.size();
}
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -71,6 +71,57 @@ int OpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
// Debug helper: dump the first output tensor to stdout (name, shape, first
// `print_num` values) and optionally to a binary file. Reads the data back
// from the device (map for buffers, ReadImage for images), so it synchronizes
// the command queue — debug use only.
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
  printf("%-30s", name().c_str());
  if (out_tensors().empty()) {
    return;
  }
  auto *tensor = out_tensors()[0];
  auto mem_type = GetMemType();
  if (tensor == nullptr || tensor->data_c() == nullptr) {
    return;
  }

  GpuTensorInfo img_info(tensor);
  // buffers hold the origin-layout bytes; images hold the padded C4 layout
  auto size = mem_type == lite::opencl::MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
  std::vector<char> data(size);
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  auto allocator = runtime->GetAllocator();
  runtime->SyncCommandQueue();  // make sure the kernel producing this output has finished
  if (mem_type == lite::opencl::MemType::BUF) {
    allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true);
    memcpy(data.data(), tensor->data_c(), img_info.OriginSize);
    allocator->UnmapBuffer(tensor->data_c());
  } else {
    runtime->ReadImage(tensor->data_c(), data.data());
  }

  printf("shape=(");
  auto shape = tensor->shape();
  for (int i = 0; i < shape.size(); ++i) {
    printf("%4d", shape[i]);
    if (i + 1 < shape.size()) {
      printf(",");
    }
  }
  printf(") ");

  auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
  for (int i = 0; i < print_num && i < total_num; ++i) {
    // fp16 values are promoted to float by the vararg call, so %f works for both
    if (tensor->data_type() == kNumberTypeFloat16) {
      printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
    } else {
      printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
    }
  }
  printf("\n");

  if (!out_file.empty()) {
    (void)WriteToBin(out_file, data.data(), data.size());
  }
}
|
||||
|
||||
int OpenCLKernel::PostProcess() {
|
||||
for (auto *output : this->out_tensors()) {
|
||||
MS_ASSERT(output != nullptr);
|
||||
|
@ -213,8 +264,10 @@ int OpenCLKernel::DequantWeight() {
|
|||
#ifdef ENABLE_ARM64
|
||||
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float16_t>(weight_tensor);
|
||||
weight_tensor->set_data_type(kNumberTypeFloat16);
|
||||
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int16_t, float16_t>(weight_tensor);
|
||||
weight_tensor->set_data_type(kNumberTypeFloat16);
|
||||
} else {
|
||||
set_flag = false;
|
||||
}
|
||||
|
@ -224,8 +277,10 @@ int OpenCLKernel::DequantWeight() {
|
|||
} else {
|
||||
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int8_t, float>(weight_tensor);
|
||||
weight_tensor->set_data_type(kNumberTypeFloat32);
|
||||
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
|
||||
dequant_weight = kernel::DequantUtil::DequantData<int16_t, float>(weight_tensor);
|
||||
weight_tensor->set_data_type(kNumberTypeFloat32);
|
||||
} else {
|
||||
set_flag = false;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/arm/base/dequant.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
@ -181,6 +182,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
virtual int Tune();
|
||||
|
||||
int GetImageSize(size_t idx, std::vector<size_t> *img_size);
|
||||
void PrintOutput(int print_num = 10, const std::string &out_file = "");
|
||||
lite::opencl::MemType GetMemType() { return out_mem_type_; }
|
||||
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
|
||||
OpParameter *GetParameter() { return op_parameter_; }
|
||||
|
|
|
@ -19,12 +19,8 @@
|
|||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/common/file_utils.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::opencl::MemType;
|
||||
using mindspore::schema::ActivationType_LEAKY_RELU;
|
||||
using mindspore::schema::ActivationType_RELU;
|
||||
using mindspore::schema::ActivationType_RELU6;
|
||||
|
@ -298,65 +294,6 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) {
|
||||
if (tensor == nullptr || tensor->data_c() == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
GpuTensorInfo img_info(tensor);
|
||||
auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
|
||||
std::vector<char> data(size);
|
||||
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
|
||||
auto runtime = runtime_wrapper.GetInstance();
|
||||
auto allocator = runtime->GetAllocator();
|
||||
runtime->SyncCommandQueue();
|
||||
allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true);
|
||||
if (mem_type == MemType::BUF) {
|
||||
memcpy(data.data(), tensor->data_c(), img_info.OriginSize);
|
||||
} else {
|
||||
auto row_size = img_info.width * img_info.FLT4_size;
|
||||
for (int i = 0; i < img_info.height; ++i) {
|
||||
memcpy(reinterpret_cast<char *>(data.data()) + i * row_size,
|
||||
static_cast<char *>(tensor->data_c()) + i * img_info.RowPitch(), row_size);
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(tensor->data_c());
|
||||
|
||||
printf("shape=(");
|
||||
auto shape = tensor->shape();
|
||||
for (int i = 0; i < shape.size(); ++i) {
|
||||
printf("%4d", shape[i]);
|
||||
if (i + 1 < shape.size()) {
|
||||
printf(",");
|
||||
}
|
||||
}
|
||||
printf(") ");
|
||||
|
||||
auto num = mem_type == MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
|
||||
for (int i = 0; i < n && i < num; ++i) {
|
||||
if (tensor->data_type() == kNumberTypeFloat16) {
|
||||
printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
|
||||
} else {
|
||||
printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (!out_file.empty()) {
|
||||
(void)WriteToBin(out_file, data.data(), data.size());
|
||||
}
|
||||
}
|
||||
|
||||
void PrintKernelOutput(OpenCLKernel *kernel, int n, const std::string &out_file) {
|
||||
if (kernel == nullptr) {
|
||||
return;
|
||||
}
|
||||
printf("%-30s", kernel->name().c_str());
|
||||
if (!kernel->out_tensors().empty()) {
|
||||
PrintTensor(kernel->out_tensors()[0], kernel->GetMemType(), n, out_file);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape) {
|
||||
int n, h, w, c;
|
||||
n = h = w = c = 1;
|
||||
|
|
|
@ -25,8 +25,6 @@
|
|||
#include "nnacl/op_base.h"
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
|
||||
|
@ -59,11 +57,6 @@ std::string CLErrorCode(cl_int error_code);
|
|||
|
||||
int WriteToBin(const std::string &file_path, void *data, size_t size);
|
||||
|
||||
void PrintTensor(const lite::Tensor *tensor, lite::opencl::MemType mem_type, int n = 10,
|
||||
const std::string &out_file = "");
|
||||
|
||||
void PrintKernelOutput(OpenCLKernel *kernel, int n = 10, const std::string &out_file = "");
|
||||
|
||||
std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape);
|
||||
|
||||
std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape, schema::Format format);
|
||||
|
@ -154,38 +147,6 @@ std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
|
|||
return C;
|
||||
}
|
||||
|
||||
template <typename SRC_T, typename DST_T>
|
||||
void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
|
||||
const size_t CI_TILE = 4, const size_t CO_TILE = 4) {
|
||||
MS_ASSERT(src);
|
||||
MS_ASSERT(dst);
|
||||
MS_ASSERT(CI_TILE);
|
||||
MS_ASSERT(CO_TILE);
|
||||
MS_ASSERT(OGroup);
|
||||
if (CO_TILE == 0 || CI_TILE == 0) return;
|
||||
auto origin_weight = reinterpret_cast<SRC_T *>(src);
|
||||
auto packed_weight = reinterpret_cast<DST_T *>(dst);
|
||||
auto CI_SLICES = UP_DIV(CI, CI_TILE);
|
||||
for (size_t co = 0, src_idx = 0; co < CO; ++co) {
|
||||
for (size_t kh = 0; kh < KH; ++kh) {
|
||||
for (size_t kw = 0; kw < KW; ++kw) {
|
||||
for (size_t ci = 0; ci < CI; ++ci) {
|
||||
size_t co_outer = co / (CO_TILE * OGroup);
|
||||
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
|
||||
size_t co_inner = co % CO_TILE;
|
||||
size_t ci_outer = ci / CI_TILE;
|
||||
size_t ci_inner = ci % CI_TILE;
|
||||
size_t dst_idx =
|
||||
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
|
||||
CO_TILE +
|
||||
co_inner;
|
||||
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_
|
||||
|
|
|
@ -507,6 +507,37 @@ bool OpenCLRuntime::BuildProgram(const std::string &build_options, const cl::Pro
|
|||
return true;
|
||||
}
|
||||
|
||||
int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) {
|
||||
cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_;
|
||||
auto *image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(buffer));
|
||||
if (image == nullptr) {
|
||||
MS_LOG(WARNING) << "Can't get Image2D for " << buffer;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<size_t> img_size;
|
||||
int ret = allocator_->GetImageSize(buffer, &img_size);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer;
|
||||
return RET_ERROR;
|
||||
}
|
||||
cl::array<size_t, 3> origin = {0, 0, 0};
|
||||
cl::array<size_t, 3> region = {img_size[0], img_size[1], 1};
|
||||
if (is_read) {
|
||||
ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
|
||||
} else {
|
||||
ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
|
||||
}
|
||||
if (ret != CL_SUCCESS) {
|
||||
MS_LOG(ERROR) << CLErrorCode(ret);
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); }
|
||||
|
||||
int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); }
|
||||
|
||||
bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
|
||||
bool sync) const {
|
||||
if (command_queue == nullptr) {
|
||||
|
|
|
@ -119,6 +119,9 @@ class OpenCLRuntime {
|
|||
const std::set<std::string> &build_options = {});
|
||||
int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
|
||||
cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
|
||||
int ReadOrWriteImage(void *buffer, void *data, bool is_read);
|
||||
int ReadImage(void *buffer, void *dst_data);
|
||||
int WriteImage(void *buffer, void *src_data);
|
||||
bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
|
||||
bool sync = false) const;
|
||||
bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
|
||||
|
|
Loading…
Reference in New Issue