!15668 Abstract WRITEIMAGE and READIMAGE into the kernel

From: @yeyunpeng2020 Reviewed-by: @ddwsky,@hangangqiang Signed-off-by: @ddwsky
2021-04-26 14:31:28 +08:00 · 2021-04-26 14:31:28 +08:00 · db6ce1d343
parent 25ce6a104e deeff1828d
commit db6ce1d343
45 changed files with 497 additions and 191 deletions
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
@ -374,11 +374,11 @@ int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &prog
  if (fp16_enable_) {
    build_option +=
      " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 -DUINT4=ushort4"
-      " -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
+      " -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
  } else {
    build_option +=
      " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 -DUINT4=uint4"
-      " -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
+      " -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
  }
  build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_);
  build_option =
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
@ -11,7 +11,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
    return;                                                                                 \
  }                                                                                         \
-  FLT4 result;
+  TYPE4 result;

 // axis = 1
 #define DOConcat2inputaxis1_NHWC4                                              \
@ -248,12 +248,12 @@ CONCAT2(2input, axis3, _NHWC4)
  int Align_OutShape = output_shape.w;                                                            \
  int index_output = (IN * output_shape.y + IH) * stride_w + IW * Align_OutShape * C4NUM;

-int doconcat(__read_only image2d_t input, __global FLT *output, int Align_Shape, int4 input_shape, int IN, int IH,
+int doconcat(__read_only image2d_t input, __global TYPE *output, int Align_Shape, int4 input_shape, int IN, int IH,
             int Y, int index_output) {
  int Remainder = input_shape.w % C4NUM;
  for (int i = 0; i < Align_Shape; ++i) {
-    FLT4 result = READ_IMAGE(input, smp_none, (int2)((Y * Align_Shape + i), (IN * input_shape.y + IH)));
-    FLT result_temp[4] = {result.x, result.y, result.z, result.w};
+    TYPE4 result = READ_IMAGE(input, smp_none, (int2)((Y * Align_Shape + i), (IN * input_shape.y + IH)));
+    TYPE result_temp[4] = {result.x, result.y, result.z, result.w};
    if ((i + 1) * C4NUM <= input_shape.w) {
      for (int j = 0; j < C4NUM; ++j) {
        output[index_output++] = result_temp[j];
@ -268,7 +268,7 @@ int doconcat(__read_only image2d_t input, __global FLT *output, int Align_Shape,
 }

 __kernel void ConcatInput2UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
-                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int stride_w,
+                                        __global TYPE *output, int4 input_shape0, int4 input_shape1, int stride_w,
                                        int4 output_shape) {
  CHECK_IDX_UNALIGN;
  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
@ -276,7 +276,7 @@ __kernel void ConcatInput2UnAlign_NHWC4(__read_only image2d_t input0, __read_onl
 }

 __kernel void ConcatInput3UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
-                                        __read_only image2d_t input2, __global FLT *output, int4 input_shape0,
+                                        __read_only image2d_t input2, __global TYPE *output, int4 input_shape0,
                                        int4 input_shape1, int4 input_shape2, int stride_w, int4 output_shape) {
  CHECK_IDX_UNALIGN;
  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM);
@ -287,7 +287,7 @@ __kernel void ConcatInput3UnAlign_NHWC4(__read_only image2d_t input0, __read_onl

 __kernel void ConcatInput4UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
                                        __read_only image2d_t input2, __read_only image2d_t input3,
-                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                                        __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
                                        int4 input_shape3, int stride_w, int4 output_shape) {
  CHECK_IDX_UNALIGN;
  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM), Align_Shape3 = UP_DIV(input_shape3.w, C4NUM);
@ -299,7 +299,7 @@ __kernel void ConcatInput4UnAlign_NHWC4(__read_only image2d_t input0, __read_onl

 __kernel void ConcatInput5UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
                                        __read_only image2d_t input2, __read_only image2d_t input3,
-                                        __read_only image2d_t input4, __global FLT *output, int4 input_shape0,
+                                        __read_only image2d_t input4, __global TYPE *output, int4 input_shape0,
                                        int4 input_shape1, int4 input_shape2, int4 input_shape3, int4 input_shape4,
                                        int stride_w, int4 output_shape) {
  CHECK_IDX_UNALIGN;
@ -315,7 +315,7 @@ __kernel void ConcatInput5UnAlign_NHWC4(__read_only image2d_t input0, __read_onl
 __kernel void ConcatInput6UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
                                        __read_only image2d_t input2, __read_only image2d_t input3,
                                        __read_only image2d_t input4, __read_only image2d_t input5,
-                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                                        __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
                                        int4 input_shape3, int4 input_shape4, int4 input_shape5, int stride_w,
                                        int4 output_shape) {
  CHECK_IDX_UNALIGN;
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
@ -1,48 +1,43 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define C4NUM 4
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-#define GATHER(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE)                                                                \
-  __kernel void gather_##SUFFIX(__write_only image2d_t dst_data, __read_only image2d_t src_data,                     \
-                                __global int *indices, int4 src_size, int4 dst_size, int indices_num, int axis) {    \
-    int X = get_global_id(0);                                                                                        \
-    int Y = get_global_id(1);                                                                                        \
-    int Z = get_global_id(2);                                                                                        \
-    if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) {                     \
-      return;                                                                                                        \
-    }                                                                                                                \
-    TYPE##4 res_data = (TYPE##4)(0.0f, 0.0f, 0.0f, 0.0f);                                                            \
-    int batch = Y / dst_size.y;                                                                                      \
-    int height = Y % dst_size.y;                                                                                     \
-    if (axis == 0) {                                                                                                 \
-      res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, indices[batch] * src_size.y + height));   \
-    } else if (axis == 1) {                                                                                          \
-      res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, batch * src_size.y + indices[height]));   \
-    } else if (axis == 2) {                                                                                          \
-      res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X] * src_size.z + Z, batch * src_size.y + height));   \
-    } else if (axis == 3) {                                                                                          \
-      int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4};  \
-      TYPE tmp[4];                                                                                                   \
-      TYPE res_tmp[4];                                                                                               \
-      for (int i = 0; i < indices_num; ++i) {                                                                        \
-        TYPE##4 rd_data = (TYPE##4)(0.0f, 0.0f, 0.0f, 0.0f);                                                         \
-        rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height));   \
-        if (i >= 1 && offset[i] != offset[i - 1]) {                                                                  \
-          rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height)); \
-        }                                                                                                            \
-        tmp[0] = rd_data.x;                                                                                          \
-        tmp[1] = rd_data.y;                                                                                          \
-        tmp[2] = rd_data.z;                                                                                          \
-        tmp[3] = rd_data.w;                                                                                          \
-        res_tmp[i] = tmp[indices[Z * 4 + i] % 4];                                                                    \
-      }                                                                                                              \
-      res_data.x = res_tmp[0];                                                                                       \
-      res_data.y = res_tmp[1];                                                                                       \
-      res_data.z = res_tmp[2];                                                                                       \
-      res_data.w = res_tmp[3];                                                                                       \
-    }                                                                                                                \
-    WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);                        \
+__kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
+                     int4 src_size, int4 dst_size, int indices_num, int axis) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) {
+    return;
  }
-// GATHER(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE)
-GATHER(float, READ_IMAGE, WRITE_IMAGE, FLT);
-GATHER(int, read_imagei, write_imagei, int);
+  TYPE4 res_data = (TYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+  int batch = Y / dst_size.y;
+  int height = Y % dst_size.y;
+  if (axis == 0) {
+    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, indices[batch] * src_size.y + height));
+  } else if (axis == 1) {
+    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, batch * src_size.y + indices[height]));
+  } else if (axis == 2) {
+    res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X] * src_size.z + Z, batch * src_size.y + height));
+  } else if (axis == 3) {
+    int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4};
+    TYPE tmp[4];
+    TYPE res_tmp[4];
+    for (int i = 0; i < indices_num; ++i) {
+      TYPE4 rd_data = (TYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+      rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height));
+      if (i >= 1 && offset[i] != offset[i - 1]) {
+        rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height));
+      }
+      tmp[0] = rd_data.x;
+      tmp[1] = rd_data.y;
+      tmp[2] = rd_data.z;
+      tmp[3] = rd_data.w;
+      res_tmp[i] = tmp[indices[Z * 4 + i] % 4];
+    }
+    res_data.x = res_tmp[0];
+    res_data.y = res_tmp[1];
+    res_data.z = res_tmp[2];
+    res_data.w = res_tmp[3];
+  }
+  WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl
@ -22,7 +22,7 @@ __kernel void OneHotAxis0(__read_only image2d_t src_data, __write_only image2d_t
  int N = Z / out_shape.y;
  int H = Z % out_shape.y;
  int in_index = (H * out_shape.z + Y) * out_shape.w + X;
-  int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
+  int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
  int *indices_int = (int *)&indices;
  for (int i = 0; i < C4NUM; i++) {
    if (support_neg_index != 0 && indices_int[i] < 0) {
@ -42,7 +42,7 @@ __kernel void OneHotAxis0(__read_only image2d_t src_data, __write_only image2d_t
  if (4 * X + 3 < C) {
    SET_ON_OR_OFF_VALUE(result.w, N, indices_int[3], on_value, off_value);
  }
-  write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result);
+  WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result);
 }

 __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape,
@ -54,7 +54,7 @@ __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t
  int N = Z / out_shape.y;
  int H = Z % out_shape.y;
  int in_index = (N * out_shape.z + Y) * out_shape.w + X;
-  int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
+  int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
  int *indices_int = (int *)&indices;
  for (int i = 0; i < C4NUM; i++) {
    if (support_neg_index != 0 && indices_int[i] < 0) {
@ -74,7 +74,7 @@ __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t
  if (4 * X + 3 < C) {
    SET_ON_OR_OFF_VALUE(result.w, H, indices_int[3], on_value, off_value);
  }
-  write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result);
+  WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result);
 }

 __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape,
@ -86,7 +86,7 @@ __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t
  int N = Z / out_shape.y;
  int H = Z % out_shape.y;
  int in_index = (N * out_shape.y + H) * out_shape.w + X;
-  int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
+  int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x));
  int *indices_int = (int *)&indices;
  for (int i = 0; i < C4NUM; i++) {
    if (support_neg_index != 0 && indices_int[i] < 0) {
@ -106,7 +106,7 @@ __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t
  if (4 * X + 3 < C) {
    SET_ON_OR_OFF_VALUE(result.w, Y, indices_int[3], on_value, off_value);
  }
-  write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result);
+  WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result);
 }

 __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape,
@ -121,7 +121,7 @@ __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t
  int in_index_c4 = (N * out_shape.y + H) * ci4_size + Y / 4;
  int in_index_c4_remainder = Y % 4;
  int4 indices =
-    read_imagei(src_data, smp_zero, (int2)(in_index_c4 % in_image2d_shape.x, in_index_c4 / in_image2d_shape.x));
+    READ_IMAGE(src_data, smp_zero, (int2)(in_index_c4 % in_image2d_shape.x, in_index_c4 / in_image2d_shape.x));
  int *indices_int = (int *)&indices;
  int index_one = indices_int[in_index_c4_remainder];
  if (support_neg_index != 0 && index_one < 0) {
@ -140,7 +140,7 @@ __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t
  if (4 * X + 3 < C) {
    SET_ON_OR_OFF_VALUE(result.w, 4 * X + 3, index_one, on_value, off_value);
  }
-  write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result);
+  WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result);
 }

 __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape,
@ -150,7 +150,7 @@ __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d
  int Z = get_global_id(2);  // H * N (out_shape.h is 1, so N == Z)
  if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return;
  int in_index_c4_remainder = Z % 4;
-  int4 indices = read_imagei(src_data, smp_zero, (int2)(Z / C4NUM, 0));
+  int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(Z / C4NUM, 0));
  int *indices_int = (int *)&indices;
  int index_one = indices_int[in_index_c4_remainder];
  if (support_neg_index != 0 && index_one < 0) {
@ -169,5 +169,5 @@ __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d
  if (4 * X + 3 < C) {
    SET_ON_OR_OFF_VALUE(result.w, 4 * X + 3, index_one, on_value, off_value);
  }
-  write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result);
+  WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@ -3,52 +3,47 @@
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-#define RESHAPE_NHWC4(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE)                                                           \
-  __kernel void reshape_NHWC4_##SUFFIX(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size, \
-                                       int4 dst_size) {                                                                \
-    int X = get_global_id(0);                                                                                          \
-    int Y = get_global_id(1);                                                                                          \
-    int CO4 = UP_DIV(dst_size.z, C4NUM);                                                                               \
-    int CO4_rem = dst_size.z % C4NUM;                                                                                  \
-    if (X >= dst_size.x || Y > dst_size.y) {                                                                           \
-      return;                                                                                                          \
-    }                                                                                                                  \
-    int CI4 = UP_DIV(src_size.x, C4NUM);                                                                               \
-    int CI4_rem = src_size.x % C4NUM;                                                                                  \
-    CI4_rem = (CI4_rem == 0) ? C4NUM : CI4_rem;                                                                        \
-    int in_img_x = CI4 * src_size.y;                                                                                   \
-    TYPE##4 res = (TYPE##4)(0.0f);                                                                                     \
-    TYPE tmp[4];                                                                                                       \
-    TYPE res_tmp[4];                                                                                                   \
-    int gcnt = 0;                                                                                                      \
-    if (CO4_rem == 0 && ((CI4_rem & 0x3) == 0)) {                                                                      \
-      gcnt = X + dst_size.x * Y;                                                                                       \
-      res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x));                                  \
-      WRITE_IMAGE(dst_data, (int2)(X, Y), res);                                                                        \
-    } else {                                                                                                           \
-      int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y);                      \
-      gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM;                                                  \
-      start = start % src_size.x % C4NUM;                                                                              \
-      for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {                                                       \
-        int X_src = (gcnt + n) % in_img_x;                                                                             \
-        res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x));                                    \
-        tmp[0] = res.x;                                                                                                \
-        tmp[1] = res.y;                                                                                                \
-        tmp[2] = res.z;                                                                                                \
-        tmp[3] = res.w;                                                                                                \
-        int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : C4NUM;                                                          \
-        for (; j < k && i < C4NUM; ++j, ++i) {                                                                         \
-          res_tmp[i] = tmp[j];                                                                                         \
-        }                                                                                                              \
-      }                                                                                                                \
-      res.x = res_tmp[0];                                                                                              \
-      res.y = res_tmp[1];                                                                                              \
-      res.z = res_tmp[2];                                                                                              \
-      res.w = res_tmp[3];                                                                                              \
-      WRITE_IMAGE(dst_data, (int2)(X, Y), res);                                                                        \
-    }                                                                                                                  \
+__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size,
+                            int4 dst_size) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int CO4 = UP_DIV(dst_size.z, C4NUM);
+  int CO4_rem = dst_size.z % C4NUM;
+  if (X >= dst_size.x || Y > dst_size.y) {
+    return;
  }
-
-// RESHAPE_NHWC4(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE)
-RESHAPE_NHWC4(float, READ_IMAGE, WRITE_IMAGE, FLT);
-RESHAPE_NHWC4(int, read_imagei, write_imagei, int);
+  int CI4 = UP_DIV(src_size.x, C4NUM);
+  int CI4_rem = src_size.x % C4NUM;
+  CI4_rem = (CI4_rem == 0) ? C4NUM : CI4_rem;
+  int in_img_x = CI4 * src_size.y;
+  TYPE4 res = (TYPE4)(0.0f);
+  TYPE tmp[4];
+  TYPE res_tmp[4];
+  int gcnt = 0;
+  if (CO4_rem == 0 && ((CI4_rem & 0x3) == 0)) {
+    gcnt = X + dst_size.x * Y;
+    res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x));
+    WRITE_IMAGE(dst_data, (int2)(X, Y), res);
+  } else {
+    int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y);
+    gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM;
+    start = start % src_size.x % C4NUM;
+    for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {
+      int X_src = (gcnt + n) % in_img_x;
+      res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x));
+      tmp[0] = res.x;
+      tmp[1] = res.y;
+      tmp[2] = res.z;
+      tmp[3] = res.w;
+      int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : C4NUM;
+      for (; j < k && i < C4NUM; ++j, ++i) {
+        res_tmp[i] = tmp[j];
+      }
+    }
+    res.x = res_tmp[0];
+    res.y = res_tmp[1];
+    res.z = res_tmp[2];
+    res.w = res_tmp[3];
+    WRITE_IMAGE(dst_data, (int2)(X, Y), res);
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -73,7 +73,15 @@ int ActivationOpenCLKernel::Prepare() {
  std::string program_name = "Activation";
  ocl_runtime_->LoadSource(program_name, source);
  std::string kernel_name = GetActTypeString(type_);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  SetConstArgs();
  SetGlobalLocal();
  MS_LOG(DEBUG) << kernel_name << " init Done!";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
@ -149,7 +149,15 @@ int ArgMinMaxOpenCLKernel::Prepare() {
  std::string source = argminmax_source;
  std::string program_name = "argminmax";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif

  auto *param = reinterpret_cast<ArgMinMaxParameter *>(this->op_parameter_);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@ -191,7 +191,15 @@ int ArithmeticOpenCLKernel::Prepare() {
  std::string program_name = "Arithmetic";
  std::string source = arithmetic_source;
  ocl_runtime_->LoadSource(program_name, source);
-  int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options_ext);
 #endif
  if (error_code != RET_OK) {
    return error_code;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@ -89,7 +89,15 @@ int ArithmeticSelfOpenCLKernel::Prepare() {
  MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
  std::string program_name = "ArithmeticSelf";
  ocl_runtime_->LoadSource(program_name, arithmeticself_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  SetGlobalLocal();
  SetConstArgs();
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
@ -94,7 +94,13 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
  std::string source = batch_to_space_nd_source;
  std::string program_name = "batch_to_space_nd";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif

  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@ -168,7 +168,13 @@ int BatchNormOpenCLKernel::Prepare() {
  std::string source = batchnorm_source;
  std::string program_name = "Batch_normalization";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  int ret = Initweight();
  if (ret) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@ -170,7 +170,24 @@ int ConcatOpenCLKernel::ConvertWeightToTensor() {
      bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16;
      PackNHWCToNHWC4(in_tensor->data_c(), weight.data(), src_is_fp16,
                      fp16_enable && in_tensor->data_type() != kNumberTypeInt32, in_shape);
-      size_t dtype = fp16_enable && in_tensor->data_type() != kNumberTypeInt32 ? CL_HALF_FLOAT : CL_FLOAT;
+      size_t dtype;
+      switch (in_tensor->data_type()) {
+        case kNumberTypeInt32: {
+          dtype = CL_SIGNED_INT32;
+          break;
+        }
+        case kNumberTypeFloat32: {
+          dtype = CL_FLOAT;
+          break;
+        }
+        case kNumberTypeFloat16: {
+          dtype = CL_HALF_FLOAT;
+          break;
+        }
+        default:
+          MS_LOG(ERROR) << "Unsupported data type is" << in_tensor->data_type();
+          return RET_ERROR;
+      }
      ImageSize img_size{in_shape.width, in_shape.height, dtype};
      auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
      weight_ptrs_.push_back(weight_ptr_);
@ -207,7 +224,15 @@ int ConcatOpenCLKernel::Prepare() {
  std::string source = concat_source;
  std::string program_name = "Concat";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {});
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
@ -235,4 +260,5 @@ int ConcatOpenCLKernel::Run() {

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeInt32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@ -144,7 +144,13 @@ void Conv2DOpenCLKernel::BuildKernel() {
    kernel_name << "_Img";
  }
  ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str());
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), build_options_ext);
 }

 void Conv2DOpenCLKernel::SetBlockSize() {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@ -65,7 +65,13 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
  std::string source = GetActDefines() + conv2d_transpose_source;
  std::string program_name = "conv2d_transpose";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  auto ret = InitWeights();
  if (ret != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -85,7 +85,13 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
  std::string program_name = "DepthwiseConv2d";
  std::string source = depthwise_conv2d_source;
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  auto ret = InitWeights();
  if (ret != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@ -105,7 +105,13 @@ int FullConnectionOpenCLKernel::Prepare() {
  std::string source = fullconnection_source;
  std::string program_name = "FullConnection";
  ocl_runtime_->LoadSource(program_name, GetActDefines() + source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  auto ret = InitWeights();
  if (ret != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
@ -147,7 +147,13 @@ int FusionEltwiseOpenCLKernel::Prepare() {
  std::string program_name = "FusionEltwise\n" + source;
  std::string kernel_name = "FusionEltwise";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (ocl_runtime_->GetFp16Enable()) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  } else {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  InitWeights();
  SetGlobalLocal();
  SetConstArgs();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@ -106,11 +106,6 @@ void GatherOpenCLKernel::SetGlobalLocal() {

 int GatherOpenCLKernel::Prepare() {
  std::string kernel_name = "gather";
-  if (desc_.data_type == kNumberTypeInt32) {
-    kernel_name += "_int";
-  } else {
-    kernel_name += "_float";
-  }
  if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) {
    axis_ = 3;
  }
@ -119,7 +114,15 @@ int GatherOpenCLKernel::Prepare() {
 #else
  std::string program_name = "gather";
  ocl_runtime_->LoadSource(program_name, gather_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {});
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  if (in_tensors_.at(1)->IsConst()) {
    intensor1_is_tensor = false;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
@ -171,9 +171,15 @@ int LayerNormOpenCLKernel::Prepare() {
  std::string source = layer_norm_source;
  std::string program_name = "LayerNormalization";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4";
-  ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var);
+  ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options_ext);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -92,7 +92,13 @@ int MatMulOpenCLKernel::Prepare() {
  std::string source = matmul_source;
  std::string program_name = "MatMul";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);

 #endif
  SetConstArgs();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
@ -51,10 +51,17 @@ int OneHotOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
+
  std::string source = one_hot_source;
  std::string program_name = "OneHot";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {});
+  std::vector<std::string> build_options_ext;
+  if (ocl_runtime_->GetFp16Enable()) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=write_imagei "};
+  } else {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagei "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  InitWeights();
  SetConstArgs();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
@ -74,7 +74,13 @@ int PadOpenCLKernel::Prepare() {
  const std::string source = pad_source;
  const std::string program_name = "Pad";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, "Pad");
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, "Pad", build_options_ext);
  SetConstArgs();
  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@ -82,7 +82,13 @@ int PoolingOpenCLKernel::Prepare() {
  std::string source = pooling2d_source;
  std::string program_name = "Pooling2d";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
@ -118,7 +118,13 @@ int PowerOpenCLKernel::Prepare() {
  scale_ = param->scale_;
  shift_ = param->shift_;
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetGlobalLocal();
  SetConstArgs();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@ -130,7 +130,13 @@ int PReluOpenCLKernel::Prepare() {
  std::string program_name = "PRelu";
  std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  InitWeights();
  MS_LOG(DEBUG) << program_name << " init Done!";
  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name << " init Done!";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
@ -186,7 +186,13 @@ int ReduceOpenCLKernel::Prepare() {
  std::string source = reduce_source;
  std::string program_name = "Reduce";
  ocl_runtime_->LoadSource(program_name, source);
-  auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  if (ret != RET_OK) {
    return ret;
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@ -73,18 +73,21 @@ void ReshapeOpenCLKernel::SetGlobalLocal() {

 int ReshapeOpenCLKernel::Prepare() {
  std::string kernel_name = "reshape_NHWC4";
-  if (desc_.data_type == kNumberTypeInt32) {
-    kernel_name += "_int";
-  } else {
-    kernel_name += "_float";
-  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::string source = reshape_source;
  std::string program_name = "reshape";
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeInt32) {
+    build_options_ext = {" -DTYPE=int -DTYPE4=int4  -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
+  } else if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {});
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif

  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
@ -70,7 +70,13 @@ int ResizeOpenCLKernel::Prepare() {
  std::string source = resize_source;
  std::string program_name = "Resize";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@ -166,7 +166,13 @@ int ScaleOpenCLKernel::Prepare() {
  std::string program_name = "Scale";
  std::string source = GetActDefines() + scale_source;
  ocl_runtime_->LoadSource(program_name, source);
-  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  if (error_code != RET_OK) {
    return error_code;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@ -84,13 +84,15 @@ int SoftmaxOpenCLKernel::Prepare() {
 #else
  std::string program_name = "Softmax";
  ocl_runtime_->LoadSource(program_name, source);
-  std::vector<std::string> ext_build_opt;
-  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef");
-  } else {
-    ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh");
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {
+      " -DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {
+      " -DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
  }
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
@ -100,7 +100,13 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
  std::string source = space_to_batch_nd_source;
  std::string program_name = "space_to_batch_nd";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif

  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
@ -58,7 +58,13 @@ int SpaceToDepthOpenCLKernel::Prepare() {
  std::string source = space_to_depth_source;
  std::string program_name = "SpaceToDepth";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
@ -148,7 +148,13 @@ int SparseToDenseOpenCLKernel::Prepare() {
  std::string source = sparse_to_dense_source;
  std::string program_name = "SparseToDense";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);

  if (in_tensors_.size() > 3) {
    auto input_tensor3 = in_tensors_[3];
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
@ -136,7 +136,13 @@ int SplitOpenCLKernel::Prepare() {
  std::string source = split_source;
  std::string program_name = "split";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@ -165,7 +165,13 @@ int StackOpenCLKernel::Prepare() {
  std::string source = stack_source;
  std::string program_name = "stack";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  SetConstArgs();
  SetGlobalLocal();

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
@ -32,12 +32,18 @@ int StrassenOpenCLKernel::Prepare() {
  std::string source = strassen_source;
  std::string program_name = "MatMul";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
-  ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2");
-  ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2");
-  ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result");
-  ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled");
-  ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled");
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
+  ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2", build_options_ext);
+  ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2", build_options_ext);
+  ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result", build_options_ext);
+  ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled", build_options_ext);
+  ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled", build_options_ext);
  auto ret = InitWeights();
  if (ret != RET_OK) {
    return ret;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
@ -89,7 +89,13 @@ int StridedSliceOpenCLKernel::CheckSpecs() {
 int StridedSliceOpenCLKernel::Prepare() {
  std::string program_name = "strided_slice";
  ocl_runtime_->LoadSource(program_name, strided_slice_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice");
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options_ext);
  SetConstArgs();
  SetGlobalLocal();
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@ -84,21 +84,19 @@ int TransposeOpenCLKernel::Prepare() {
    perm_4d_[2] = 2;
    perm_4d_[3] = 3;
  }
+  std::string kernel_name = "transpose";
+
  if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 3 && perm_4d_[2] == 1 && perm_4d_[3] == 2) {
    type_ = TransposeType::AXIS0312;
+    kernel_name += "_0312";
  } else if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 2 && perm_4d_[2] == 3 && perm_4d_[3] == 1) {
    type_ = TransposeType::AXIS0231;
-  } else {
-    type_ = TransposeType::GENERAL;
-  }
-  std::string kernel_name = "transpose";
-  if (type_ == TransposeType::AXIS0312) {
-    kernel_name += "_0312";
-  } else if (type_ == TransposeType::AXIS0231) {
    kernel_name += "_0231";
  } else {
+    type_ = TransposeType::GENERAL;
    kernel_name += "_general";
  }
+
  if (in_tensors_[0]->shape().size() == 4 &&
      in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > ocl_runtime_->GetMaxImage2DWidth()) {
    // just for input
@ -112,7 +110,13 @@ int TransposeOpenCLKernel::Prepare() {
  std::string source = transpose_source;
  std::string program_name = "transpose";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@ -82,10 +82,16 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
 void WinogradOpenCLKernel::BuildKernel() {
  std::string program_name = "winograd";
  ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source);
-  ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
+  std::vector<std::string> build_options_ext;
+  if (desc_.data_type == kNumberTypeFloat32) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
+  } else if (desc_.data_type == kNumberTypeFloat16) {
+    build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
+  }
+  ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options_ext);
  ocl_runtime_->BuildKernel(kernel_, program_name,
-                            filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D");
-  ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
+                            filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D", build_options_ext);
+  ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options_ext);
 }

 void WinogradOpenCLKernel::InitFilter() {
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@ -234,19 +234,24 @@ int GetBroadcastGpuAxis(int ndim, int ori_axis) {
  return axis;
 }

-void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor) {
+void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
+                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  auto src_fp16 = reinterpret_cast<float16_t *>(src);
  auto src_fp32 = reinterpret_cast<float32_t *>(src);
+  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
  auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
+  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  for (int n = 0, src_idx = 0; n < tensor.N; n++) {
    for (int h = 0; h < tensor.H; ++h) {
      for (int w = 0; w < tensor.W; ++w) {
        for (int c = 0; c < tensor.C; ++c, ++src_idx) {
          int dst_idx = ((n * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
-          if (dst_is_fp16) {
+          if (data_type == kNumberTypeInt32) {
+            dst_int32[dst_idx] = src_int32[src_idx];
+          } else if (dst_is_fp16) {
            dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
          } else {
            dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@ -58,7 +58,8 @@ int WriteToBin(const std::string &file_path, void *data, size_t size);

 int GetBroadcastGpuAxis(int ndim, int ori_axis);

-void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor);
+void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
+                     int data_type = kNumberTypeFloat32);

 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                         TypeId expect_data_type, const std::vector<int> &expect_shape);
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@ -394,7 +394,7 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_ten
  if (context_->IsGpuEnabled()) {
    // support more data type like int32
    kernel::KernelKey gpu_desc{kGPU, desc.data_type, desc.type};
-    if (context_->IsGpuFloat16Enabled()) {
+    if (desc.data_type == kNumberTypeFloat32 && context_->IsGpuFloat16Enabled()) {
      gpu_desc.data_type = kNumberTypeFloat16;
    }

--- a/mindspore/lite/test/models_gpu_fp16.cfg
+++ b/mindspore/lite/test/models_gpu_fp16.cfg
@ -9,3 +9,4 @@ mtk_model_emotions_0727_nosoftmax.tflite
 landmark
 PoseNet_dla_17_x512_tmp
 plat_isface
+ml_location_lane_counter.onnx;5.5
--- a/mindspore/lite/test/models_gpu_fp32.cfg
+++ b/mindspore/lite/test/models_gpu_fp32.cfg
@ -57,3 +57,45 @@ mtk_model_face_dress.pb;0.5;1;1,128,128,3
 hiai_model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3
 hiai_label_and_video.pb;0.5;1;1,224,224,3
 tinyyolov2-8.onnx;0.5;1;1,416,416,3
+mtk_detect-mbv2-shortcut-400-400-simplified.onnx
+emotion-ferplus-8.onnx
+rcnn-ilsvrc13-9.onnx
+shufflenet-v2-10.onnx
+squeezenet1.1-7.onnx
+ml_table_detection_fp32_tmp.onnx
+ml_table_segment.onnx
+googlenet-9.onnx
+inception-v1-9.onnx
+shufflenet-9.onnx
+ml_face_3d.onnx
+gts_version-RFB-320_simplified.onnx
+mnist-8.onnx
+ml_video_edit_judge.onnx
+ml_video_edit_vignet.onnx
+hdc_mobilenet_1w_class.onnx
+ml_video_edit_imitate_filter.onnx
+ml_edu_kit_hand_detection.onnx
+ml_edu_kit_hand_key_position.onnx
+mtk_detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified_onnx.onnx
+mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified_onnx.onnx
+mtk_detect-deeper-halfdeeper-mbv1-lastearlySSD-shortcut-400-400_nopostprocess_simplified_onnx.onnx
+ml_2012_ocr_detection_tmp.onnx
+ml_video_edit_enhance_update_tmp.onnx
+bloom_hongmo_detection_tmp.onnx
+Q_face_recognition.onnx
+Q888_iris_detect.onnx
+ml_ocr_bank_card_detection_inception_tmp
+ml_ocr_detect_20200305
+Q_iMaxDN_RGB_385_p_RGB_RGB_pb2tflite.tflite
+Q_iMaxSR_RGB_385_p_pb2tflite.tflite
+mtk_age_gender.pb
+mtk_model_ckpt.pb
+Q_inception-249970-672-11-16.pb
+Q_crnn_screen_slim400w_more_20w.pb
+hiai_ssd_mobilenetv2_object.pb
+hiai_humanDetection.pb
+mtk_face_features_v1.pb
+Q_crnn_ori_75w_slim_norm.pb
+Q_crnn_ori_v2_405001_notrans_nopre.pb
+bolt_segment.pb
+ml_location_lane_counter.onnx;2
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@ -2123,15 +2123,17 @@ function Run_gpu() {
            input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i','
          done
        fi
+        if [[ ${accuracy_limit} == "" ]]; then
+          accuracy_limit="0.5"
+        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
        if [[ $input_shapes == "" ]]; then
-          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
-          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
        else
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
-
        fi

        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
@ -2148,17 +2150,39 @@ function Run_gpu() {
        if [[ $model_name == \#* ]]; then
          continue
        fi
+        model_name=`echo ${line} | awk -F ';' '{print $1}'`
+        accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'`
+        input_num=`echo ${line} | awk -F ';' '{print $3}'`
+        input_shapes=`echo ${line} | awk -F ';' '{print $4}'`
+        input_files=""
+        data_path="/data/local/tmp/input_output/"
+        output_file=${data_path}'output/'${model_name}'.ms.out'
+        if [[ ${input_num} == "" || ${input_num} == 1 ]]; then
+          input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
+        else
+          for i in $(seq 1 $input_num)
+          do
+            input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i','
+          done
+        fi
+        if [[ ${accuracy_limit} == "" ]]; then
+          accuracy_limit="5"
+        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> "${run_gpu_log_file}"
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        if [[ $input_shapes == "" ]]; then
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        else
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        fi
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
        if [ $? = 0 ]; then
            run_result='arm64_gpu_fp16: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}
        else
            run_result='arm64_gpu_fp16: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
        fi
-    #sleep 1
    done < ${models_gpu_fp16_config}

    # Run GPU weightquant converted models:
@ -2167,17 +2191,39 @@ function Run_gpu() {
        if [[ $model_name == \#* ]]; then
          continue
        fi
+        model_name=`echo ${line} | awk -F ';' '{print $1}'`
+        accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'`
+        input_num=`echo ${line} | awk -F ';' '{print $3}'`
+        input_shapes=`echo ${line} | awk -F ';' '{print $4}'`
+        input_files=""
+        data_path="/data/local/tmp/input_output/"
+        output_file=${data_path}'output/'${model_name}'.ms.out'
+        if [[ ${input_num} == "" || ${input_num} == 1 ]]; then
+          input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
+        else
+          for i in $(seq 1 $input_num)
+          do
+            input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i','
+          done
+        fi
+        if [[ ${accuracy_limit} == "" ]]; then
+          accuracy_limit="5"
+        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'_weightquant.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> "${run_gpu_log_file}"
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'_weightquant.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        if [[ $input_shapes == "" ]]; then
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        else
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        fi
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
        if [ $? = 0 ]; then
            run_result='arm64_gpu_weightquant: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}
        else
            run_result='arm64_gpu_weightquant: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
        fi
-    #sleep 1
    done < ${models_gpu_weightquant_config}
 }