change opencl code path

2020-08-20 14:30:20 +08:00 · 2020-08-20 14:30:20 +08:00 · 605c2b0b16
parent 98565d8b54
commit 605c2b0b16
40 changed files with 218 additions and 513 deletions
--- a/build.sh
+++ b/build.sh
@ -460,24 +460,20 @@ build_gtest() {

 gene_clhpp() {
    CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl"
-    for sub_dir in "${CL_SRC_DIR}"/*
+    if [ ! -d ${CL_SRC_DIR} ]; then
+      return
+    fi
+    cd ${CL_SRC_DIR}/
+    rm -rf *.inc
+    echo "$(cd "$(dirname $0)"; pwd)"
+    for file_path in "${CL_SRC_DIR}"/*
    do
-        data_type="$(basename ${sub_dir})"
-        if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then
-          continue
-        fi
-        cd ${CL_SRC_DIR}/${data_type}
-        rm -rf *.inc
-        echo "$(cd "$(dirname $0)"; pwd)"
-        for file_path in "${CL_SRC_DIR}/${data_type}"/*
-        do
-            file="$(basename ${file_path})"
-            inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'`
-            sed 's/^/\"/;s/$/    \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file}
-            kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
-	    sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file}
-            sed -i '$a\;' ${inc_file}
-        done
+        file="$(basename ${file_path})"
+        inc_file=`echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/'`
+        sed 's/^/\"/;s/$/    \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
+        kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
+  sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
+        sed -i '$a\;' ${inc_file}
    done
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
@ -2,10 +2,7 @@

 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
 #define MIN(X, Y) (X < Y ? X : Y)
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
    tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
    tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
    tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
    tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
    tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
    tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
    tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
    tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
    tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = 1 / (1 + exp(-in_c4.x));
    tmp.y = 1 / (1 + exp(-in_c4.y));
    tmp.z = 1 / (1 + exp(-in_c4.z));
    tmp.w = 1 / (1 + exp(-in_c4.w));
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
@ -0,0 +1,101 @@
+#define divide_no_check(a, b) (a / b)
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a + b);
+}
+
+__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - b);
+}
+
+__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a * b);
+}
+
+__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
+}
+
+__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+                                 const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
+}
+
+__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] + input_b[idx];
+}
+
+__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] - input_b[idx];
+}
+
+__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] * input_b[idx];
+}
+
+__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] * input_b[idx];
+}
+
+__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
+                                 const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = weight * input_a[idx] + bias;
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
@ -1,4 +1,3 @@
-#define FLT4 float4
 #define INT4 int4
 #define INT2 int2
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
@ -2,9 +2,6 @@

 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    int index = num * 4;
    tmp.x = in_c4.x * alpha[index];
    tmp.y = in_c4.y * alpha[index + 1];
    tmp.z = in_c4.z * alpha[index + 2];
    tmp.w = in_c4.w * alpha[index + 3];
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
@ -1,5 +1,4 @@
 // #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define FLT4 float4
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
@ -1,8 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
@ -1,12 +1,3 @@
-#ifdef ENABLE_FP16
-#define FLT half
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#else
-#define FLT float
-#define FLT4 float4
-#define TO_FLT4 convert_float4
-#endif
 __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
                                         float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
@ -1,61 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
-                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
-  int h = get_global_id(0);
-  int kh = h % 2;
-  int src_h = h / 2;
-  src_h = src_h * 2;
-  int w = get_global_id(1);
-  int kw = w % 2;
-  int src_w = w / 2;
-  src_w = src_w * 2;
-  int co = get_global_id(2);
-  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
-  FLT4 r0 = (FLT4)(0.f);
-  FLT4 r1 = (FLT4)(0.f);
-  FLT4 r2 = (FLT4)(0.f);
-  FLT4 r3 = (FLT4)(0.f);
-  int base_w = (co * 4 + kh + kw * 2) * src_size.z;
-  for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
-    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
-    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
-    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
-    FLT16 weight_cache = weight[base_w++];
-    r0 += x0.x * weight_cache.s0123;
-    r0 += x0.y * weight_cache.s4567;
-    r0 += x0.z * weight_cache.s89ab;
-    r0 += x0.w * weight_cache.scdef;
-
-    r1 += x1.x * weight_cache.s0123;
-    r1 += x1.y * weight_cache.s4567;
-    r1 += x1.z * weight_cache.s89ab;
-    r1 += x1.w * weight_cache.scdef;
-
-    r2 += x2.x * weight_cache.s0123;
-    r2 += x2.y * weight_cache.s4567;
-    r2 += x2.z * weight_cache.s89ab;
-    r2 += x2.w * weight_cache.scdef;
-
-    r3 += x3.x * weight_cache.s0123;
-    r3 += x3.y * weight_cache.s4567;
-    r3 += x3.z * weight_cache.s89ab;
-    r3 += x3.w * weight_cache.scdef;
-  }
-  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
-  r0 += bias_val;
-  r1 += bias_val;
-  r2 += bias_val;
-  r3 += bias_val;
-
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
@ -1,76 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define ACCUM_FLT4 half4
-#define FLT half
-#define FLT2 half2
-#define FLT3 half3
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#define TO_ACCUM_TYPE convert_half4
-#define TO_ACCUM_FLT convert_half
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                     float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                     int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
-}
-
-__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                    float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                    int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
@ -1,45 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= HW.y || Y >= C.y) {
-    return;
-  }
-  FLT4 result[4];
-  result[0] = (FLT4)(0.0f);
-  result[1] = (FLT4)(0.0f);
-  result[2] = (FLT4)(0.0f);
-  result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
-  result[0].x = x0.x;
-  result[0].y = x1.x;
-  result[0].z = x2.x;
-  result[0].w = x3.x;
-
-  result[1].x = x0.y;
-  result[1].y = x1.y;
-  result[1].z = x2.y;
-  result[1].w = x3.y;
-
-  result[2].x = x0.z;
-  result[2].y = x1.z;
-  result[2].z = x2.z;
-  result[2].w = x3.z;
-
-  result[3].x = x0.w;
-  result[3].y = x1.w;
-  result[3].z = x2.w;
-  result[3].w = x3.w;
-
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@ -1,34 +0,0 @@
-__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b[idx];
-}
-
-__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b[idx];
-}
-
-__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b[idx];
-}
-
-__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] / input_b[idx];
-}
-
-__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
-                             const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = weight * input_a[idx] + bias;
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
@ -1,66 +0,0 @@
-#define divide_no_check(a, b) (a/b)
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
-__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a + b);
-}
-
-__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a - b);
-}
-
-__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a * b);
-}
-
-__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
-}
-
-__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
-                             const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), weight * a + bias);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
@ -1,32 +0,0 @@
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
-  int2 gid = (int2)(get_global_id(0), get_global_id(1));
-  int2 lid = (int2)(get_local_id(0), get_local_id(1));
-  FLT4 result = (FLT4)(0.0f);
-  bool inside = gid.x < offset_co.y;
-  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    FLT16 w = weight[gid.x + i * offset_co.y];
-    result.x += dot(v, w.s0123);
-    result.y += dot(v, w.s4567);
-    result.z += dot(v, w.s89ab);
-    result.w += dot(v, w.scdef);
-  }
-  __local FLT4 temp[64][4];
-  temp[lid.x][lid.y] = result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (lid.y == 0 && inside) {
-    result += temp[lid.x][1];
-    result += temp[lid.x][2];
-    result += temp[lid.x][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
-    }
-    WRITE_IMAGE(output, (int2)(gid.x, 0), result);
-  }
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
@ -1,14 +0,0 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
@ -1,7 +1,3 @@
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
@ -1,7 +1,3 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
@ -1,5 +1,5 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 // what is mask and args.slices_x32
 __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                             const int slices, const int slices_x32) {
@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
  int tid = get_local_id(0);
  float sum = 0.0f;
  for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 src = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
    sum += dot((float4)(1.0f), exp(src));
  }
  if ((slices - 1) % 32 == tid) {
-    float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
    sum += dot(mask, exp(src));
  }

@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
  barrier(CLK_LOCAL_MEM_FENCE);
  sum = tmpx1[0];
  for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 result = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
    result = exp(result) * sum;
    output[i] = result;
  }
  if ((slices - 1) % 32 == tid) {
-    float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
    result = exp(result) * sum;
    __global float4 *remain_ptr4 = output;
    remain_ptr4 += slices - 1;
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
@ -1,22 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-// enum Format {
-//  Format_NCHW = 0,
-//  Format_NHWC = 1,
-//  Format_NHWC4 = 2,
-//  Format_HWKC = 3,
-//  Format_HWCK = 4,
-//  Format_KCHW = 5,
-//  Format_CKHW = 6,
-//  Format_KHWC = 7,
-//  Format_CHWK = 8,
-//  Format_NC4HW4 = 100,
-//  Format_NUM_OF_FORMAT = 101,
-//  Format_MIN = Format_NCHW,
-//  Format_MAX = Format_NUM_OF_FORMAT
-//};
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                          int4 shape) {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
@ -1,7 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
  int X = get_global_id(0);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -24,7 +24,7 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"

-#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
+#include "src/runtime/kernel/opencl/cl/activation.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
  }
  std::string program_name = "";
  std::string kernel_name = "";
-  std::string source = activation_source_fp32;
+  std::string source = activation_source;
  if (type_ == ActivationType_RELU) {
    program_name = "RELU";
    kernel_name = "Relu";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@ -22,8 +22,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/arithmetic.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -109,9 +108,14 @@ int ArithmeticOpenCLKernel::Init() {
    error_code = RET_ERROR;
  }
 #else
+  if (out_mem_type_ == OpenCLMemType::IMG) {
+    kernel_name += "_IMG";
+  } else {
+    kernel_name += "_BUF";
+  }
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
-  std::string source = arithmetic_image2d_source_fp32;
+  std::string source = arithmetic_source;
  runtime_->LoadSource(program_name, source);
  error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@ -20,7 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/batchnorm.h"
-#include "src/runtime/kernel/opencl/cl/fp32/batchnorm.cl.inc"
+#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -50,7 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
 }
 int BatchNormOpenCLKernel::Init() {
  std::set<std::string> build_options;
-  std::string source = batchnorm_source_fp32;
+  std::string source = batchnorm_source;
  std::string program_name = "batch_normalization";
  std::string kernel_name = "batch_normalization";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
@ -24,7 +24,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/kernel/caffe_prelu.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl.inc"
+#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc"
 #include "src/runtime/kernel/arm/nnacl/caffeprelu.h"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -57,7 +57,7 @@ int CaffePReluOpenCLKernel::Init() {
  }
  CaffeWeight();
  std::set<std::string> build_options;
-  std::string source = caffe_prelu_source_fp32;
+  std::string source = caffe_prelu_source;
  std::string program_name = "CaffePRelu";
  std::string kernel_name = "CaffePRelu";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@ -20,7 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/concat.h"
-#include "src/runtime/kernel/opencl/cl/fp32/concat.cl.inc"
+#include "src/runtime/kernel/opencl/cl/concat.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
  }
  if (in_tensors_.size() == 2) {
    std::set<std::string> build_options;
-    std::string source = concat_source_fp32;
+    std::string source = concat_source;
    std::string program_name = "Concat";
    std::string kernel_name = "Concat";
    auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
@ -76,7 +76,7 @@ int ConcatOpenCLKernel::Init() {

  if (in_tensors_.size() == 3) {
    std::set<std::string> build_options;
-    std::string source = concat_source_fp32;
+    std::string source = concat_source;
    std::string program_name = "Concat3input";
    std::string kernel_name = "Concat3input";
    auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@ -20,8 +20,7 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl.inc"
+#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -45,11 +44,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
-#ifdef ENABLE_FP16
-  std::string source = conv2d_transpose2x2_source_fp16;
-#else
-  std::string source = conv2d_transpose2x2_source_fp32;
-#endif
+  std::string source = conv2d_transpose2x2_source;
  std::set<std::string> build_options;
  std::string program_name = "conv2d_transpose2x2";
  ocl_runtime->LoadSource(program_name, source);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -26,8 +26,7 @@

 #ifndef PROGRAM_WITH_IL

-#include "src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/depthwise_conv2d.cl.inc"

 #endif

@ -68,11 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() {
 #else
  std::string program_name = "DepthwiseConv2d";
  std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = depthwise_conv2d_source_fp16;
-#else
-  std::string source = depthwise_conv2d_source_fp32;
-#endif
+  std::string source = depthwise_conv2d_source;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -21,8 +21,7 @@
 #include "src/runtime/kernel/arm/nnacl/fp32/matmul.h"
 #include "src/runtime/kernel/opencl/kernel/matmul.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/matmul.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/matmul.cl.inc"
+#include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -40,11 +39,7 @@ int MatMulOpenCLKernel::Init() {
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = matmul_source_fp16;
-#else
-  std::string source = matmul_source_fp32;
-#endif
+  std::string source = matmul_source;
  std::string program_name = "MatMul";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@ -24,8 +24,8 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/image_format.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -46,13 +46,13 @@ int PoolingOpenCLKernel::Init() {
  if (parameter_->max_pooling_) {
    kernel_name = "MaxPooling2d";
 #ifndef PROGRAM_WITH_IL
-    source = max_pool2d_source_fp32;
+    source = max_pool2d_source;
    program_name = "MaxPooling2d";
 #endif
  } else if (parameter_->avg_pooling_) {
    kernel_name = "AvgPooling2d";
 #ifndef PROGRAM_WITH_IL
-    source = avg_pool2d_source_fp32;
+    source = avg_pool2d_source;
    program_name = "AvgPooling2d";
 #endif
  } else {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@ -23,7 +23,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/kernel/prelu.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
+#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
 #include "src/runtime/kernel/arm/nnacl/prelu_parameter.h"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -40,7 +40,7 @@ int PReluOpenCLKernel::Init() {
    return RET_ERROR;
  }
  std::set<std::string> build_options;
-  std::string source = activation_source_fp32;
+  std::string source = activation_source;
  std::string program_name = "PRelu";
  std::string kernel_name = "ReluScalar";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@ -20,8 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/reshape.h"
-#include "src/runtime/kernel/opencl/cl/fp16/reshape.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/reshape.cl.inc"
+#include "src/runtime/kernel/opencl/cl/reshape.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -39,11 +38,7 @@ int ReshapeOpenCLKernel::Init() {
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = reshape_source_fp16;
-#else
-  std::string source = reshape_source_fp32;
-#endif
+  std::string source = reshape_source;
  std::string program_name = "reshape";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@ -22,8 +22,8 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/softmax.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl.inc"
+#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
+#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -88,7 +88,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 int SoftmaxOpenCLKernel::Init() {
  std::string kernel_name = "SoftMax";
  std::string program_name = "SoftMax";
-  std::string source = softmax_source_fp32;
+  std::string source = softmax_source;
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  // framework not set this param yet! just use default.
  if (parameter_->axis_ == -1) {
@ -101,7 +101,7 @@ int SoftmaxOpenCLKernel::Init() {
    // support 2d tensor
    kernel_name += "1x1";
    program_name += "1x1";
-    source = softmax1x1_source_fp32;
+    source = softmax1x1_source;
    onexone_flag_ = true;
  } else {
    MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@ -22,7 +22,7 @@
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc"
+#include "src/runtime/kernel/opencl/cl/to_format.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -37,10 +37,9 @@ int ToFormatOpenCLKernel::Init() {
  auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
  out_mem_type_ = parameter->out_mem_type;
  std::string program_name = "to_format";
-  std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},
-                                                   {schema::Format_NHWC, "NHWC"},
-                                                   {schema::Format_NC4HW4, "NC4HW4"},
-                                                   {schema::Format_NHWC4, "NHWC4"}};
+  std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},     {schema::Format_NHWC, "NHWC"},
+                                                   {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NC4, "NHWC4"},
+                                                   {schema::Format_NC, "NHWC"},       {schema::Format_NHWC4, "NHWC4"}};
  std::string kernel_name =
    "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
  if (out_mem_type_ == OpenCLMemType::IMG) {
@ -49,49 +48,54 @@ int ToFormatOpenCLKernel::Init() {
    kernel_name += "_BUF";
  }

+  this->set_name(kernel_name);
 #ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = to_format_source_fp16;
-#else
-  std::string source = to_format_source_fp32;
-#endif
+  std::string source = to_format_source;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+  InitNHWCShape();
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
 }

+int ToFormatOpenCLKernel::InitNHWCShape() {
+  std::vector<int> shapex = out_tensors_[0]->shape();
+  size_t n, h, w, c;
+  if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
+    n = shapex[0];
+    h = shapex[1];
+    w = shapex[2];
+    c = shapex[3];
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
+             out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
+    n = shapex[0];
+    h = shapex[2];
+    w = shapex[3];
+    c = shapex[1];
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4 || out_tensors_[0]->GetFormat() == schema::Format_NC) {
+    n = shapex[0];
+    h = 1;
+    w = 1;
+    c = shapex[1];
+  } else {
+    n = shapex[0];
+    h = shapex[1];
+    w = shapex[2];
+    c = shapex[3];
+  }
+  nhwc_shape_ = {n, h, w, c};
+  return RET_OK;
+}
+
 int ToFormatOpenCLKernel::ReSize() { return RET_OK; }

 int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
-    int h = shapex[1];
-    int w = shapex[2];
-    int c = shapex[3];
-    int c4 = UP_DIV(c, C4NUM);
-    std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4};
-    *global_size = std::move(vec);
-  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
-             out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
-    int h = shapex[2];
-    int w = shapex[3];
-    int c = shapex[1];
-    int c4 = UP_DIV(c, C4NUM);
-    std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w};
-    *global_size = std::move(vec);
-  } else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
-    int h = shapex[2];
-    int w = shapex[3];
-    int c = shapex[1];
-    int w4 = UP_DIV(w, C4NUM);
-    std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c};
-    *global_size = std::move(vec);
-  }
+  std::vector<size_t> vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
+  *global_size = std::move(vec);
  return RET_OK;
 }
 int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
@ -114,6 +118,12 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
    int c = shapex[3];
    im_dst_x = w * UP_DIV(c, C4NUM);
    im_dst_y = h;
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
+    int h = 1;
+    int w = 1;
+    int c = shapex[1];
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = h;
  } else {
    MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
  }
@ -128,15 +138,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
  return RET_OK;
 }
 int ToFormatOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << "ToFormat"
-                << " Running!";
+  MS_LOG(DEBUG) << this->name() << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {};
  std::vector<size_t> global;
  GetGlobalSize(0, &global);
-  auto shapex = in_tensors_[0]->shape();
-  cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1,
-                shapex.size() > 3 ? shapex[3] : 1};
+
+  cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
  cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
  ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
  ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
@ -37,9 +37,11 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
+  int InitNHWCShape();

 private:
  cl::Kernel kernel_;
+  std::vector<size_t> nhwc_shape_;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@ -21,8 +21,7 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/transpose.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/transpose.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/transpose.cl.inc"
+#include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -45,11 +44,7 @@ int TransposeOpenCLKernel::Init() {
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = transpose_source_fp16;
-#else
-  std::string source = transpose_source_fp32;
-#endif
+  std::string source = transpose_source;
  std::string program_name = "transpose";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@ -288,13 +288,13 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
  if (fp16_enable_) {
    // fp16 enable, kernel will use half and read_imageh and write_imageh.
    build_options_str =
-      "-DFLOAT=half -DFLOAT4=half4 -DRI_F=read_imageh "
-      "-DWI_F=write_imageh";
+      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
+      "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4";
  } else {
    // fp16 not enable, kernel will use float and read_imagef and write_imagef.
    build_options_str =
-      "-DFLOAT=float -DFLOAT4=float4 -DRI_F=read_imagef "
-      "-DWI_F=write_imagef";
+      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
+      "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4";
  }

  build_options_str = std::accumulate(